From 374796d27c48dd018c057686c8adfa62629182e6 Mon Sep 17 00:00:00 2001 From: dANW34V3R Date: Fri, 12 May 2023 13:59:37 +0100 Subject: [PATCH 1/5] Initial patch --- README_RV32.md | 13 + configs/DEMO_RISCV32.yaml | 142 ++++++ src/include/simeng/Elf.hh | 12 + src/include/simeng/RegisterValue.hh | 26 +- src/include/simeng/arch/Architecture.hh | 19 + .../simeng/arch/aarch64/Architecture.hh | 4 + src/include/simeng/arch/riscv/Architecture.hh | 44 ++ src/include/simeng/arch/riscv/Instruction.hh | 43 +- src/include/simeng/models/emulation/Core.hh | 3 + .../simeng/pipeline/PipelineBuffer1.hh | 107 +++++ src/lib/CoreInstance.cc | 3 +- src/lib/Elf.cc | 336 +++++++++----- src/lib/ModelConfig.cc | 11 +- src/lib/arch/aarch64/Architecture.cc | 4 + src/lib/arch/riscv/Architecture.cc | 146 +++++- src/lib/arch/riscv/ExceptionHandler.cc | 19 +- src/lib/arch/riscv/Instruction.cc | 6 + src/lib/arch/riscv/InstructionMetadata.cc | 38 +- src/lib/arch/riscv/InstructionMetadata.hh | 16 + src/lib/arch/riscv/Instruction_address.cc | 21 +- src/lib/arch/riscv/Instruction_decode.cc | 313 ++++++++++++- src/lib/arch/riscv/Instruction_execute.cc | 431 +++++++++++++----- src/lib/models/emulation/Core.cc | 45 +- 23 files changed, 1527 insertions(+), 275 deletions(-) create mode 100644 README_RV32.md create mode 100644 configs/DEMO_RISCV32.yaml create mode 100644 src/include/simeng/pipeline/PipelineBuffer1.hh diff --git a/README_RV32.md b/README_RV32.md new file mode 100644 index 0000000000..fe5942068e --- /dev/null +++ b/README_RV32.md @@ -0,0 +1,13 @@ +# SimEng Updates to support 32-bit RISC-V ISA + +- Added 32-bit RISC-V Architecture support + - Sample implementation of how a 32-bit mode can be supported as a configuration. The necessary updates to Architecture, Instruction decode and Instruction execution are added. + - Added a demo YAML file, DEMO_RISCV32.yaml, that can be used as a reference for running with the emulation core. + - The exception handler is updated to process a 32-bit register value for only the 4 system calls that were used for internal benchmarks, but this will need wider adoption across all other system calls. +- Added Compressed (16-bit) ISA support +- Added instruction trace generation support that can be used to log committed instructions. +- 32-bit CSR support + - Added a few CSRs and a sample implementation of how to use them. +- Added an alternative implementation of the pipeline buffer with variable latency support. + - Supports a 0-cycle delay, which is beneficial for merging pipeline stages if required. + - Supports delays of more than 1 cycle between pipeline stages. diff --git a/configs/DEMO_RISCV32.yaml b/configs/DEMO_RISCV32.yaml new file mode 100644 index 0000000000..3ea962b8bc --- /dev/null +++ b/configs/DEMO_RISCV32.yaml @@ -0,0 +1,142 @@ +--- +# This file is based off of the current tx2.yaml config and serves as an example configuration for RISC-V cores. 
+# The following resources where utilised to create the config file and naming schemes: +# https://en.wikichip.org/wiki/cavium/microarchitectures/vulcan + +Core: + ISA: rv32 + Simulation-Mode: emulation + Trace: True + Clock-Frequency: 2.5 + Fetch-Block-Size: 32 +Fetch: + Fetch-Block-Size: 32 + Loop-Buffer-Size: 0 + Loop-Detection-Threshold: 0 +Process-Image: + Heap-Size: 1073741824 + Stack-Size: 1048576 +Register-Set: + GeneralPurpose-Count: 154 + FloatingPoint-Count: 90 +Pipeline-Widths: + Commit: 4 + Dispatch-Rate: 4 + FrontEnd: 4 + LSQ-Completion: 2 +Queue-Sizes: + ROB: 180 + Load: 64 + Store: 36 +Branch-Predictor: + BTB-Tag-Bits: 11 + Saturating-Count-Bits: 2 + Global-History-Length: 10 + RAS-entries: 1 + Fallback-Static-Predictor: "Always-Taken" + Branch-Predictor: + BTB-bitlength: 16 +L1-Data-Memory: + Interface-Type: Flat +L1-Instruction-Memory: + Interface-Type: Flat +LSQ-L1-Interface: + Access-Latency: 4 + Exclusive: False + Load-Bandwidth: 32 + Store-Bandwidth: 16 + Permitted-Requests-Per-Cycle: 2 + Permitted-Loads-Per-Cycle: 2 + Permitted-Stores-Per-Cycle: 1 +Ports: + 0: + Portname: Port 0 + Instruction-Support: + - INT_SIMPLE + - INT_MUL + 1: + Portname: Port 1 + Instruction-Support: + - INT + 2: + Portname: Port 2 + Instruction-Support: + - INT_SIMPLE + - INT_MUL + - BRANCH + 3: + Portname: Port 4 + Instruction-Support: + - LOAD + 4: + Portname: Port 5 + Instruction-Support: + - LOAD + 5: + Portname: Port 3 + Instruction-Support: + - STORE +Reservation-Stations: + 0: + Size: 60 + Dispatch-Rate: 4 + Ports: + - Port 0 + - Port 1 + - Port 2 + - Port 4 + - Port 5 + - Port 3 +Execution-Units: + 0: + Pipelined: True + 1: + Pipelined: True + 2: + Pipelined: True + 3: + Pipelined: True + 4: + Pipelined: True + 5: + Pipelined: True +Latencies: + 0: + Instruction-Groups: + - INT_SIMPLE_ARTH + - INT_SIMPLE_LOGICAL + Execution-Latency: 1 + Execution-Throughput: 1 + 1: + Instruction-Groups: + - INT_MUL + Execution-Latency: 5 + Execution-Throughput: 1 + 2: + Instruction-Groups: + - INT_DIV + Execution-Latency: 39 + Execution-Throughput: 39 +# CPU-Info mainly used to generate a replica of the special (or system) file directory +# structure +CPU-Info: + # Set Generate-Special-Dir to 'T' to generate the special files directory, or to 'F' to not. + # (Not generating the special files directory may require the user to copy over files manually) + Generate-Special-Dir: true + # Core-Count MUST be 1 as multi-core is not supported at this time. (TX2 true value is 32) + Core-Count: 1 + # Socket-Count MUST be 1 as multi-socket simulations are not supported at this time. (TX2 true value is 2) + Socket-Count: 1 + # SMT MUST be 1 as Simultanious-Multi-Threading is not supported at this time. 
(TX2 true value is 4) + SMT: 1 + # Below are the values needed to generate /proc/cpuinfo + BogoMIPS: 400.00 + Features: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics cpuid asimdrdm + CPU-Implementer: "0x43" + CPU-Architecture: 8 + CPU-Variant: "0x1" + CPU-Part: "0x0af" + CPU-Revision: 2 + # Package-Count is used to generate + # /sys/devices/system/cpu/cpu{0..Core-Count}/topology/{physical_package_id, core_id} + Package-Count: 1 \ No newline at end of file diff --git a/src/include/simeng/Elf.hh b/src/include/simeng/Elf.hh index 88e419e6a5..14bcddcb28 100644 --- a/src/include/simeng/Elf.hh +++ b/src/include/simeng/Elf.hh @@ -21,6 +21,15 @@ struct ElfHeader { uint64_t memorySize; }; +struct Elf32Header { + uint32_t type; + uint32_t offset; + uint32_t virtualAddress; + uint32_t physicalAddress; + uint32_t fileSize; + uint32_t memorySize; +}; + /** A processed Executable and Linkable Format (ELF) file. */ class Elf { public: @@ -33,8 +42,11 @@ class Elf { private: uint64_t entryPoint_; std::vector headers_; + uint32_t entryPoint32_; + std::vector headers32_; bool isValid_ = false; uint64_t processImageSize_; + bool mode32bit_; }; } // namespace simeng diff --git a/src/include/simeng/RegisterValue.hh b/src/include/simeng/RegisterValue.hh index 20004432d0..d85471eda3 100644 --- a/src/include/simeng/RegisterValue.hh +++ b/src/include/simeng/RegisterValue.hh @@ -26,10 +26,16 @@ class RegisterValue { * number of bytes (defaulting to the size of the template type). */ template , T>* = nullptr> - RegisterValue(T value, uint16_t bytes = sizeof(T)) : bytes(bytes) { + RegisterValue(T value, uint16_t bytes = sizeof(T), bool relaxFor32 = true) : bytes(bytes) { + relaxedFor32bit_ = relaxFor32; + std::memset(this->value, 0, MAX_LOCAL_BYTES); if (isLocal()) { T* view = reinterpret_cast(this->value); - view[0] = value; + if (sizeof(T) > bytes) { // e.g. when T is int64 and bytes is 4 + std::memcpy(this->value, &value, bytes); + } else { + view[0] = value; + } if (bytes > sizeof(T)) { // Zero the remaining bytes not set by the provided value @@ -90,11 +96,16 @@ class RegisterValue { * the specified datatype. */ template const T* getAsVector() const { - static_assert(alignof(T) <= 8 && "Alignment over 8 bytes not guranteed"); + static_assert(alignof(T) <= 8 && "Alignment over 8 bytes not guaranteed"); assert(bytes > 0 && "Attempted to access an uninitialised RegisterValue"); - assert(sizeof(T) <= bytes && - "Attempted to access a RegisterValue as a datatype larger than the " - "data held"); + assert((sizeof(T) <= bytes || (bytes == 4 && sizeof(T) == 8)) && "Attempted" + " to access a RegisterValue as a datatype larger than the " + "data held" ); + if(!relaxedFor32bit_) { // maybe #ifdef if it makes slower? + assert(sizeof(T) <= bytes && + "Attempted to access a RegisterValue as a datatype larger than the " + "data held"); + } if (isLocal()) { return reinterpret_cast(value); } else { @@ -129,6 +140,9 @@ class RegisterValue { /** The underlying local member value. Aligned to 8 bytes to prevent * potential alignment issue when casting. */ alignas(8) char value[MAX_LOCAL_BYTES]; + + /** Switch for different assert checking */ + bool relaxedFor32bit_; }; } // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/Architecture.hh b/src/include/simeng/arch/Architecture.hh index 4dbe377587..edd404c827 100644 --- a/src/include/simeng/arch/Architecture.hh +++ b/src/include/simeng/arch/Architecture.hh @@ -14,6 +14,12 @@ using MacroOp = std::vector>; namespace arch { +/** Modes. 
Assume only has 32-bit and 64-bit. */ +enum arch_mode { + ARCH_32BIT=1, + ARCH_64BIT=0 +}; + /** The types of changes that can be made to values within the process state. */ enum class ChangeType { REPLACEMENT, INCREMENT, DECREMENT }; @@ -109,6 +115,19 @@ class Architecture { /** Updates System registers of any system-based timers. */ virtual void updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations) const = 0; + + /** Update trace file */ + virtual void updateInstrTrace(const std::shared_ptr& instruction, + RegisterFileSet* regFile, uint64_t tick) const = 0; + + /** Return the mode (32-bit or 64-bit) */ + arch_mode is32BitMode() const { + return is32Bit_; + } + + protected: + /** Mode, either 32-bit or 64-bit */ + arch_mode is32Bit_; }; } // namespace arch diff --git a/src/include/simeng/arch/aarch64/Architecture.hh b/src/include/simeng/arch/aarch64/Architecture.hh index 39d399dc7d..ad14dc1c0e 100644 --- a/src/include/simeng/arch/aarch64/Architecture.hh +++ b/src/include/simeng/arch/aarch64/Architecture.hh @@ -84,6 +84,10 @@ class Architecture : public arch::Architecture { /** Update the value of SVCRval_. */ void setSVCRval(const uint64_t newVal) const; + /** Update trace file */ + void updateInstrTrace(const std::shared_ptr& instruction, + RegisterFileSet* regFile, uint64_t tick) const override; + private: /** A decoding cache, mapping an instruction word to a previously decoded * instruction. Instructions are added to the cache as they're decoded, to diff --git a/src/include/simeng/arch/riscv/Architecture.hh b/src/include/simeng/arch/riscv/Architecture.hh index 9e8c169f7a..de6c76c71e 100644 --- a/src/include/simeng/arch/riscv/Architecture.hh +++ b/src/include/simeng/arch/riscv/Architecture.hh @@ -2,6 +2,8 @@ #include #include +#include +#include #include "simeng/arch/Architecture.hh" #include "simeng/arch/riscv/ExceptionHandler.hh" @@ -14,6 +16,32 @@ namespace simeng { namespace arch { namespace riscv { +enum riscv_sysreg { + SYSREG_MSTATUS = 0x300, + SYSREG_MSTATUSH = 0x310, + SYSREG_MEPC = 0x341, + SYSREG_MCAUSE = 0x342, + SYSREG_MHARTID = 0xF14, + SYSREG_CYCLE = 0xC00, + SYSREG_TIME = 0xC01, + SYSREG_INSTRRET = 0xC02 +}; + +struct constantsPool { + const uint8_t alignMask = 0x3; + const uint8_t alignMaskCompressed = 0x1; + const uint8_t bytesLimit = 4; + const uint8_t bytesLimitCompressed = 2; + const uint8_t byteLength64 = 8; + const uint8_t byteLength32 = 4; +}; + +struct archConstants { + uint8_t alignMask; + uint8_t bytesLimit; /* Minimum bytes the decoder needs to process */ + uint8_t regWidth; /* Register width in bytes */ +}; + /* A basic RISC-V implementation of the `Architecture` interface. */ class Architecture : public arch::Architecture { public: @@ -63,6 +91,13 @@ class Architecture : public arch::Architecture { std::vector getConfigPhysicalRegisterQuantities( YAML::Node config) const override; + /** Update trace file */ + void updateInstrTrace(const std::shared_ptr& instruction, + RegisterFileSet* regFile, uint64_t tick) const override; + + /** Return a struct contains constants */ + archConstants getConstants() const; + private: /** Retrieve an executionInfo object for the requested instruction. If a * opcode-based override has been defined for the latency and/or @@ -95,6 +130,15 @@ class Architecture : public arch::Architecture { /** A reference to a Linux kernel object to forward syscalls to. 
*/ kernel::Linux& linux_; + + /** A pointer to the trace file */ + std::ofstream *traceFile_; + + /** Switch for updateInstrTrace() */ + bool traceOn_ = false; + + /** A struct contains constants */ + archConstants constants_; }; } // namespace riscv diff --git a/src/include/simeng/arch/riscv/Instruction.hh b/src/include/simeng/arch/riscv/Instruction.hh index 61b83037ca..3f023d28b6 100644 --- a/src/include/simeng/arch/riscv/Instruction.hh +++ b/src/include/simeng/arch/riscv/Instruction.hh @@ -47,9 +47,23 @@ enum class InstructionException { SupervisorCall, HypervisorCall, SecureMonitorCall, + UnmappedSysReg, NoAvailablePort }; +enum CInstructionFormat { + CIF_CR, + CIF_CI, + CIF_CSS, + CIF_CIW, + CIF_CL, + CIF_CS, + CIF_CA, + CIF_CB, + CIF_CJ, + CIF_INVALID +}; + /** A basic RISC-V implementation of the `Instruction` interface. */ class Instruction : public simeng::Instruction { public: @@ -163,13 +177,22 @@ class Instruction : public simeng::Instruction { * automatically supplied as zero. */ static const Register ZERO_REGISTER; + static const Register RA_REGISTER; + static const Register SP_REGISTER; + + /** Set register byte width */ + void setArchRegWidth(uint8_t len); + + /** ONLY valid after decode. Return regByteWidth */ + uint8_t getArchRegWidth() const; + private: /** The maximum number of source registers any supported RISC-V instruction * can have. */ static const uint8_t MAX_SOURCE_REGISTERS = 2; /** The maximum number of destination registers any supported RISC-V * instruction can have. */ - static const uint8_t MAX_DESTINATION_REGISTERS = 1; + static const uint8_t MAX_DESTINATION_REGISTERS = 2; //CSRs can be another destination apart from std RD /** A reference to the ISA instance this instruction belongs to. */ const Architecture& architecture_; @@ -198,11 +221,19 @@ class Instruction : public simeng::Instruction { /** The current exception state of this instruction. */ InstructionException exception_ = InstructionException::None; + /** The length of instruction in bytes. */ + uint8_t archRegWidth_; + // Decoding /** Process the instruction's metadata to determine source/destination * registers. */ void decode(); + bool decode16(); + + /** Deal with CSR when decoding */ + bool decodeCsr(); + /** Invalidate instructions that are currently not yet implemented. This prevents errors during speculated branches with unknown destinations; non-executable assertions. memory is decoded into valid but not implemented @@ -238,6 +269,13 @@ class Instruction : public simeng::Instruction { bool isLogical_ = false; /** Is this a compare instruction? */ bool isCompare_ = false; + /** Is this a csr operation instruction? */ + bool isCsr_ = false; + + CInstructionFormat instFormat_ = CIF_INVALID; + + /** Extracted value of current immediate from metadata */ + uint32_t c_imm = 0; // Memory /** Set the accessed memory addresses, and create a corresponding memory data @@ -252,6 +290,9 @@ class Instruction : public simeng::Instruction { * for sending to memory (according to instruction type). Each entry * corresponds to a `memoryAddresses` entry. 
*/ std::vector memoryData; + + /** Return integer register value, to support both 32-bit and 64-bit mode */ + int64_t getSignedInt(RegisterValue& value) const; }; } // namespace riscv diff --git a/src/include/simeng/models/emulation/Core.hh b/src/include/simeng/models/emulation/Core.hh index 9152c6df03..c4a4acc453 100644 --- a/src/include/simeng/models/emulation/Core.hh +++ b/src/include/simeng/models/emulation/Core.hh @@ -11,6 +11,9 @@ #include "simeng/arch/Architecture.hh" #include "simeng/span.hh" +// TODO: This is architecture-specific, need to be refactored later. See comments in Core.cc +#include "simeng/arch/riscv/Architecture.hh" + namespace simeng { namespace models { namespace emulation { diff --git a/src/include/simeng/pipeline/PipelineBuffer1.hh b/src/include/simeng/pipeline/PipelineBuffer1.hh new file mode 100644 index 0000000000..dd2ed70ce7 --- /dev/null +++ b/src/include/simeng/pipeline/PipelineBuffer1.hh @@ -0,0 +1,107 @@ +#pragma once + +#include +#include +#include + +namespace simeng { +namespace pipeline_hi { + +/** A tickable pipelined buffer. Values are shifted from the tail slot to the + * head slot each time `tick()` is called. */ +template +class PipelineBuffer { + public: + /** Construct a pipeline buffer of width `width`, and fill all slots with + * `initialValue`. */ + PipelineBuffer(int width, const T& initialValue) + : width(width), buffer(width * defaultLength_, initialValue), + length_(defaultLength_), headIndex_(defaultLength_-1), + tailIndex_(0) {} + + PipelineBuffer(int width, const T& initialValue, int length) + : width(width), buffer(width * length, initialValue), length_(length), + headIndex_(length_-1), tailIndex_(0) { + assert(length_ != 0 && "Pipeline buffer length cannot be 0"); + } + + /** Tick the buffer and move head/tail pointers, or do nothing if it's + * stalled. */ + void tick() { + if (isStalled_) return; + + //length ==1 shortcut? condition check cost + + if (headIndex_) { // when headIndex != 0 + headIndex_--; + } else { + headIndex_ = length_ - 1; + } + if (tailIndex_) { // when tailIndex != 0 + tailIndex_--; + } else { + tailIndex_ = length_ - 1; + } + } + + /** Get a tail slots pointer. */ + T* getTailSlots() { + T* ptr = buffer.data(); + return &ptr[tailIndex_ * width]; + } + + /** Get a const tail slots pointer. */ + const T* getTailSlots() const { + const T* ptr = buffer.data(); + return &ptr[tailIndex_ * width]; + } + + /** Get a head slots pointer. */ + T* getHeadSlots() { + T* ptr = buffer.data(); + return &ptr[headIndex_ * width]; + } + + /** Get a const head slots pointer. */ + const T* getHeadSlots() const { + const T* ptr = buffer.data(); + return &ptr[headIndex_ * width]; + } + + /** Check if the buffer is stalled. */ + bool isStalled() const { return isStalled_; } + + /** Set the buffer's stall flag to `stalled`. */ + void stall(bool stalled) { isStalled_ = stalled; } + + /** Fill the buffer with a specified value. */ + void fill(const T& value) { std::fill(buffer.begin(), buffer.end(), value); } + + /** Get the width of the buffer slots. */ + unsigned short getWidth() const { return width; } + + private: + /** The width of each row of slots. */ + unsigned short width; + + /** The buffer. */ + std::vector buffer; + + /** Whether the buffer is stalled or not. */ + bool isStalled_ = false; + + /** Buffer length */ + const unsigned int length_; + + /** */ + unsigned int headIndex_; + + /** */ + unsigned int tailIndex_; + + /** The number of stages in the pipeline. 
*/ + static const unsigned int defaultLength_ = 2; +}; + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/lib/CoreInstance.cc b/src/lib/CoreInstance.cc index ddf53b20bf..8ba06c8e08 100644 --- a/src/lib/CoreInstance.cc +++ b/src/lib/CoreInstance.cc @@ -233,7 +233,8 @@ void CoreInstance::createCore() { } // Create the architecture, with knowledge of the kernel - if (config_["Core"]["ISA"].as() == "rv64") { + if (config_["Core"]["ISA"].as() == "rv64" || + config_["Core"]["ISA"].as() == "rv32") { arch_ = std::make_unique(kernel_, config_); } else if (config_["Core"]["ISA"].as() == "AArch64") { diff --git a/src/lib/Elf.cc b/src/lib/Elf.cc index 6654cc86a8..6281598403 100644 --- a/src/lib/Elf.cc +++ b/src/lib/Elf.cc @@ -47,127 +47,248 @@ Elf::Elf(std::string path, char** imagePointer) { // Check whether this is a 32 or 64-bit executable char bitFormat; file.read(&bitFormat, sizeof(bitFormat)); - if (bitFormat != ElfBitFormat::Format64) { + if (bitFormat != ElfBitFormat::Format32 && bitFormat != ElfBitFormat::Format64) { return; } + mode32bit_ = (bitFormat == ElfBitFormat::Format32); + isValid_ = true; - /** - * Starting from the 24th byte of the ELF header a 64-bit value - * represents the virtual address to which the system first transfers - * control, thus starting the process. - * In `elf64_hdr` this value maps to the member `Elf64_Addr e_entry`. - */ + if (bitFormat == ElfBitFormat::Format64) { + /** + * Starting from the 24th byte of the ELF header a 64-bit value + * represents the virtual address to which the system first transfers + * control, thus starting the process. + * In `elf64_hdr` this value maps to the member `Elf64_Addr e_entry`. + */ - // Seek to the entry point of the file. - // The information in between is discarded - file.seekg(0x18); - file.read(reinterpret_cast(&entryPoint_), sizeof(entryPoint_)); + // Seek to the entry point of the file. + // The information in between is discarded + file.seekg(0x18); + file.read(reinterpret_cast(&entryPoint_), sizeof(entryPoint_)); - /** - * Starting from the 32nd byte of the ELF Header a 64-bit value - * represents the offset of the ELF Program header or - * Program header table in the ELF file. - * In `elf64_hdr` this value maps to the member `Elf64_Addr e_phoff`. - */ + /** + * Starting from the 32nd byte of the ELF Header a 64-bit value + * represents the offset of the ELF Program header or + * Program header table in the ELF file. + * In `elf64_hdr` this value maps to the member `Elf64_Addr e_phoff`. + */ - // Seek to the byte representing the start of the header offset table. - uint64_t headerOffset; - file.read(reinterpret_cast(&headerOffset), sizeof(headerOffset)); + // Seek to the byte representing the start of the header offset table. + uint64_t headerOffset; + file.read(reinterpret_cast(&headerOffset), sizeof(headerOffset)); - /** - * Starting 54th byte of the ELF Header a 16-bit value indicates - * the size of each entry in the ELF Program header. In the `elf64_hdr` - * struct this value maps to the member `Elf64_Half e_phentsize`. All - * header entries have the same size. - * Starting from the 56th byte a 16-bit value represents the number - * of header entries in the ELF Program header. In the `elf64_hdr` - * struct this value maps to `Elf64_Half e_phnum`. - */ + /** + * Starting 54th byte of the ELF Header a 16-bit value indicates + * the size of each entry in the ELF Program header. In the `elf64_hdr` + * struct this value maps to the member `Elf64_Half e_phentsize`. 
All + * header entries have the same size. + * Starting from the 56th byte a 16-bit value represents the number + * of header entries in the ELF Program header. In the `elf64_hdr` + * struct this value maps to `Elf64_Half e_phnum`. + */ + + // Seek to the byte representing header entry size. + file.seekg(0x36); + uint16_t headerEntrySize; + file.read(reinterpret_cast(&headerEntrySize), sizeof(headerEntrySize)); + uint16_t headerEntries; + file.read(reinterpret_cast(&headerEntries), sizeof(headerEntries)); + + // Resize the header to equal the number of header entries. + headers_.resize(headerEntries); + processImageSize_ = 0; + + // Loop over all headers and extract them. + for (size_t i = 0; i < headerEntries; i++) { + // Since all headers entries have the same size. + // We can extract the nth header using the header offset + // and header entry size. + file.seekg(headerOffset + (i * headerEntrySize)); + auto& header = headers_[i]; - // Seek to the byte representing header entry size. - file.seekg(0x36); - uint16_t headerEntrySize; - file.read(reinterpret_cast(&headerEntrySize), sizeof(headerEntrySize)); - uint16_t headerEntries; - file.read(reinterpret_cast(&headerEntries), sizeof(headerEntries)); - - // Resize the header to equal the number of header entries. - headers_.resize(headerEntries); - processImageSize_ = 0; - - // Loop over all headers and extract them. - for (size_t i = 0; i < headerEntries; i++) { - // Since all headers entries have the same size. - // We can extract the nth header using the header offset - // and header entry size. - file.seekg(headerOffset + (i * headerEntrySize)); - auto& header = headers_[i]; + /** + * Like the ELF Header, the ELF Program header is also defined + * using a struct: + * typedef struct { + * uint32_t p_type; + * uint32_t p_flags; + * Elf64_Off p_offset; + * Elf64_Addr p_vaddr; + * Elf64_Addr p_paddr; + * uint64_t p_filesz; + * uint64_t p_memsz; + * uint64_t p_align; + * } Elf64_Phdr; + * + * The ELF Program header table is an array of structures, + * each describing a segment or other information the system + * needs to prepare the program for execution. A segment + * contains one or more sections (ELF Program Section). + * + * The `p_vaddr` field holds the virtual address at which the first + * byte of the segment resides in memory and the `p_memsz` field + * holds the number of bytes in the memory image of the segment. + * It may be zero. The `p_offset` member holds the offset from the + * beginning of the file at which the first byte of the segment resides. + */ + // Each address-related field is 8 bytes in a 64-bit ELF file + const int fieldBytes = 8; + file.read(reinterpret_cast(&(header.type)), sizeof(header.type)); + file.seekg(4, std::ios::cur); // Skip flags + file.read(reinterpret_cast(&(header.offset)), fieldBytes); + file.read(reinterpret_cast(&(header.virtualAddress)), fieldBytes); + file.read(reinterpret_cast(&(header.physicalAddress)), fieldBytes); + file.read(reinterpret_cast(&(header.fileSize)), fieldBytes); + file.read(reinterpret_cast(&(header.memorySize)), fieldBytes); + + // To construct the process we look for the largest virtual address and + // add it to the memory size of the header. This way we obtain a very + // large array which can hold data at large virtual address. + // However, this way we end up creating a sparse array, in which most + // of the entries are unused. Also SimEng internally treats these + // virtual address as physical addresses to index into this large array. 
+ if (header.virtualAddress + header.memorySize > processImageSize_) { + processImageSize_ = header.virtualAddress + header.memorySize; + } + } + + *imagePointer = (char*)malloc(processImageSize_ * sizeof(char)); /** - * Like the ELF Header, the ELF Program header is also defined - * using a struct: - * typedef struct { - * uint32_t p_type; - * uint32_t p_flags; - * Elf64_Off p_offset; - * Elf64_Addr p_vaddr; - * Elf64_Addr p_paddr; - * uint64_t p_filesz; - * uint64_t p_memsz; - * uint64_t p_align; - * } Elf64_Phdr; - * - * The ELF Program header table is an array of structures, - * each describing a segment or other information the system - * needs to prepare the program for execution. A segment - * contains one or more sections (ELF Program Section). - * - * The `p_vaddr` field holds the virtual address at which the first - * byte of the segment resides in memory and the `p_memsz` field - * holds the number of bytes in the memory image of the segment. - * It may be zero. The `p_offset` member holds the offset from the - * beginning of the file at which the first byte of the segment resides. + * The ELF Program header has a member called `p_type`, which represents + * the kind of data or memory segments described by the program header. + * The value PT_LOAD=1 represents a loadable segment. In other words, + * it contains initialized data that contributes to the program's + * memory image. */ - // Each address-related field is 8 bytes in a 64-bit ELF file - const int fieldBytes = 8; - file.read(reinterpret_cast(&(header.type)), sizeof(header.type)); - file.seekg(4, std::ios::cur); // Skip flags - file.read(reinterpret_cast(&(header.offset)), fieldBytes); - file.read(reinterpret_cast(&(header.virtualAddress)), fieldBytes); - file.read(reinterpret_cast(&(header.physicalAddress)), fieldBytes); - file.read(reinterpret_cast(&(header.fileSize)), fieldBytes); - file.read(reinterpret_cast(&(header.memorySize)), fieldBytes); - - // To construct the process we look for the largest virtual address and - // add it to the memory size of the header. This way we obtain a very - // large array which can hold data at large virtual address. - // However, this way we end up creating a sparse array, in which most - // of the entries are unused. Also SimEng internally treats these - // virtual address as physical addresses to index into this large array. - if (header.virtualAddress + header.memorySize > processImageSize_) { - processImageSize_ = header.virtualAddress + header.memorySize; + // Process headers; only observe LOAD sections for this basic implementation + for (const auto& header : headers_) { + if (header.type == 1) { // LOAD + file.seekg(header.offset); + // Read `fileSize` bytes from `file` into the appropriate place in process + // memory + file.read(*imagePointer + header.virtualAddress, header.fileSize); + } } - } + } else { + /** + * Starting from the 24th byte of the ELF header a 32-bit value + * represents the virtual address to which the system first transfers + * control, thus starting the process. + * In `elf32_hdr` this value maps to the member `Elf32_Addr e_entry`. + */ - *imagePointer = (char*)malloc(processImageSize_ * sizeof(char)); - /** - * The ELF Program header has a member called `p_type`, which represents - * the kind of data or memory segments described by the program header. - * The value PT_LOAD=1 represents a loadable segment. In other words, - * it contains initialized data that contributes to the program's - * memory image. - */ + // Seek to the entry point of the file. 
+ // The information in between is discarded + file.seekg(0x18); + file.read(reinterpret_cast(&entryPoint32_), sizeof(entryPoint32_)); + + /** + * Starting from the 32nd byte of the ELF Header a 64-bit value + * represents the offset of the ELF Program header or + * Program header table in the ELF file. + * In `elf32_hdr` this value maps to the member `Elf32_Addr e_phoff`. + */ + + // Seek to the byte representing the start of the header offset table. + uint32_t headerOffset; + file.read(reinterpret_cast(&headerOffset), sizeof(headerOffset)); + + /** + * Starting 42th byte of the ELF Header a 16-bit value indicates + * the size of each entry in the ELF Program header. In the `elf32_hdr` + * struct this value maps to the member `Elf32_Half e_phentsize`. All + * header entries have the same size. + * Starting from the 44th byte a 16-bit value represents the number + * of header entries in the ELF Program header. In the `elf32_hdr` + * struct this value maps to `Elf32_Half e_phnum`. + */ + + // Seek to the byte representing header entry size. + file.seekg(0x2a); + uint16_t headerEntrySize; + file.read(reinterpret_cast(&headerEntrySize), sizeof(headerEntrySize)); + uint16_t headerEntries; + file.read(reinterpret_cast(&headerEntries), sizeof(headerEntries)); - // Process headers; only observe LOAD sections for this basic implementation - for (const auto& header : headers_) { - if (header.type == 1) { // LOAD - file.seekg(header.offset); - // Read `fileSize` bytes from `file` into the appropriate place in process - // memory - file.read(*imagePointer + header.virtualAddress, header.fileSize); + // Resize the header to equal the number of header entries. + headers32_.resize(headerEntries); + processImageSize_ = 0; + + // Loop over all headers and extract them. + for (size_t i = 0; i < headerEntries; i++) { + // Since all headers entries have the same size. + // We can extract the nth header using the header offset + // and header entry size. + file.seekg(headerOffset + (i * headerEntrySize)); + auto& header = headers32_[i]; + + /** + * Like the ELF Header, the ELF Program header is also defined + * using a struct: + * typedef struct { + * uint32_t p_type; + * Elf32_Off p_offset; + * Elf32_Addr p_vaddr; + * Elf32_Addr p_paddr; + * uint32_t p_filesz; + * uint32_t p_memsz; + * uint32_t p_flags; + * uint32_t p_align; + * } Elf32_Phdr; + * + * The ELF Program header table is an array of structures, + * each describing a segment or other information the system + * needs to prepare the program for execution. A segment + * contains one or more sections (ELF Program Section). + * + * The `p_vaddr` field holds the virtual address at which the first + * byte of the segment resides in memory and the `p_memsz` field + * holds the number of bytes in the memory image of the segment. + * It may be zero. The `p_offset` member holds the offset from the + * beginning of the file at which the first byte of the segment resides. 
+ */ + + // Each address-related field is 4 bytes in a 32-bit ELF file + const int fieldBytes = 4; + file.read(reinterpret_cast(&(header.type)), sizeof(header.type)); + file.read(reinterpret_cast(&(header.offset)), fieldBytes); + file.read(reinterpret_cast(&(header.virtualAddress)), fieldBytes); + file.read(reinterpret_cast(&(header.physicalAddress)), fieldBytes); + file.read(reinterpret_cast(&(header.fileSize)), fieldBytes); + file.read(reinterpret_cast(&(header.memorySize)), fieldBytes); + + // To construct the process we look for the largest virtual address and + // add it to the memory size of the header. This way we obtain a very + // large array which can hold data at large virtual address. + // However, this way we end up creating a sparse array, in which most + // of the entries are unused. Also SimEng internally treats these + // virtual address as physical addresses to index into this large array. + if (header.virtualAddress + header.memorySize > processImageSize_) { + processImageSize_ = header.virtualAddress + header.memorySize; + } + } + + *imagePointer = (char*)malloc(processImageSize_ * sizeof(char)); + /** + * The ELF Program header has a member called `p_type`, which represents + * the kind of data or memory segments described by the program header. + * The value PT_LOAD=1 represents a loadable segment. In other words, + * it contains initialized data that contributes to the program's + * memory image. + */ + + // Process headers; only observe LOAD sections for this basic implementation + for (const auto& header : headers32_) { + if (header.type == 1) { // LOAD + file.seekg(header.offset); + // Read `fileSize` bytes from `file` into the appropriate place in process + // memory + file.read(*imagePointer + header.virtualAddress, header.fileSize); + } } } @@ -179,7 +300,12 @@ Elf::~Elf() {} uint64_t Elf::getProcessImageSize() const { return processImageSize_; } -uint64_t Elf::getEntryPoint() const { return entryPoint_; } +uint64_t Elf::getEntryPoint() const { + if (mode32bit_) { + return entryPoint32_; + } else + return entryPoint_; +} bool Elf::isValid() const { return isValid_; } diff --git a/src/lib/ModelConfig.cc b/src/lib/ModelConfig.cc index 60117a8053..88cc1f7d59 100644 --- a/src/lib/ModelConfig.cc +++ b/src/lib/ModelConfig.cc @@ -67,7 +67,7 @@ void ModelConfig::validate() { "Streaming-Vector-Length"}; validISA = nodeChecker( configFile_[root][subFields[0]], subFields[0], - std::vector({"AArch64", "rv64"}), ExpectedValue::String); + std::vector({"AArch64", "rv64", "rv32"}), ExpectedValue::String); nodeChecker(configFile_[root][subFields[1]], subFields[1], {"emulation", "inorderpipelined", "outoforder"}, ExpectedValue::String); @@ -146,7 +146,8 @@ void ModelConfig::validate() { 1, group.as().size())); configFile_["Ports"][i]["Instruction-Opcode-Support"][opcodeIndex] = opcode; - if (configFile_["Core"]["ISA"].as() == "rv64") { + if (configFile_["Core"]["ISA"].as() == "rv64" || + configFile_["Core"]["ISA"].as() == "rv32") { // Ensure opcode is between the bounds of 0 and Capstones' // RISCV_INSTRUCTION_LIST_END boundChecker( @@ -233,7 +234,8 @@ void ModelConfig::validate() { // TODO make as many subfields as possible generic to avoid repeated code // e.g. 
AArch64 FloatingPoint/SVE-Count -> FloatingPoint-Count - if (configFile_["Core"]["ISA"].as() == "rv64") { + if (configFile_["Core"]["ISA"].as() == "rv64" || + configFile_["Core"]["ISA"].as() == "rv32") { // Register-Set root = "Register-Set"; subFields = {"GeneralPurpose-Count", "FloatingPoint-Count"}; @@ -707,7 +709,8 @@ void ModelConfig::createGroupMapping() { "STORE_ADDRESS_SME", "STORE_DATA_SME", "STORE_SME"}; - } else if (configFile_["Core"]["ISA"].as() == "rv64") { + } else if (configFile_["Core"]["ISA"].as() == "rv64" || + configFile_["Core"]["ISA"].as() == "rv32") { groupOptions_ = {"INT", "INT_SIMPLE", "INT_SIMPLE_ARTH", diff --git a/src/lib/arch/aarch64/Architecture.cc b/src/lib/arch/aarch64/Architecture.cc index 08b807eb97..23ebf86ae3 100644 --- a/src/lib/arch/aarch64/Architecture.cc +++ b/src/lib/arch/aarch64/Architecture.cc @@ -325,6 +325,10 @@ void Architecture::setSVCRval(const uint64_t newVal) const { SVCRval_ = newVal; } +void Architecture::updateInstrTrace(const std::shared_ptr& instruction, + simeng::RegisterFileSet* regFile, uint64_t tick) const { + } + } // namespace aarch64 } // namespace arch } // namespace simeng diff --git a/src/lib/arch/riscv/Architecture.cc b/src/lib/arch/riscv/Architecture.cc index 5d18349dc5..d1a18777e8 100644 --- a/src/lib/arch/riscv/Architecture.cc +++ b/src/lib/arch/riscv/Architecture.cc @@ -16,7 +16,25 @@ std::forward_list Architecture::metadataCache; Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) : linux_(kernel) { - cs_err n = cs_open(CS_ARCH_RISCV, CS_MODE_RISCV64, &capstoneHandle); + is32Bit_ = ARCH_64BIT; + if (config["Core"]["ISA"].as() == "rv32") { + is32Bit_ = ARCH_32BIT; + } + + cs_mode csMode = CS_MODE_RISCV64; + constantsPool constantsPool; + + if(is32Bit_) { + csMode = CS_MODE_RISCV32GC; // TODO Note: currently using local (1-line)modified capstone + constants_.alignMask = constantsPool.alignMaskCompressed; + constants_.regWidth = constantsPool.byteLength32; + constants_.bytesLimit = constantsPool.bytesLimitCompressed; + } else { + constants_.alignMask = constantsPool.alignMask; + constants_.regWidth = constantsPool.byteLength64; + constants_.bytesLimit = constantsPool.bytesLimit; + } + cs_err n = cs_open(CS_ARCH_RISCV, csMode, &capstoneHandle); if (n != CS_ERR_OK) { std::cerr << "[SimEng:Architecture] Could not create capstone handle due " "to error " @@ -26,6 +44,16 @@ Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) cs_option(capstoneHandle, CS_OPT_DETAIL, CS_OPT_ON); + // Generate zero-indexed system register map + systemRegisterMap_[SYSREG_MSTATUS] = systemRegisterMap_.size(); + systemRegisterMap_[SYSREG_MSTATUSH] = systemRegisterMap_.size(); + systemRegisterMap_[SYSREG_MEPC] = systemRegisterMap_.size(); + systemRegisterMap_[SYSREG_MCAUSE] = systemRegisterMap_.size(); + systemRegisterMap_[SYSREG_MHARTID] = systemRegisterMap_.size(); + systemRegisterMap_[SYSREG_CYCLE] = systemRegisterMap_.size(); + systemRegisterMap_[SYSREG_TIME] = systemRegisterMap_.size(); + systemRegisterMap_[SYSREG_INSTRRET] = systemRegisterMap_.size(); + // Instantiate an executionInfo entry for each group in the InstructionGroup // namespace. 
for (int i = 0; i < NUM_GROUPS; i++) { @@ -117,19 +145,28 @@ Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) } } } + if (config["Core"]["Trace"].as()) { + traceFile_ = new std::ofstream(); + traceFile_->open("./trace.log"); + traceOn_ = true; + } } Architecture::~Architecture() { cs_close(&capstoneHandle); decodeCache.clear(); metadataCache.clear(); groupExecutionInfo_.clear(); + if(traceOn_) { + traceFile_->close(); + } } uint8_t Architecture::predecode(const void* ptr, uint8_t bytesAvailable, uint64_t instructionAddress, MacroOp& output) const { // Check that instruction address is 4-byte aligned as required by RISC-V - if (instructionAddress & 0x3) { + // 2-byte when Compressed ISA is supported + if (instructionAddress & constants_.alignMask) { // Consume 1-byte and raise a misaligned PC exception auto metadata = InstructionMetadata((uint8_t*)ptr, 1); metadataCache.emplace_front(metadata); @@ -142,8 +179,8 @@ uint8_t Architecture::predecode(const void* ptr, uint8_t bytesAvailable, return 1; } - assert(bytesAvailable >= 4 && - "Fewer than 4 bytes supplied to RISC-V decoder"); + assert(bytesAvailable >= constants_.bytesLimit && + "Fewer than bytes limit supplied to RISC-V decoder"); // Dereference the instruction pointer to obtain the instruction word uint32_t insn; @@ -175,6 +212,8 @@ uint8_t Architecture::predecode(const void* ptr, uint8_t bytesAvailable, Instruction newInsn(*this, metadataCache.front()); // Set execution information for this instruction newInsn.setExecutionInfo(getExecutionInfo(newInsn)); + // Set byte length in instruction + newInsn.setArchRegWidth(constants_.regWidth); // Cache the instruction iter = decodeCache.insert({insn, newInsn}).first; } @@ -187,7 +226,7 @@ uint8_t Architecture::predecode(const void* ptr, uint8_t bytesAvailable, uop->setInstructionAddress(instructionAddress); - return 4; + return iter->second.getMetadata().lenBytes; } executionInfo Architecture::getExecutionInfo(Instruction& insn) const { @@ -216,9 +255,9 @@ std::vector Architecture::getRegisterFileStructures() const { uint16_t numSysRegs = static_cast(systemRegisterMap_.size()); return { - {8, 32}, // General purpose - {8, 32}, // Floating Point - {8, numSysRegs}, // System + {constants_.regWidth, 32}, // General purpose + {constants_.regWidth, 32}, // Floating Point + {constants_.regWidth, numSysRegs}, // System }; } @@ -234,12 +273,17 @@ ProcessStateChange Architecture::getInitialState() const { ProcessStateChange changes; // Set ProcessStateChange type changes.type = ChangeType::REPLACEMENT; - - uint64_t stackPointer = linux_.getInitialStackPointer(); - // Set the stack pointer register changes.modifiedRegisters.push_back({RegisterType::GENERAL, 2}); - changes.modifiedRegisterValues.push_back(stackPointer); - + uint64_t stackPointer; + // TODO: check if this conditional expression is needed + if(is32Bit_) { + stackPointer = (uint32_t)linux_.getInitialStackPointer(); + changes.modifiedRegisterValues.push_back((uint32_t)stackPointer); + } else + { + stackPointer = linux_.getInitialStackPointer(); + changes.modifiedRegisterValues.push_back(stackPointer); + } return changes; } @@ -247,9 +291,9 @@ uint8_t Architecture::getMaxInstructionSize() const { return 4; } std::vector Architecture::getConfigPhysicalRegisterStructure(YAML::Node config) const { - return {{8, config["Register-Set"]["GeneralPurpose-Count"].as()}, - {8, config["Register-Set"]["FloatingPoint-Count"].as()}, - {8, getNumSystemRegisters()}}; + return {{constants_.regWidth, 
config["Register-Set"]["GeneralPurpose-Count"].as()}, + {constants_.regWidth, config["Register-Set"]["FloatingPoint-Count"].as()}, + {constants_.regWidth, getNumSystemRegisters()}}; } std::vector Architecture::getConfigPhysicalRegisterQuantities( @@ -267,6 +311,76 @@ void Architecture::updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations) const { } +void Architecture::updateInstrTrace(const std::shared_ptr& instruction, + RegisterFileSet* regFile, uint64_t tick) const { + if(traceOn_) { + Instruction instr_ = *static_cast(instruction.get()); + auto& metadata = instr_.getMetadata(); + std::stringstream s; + s << "0x" << std::hex << instr_.getInstructionAddress() << " "; + if (tick < 100000000) + s << "t(" << std::setfill('0') << std::setw(8) << std::dec << (uint32_t)tick << ") "; + else + s << "t(" << std::setfill('0') << std::setw(16) << std::dec << (uint32_t)tick << ") "; + s << "("; + if(metadata.len == IL_16B) { + s << "0000"; + } + for(int8_t i=metadata.lenBytes; i>0; i--) { + s << std::hex << std::setfill('0') << std::setw(2) << static_cast(metadata.encoding[i-1]); + } + s << ") "; + s << metadata.mnemonic << " " << metadata.operandStr; + auto sources = instr_.getOperandRegisters(); + auto destinations = instr_.getDestinationRegisters(); + int8_t num_src = (int8_t)sources.size(); + int8_t num_dest = (int8_t)destinations.size(); + if((num_src + num_dest) >0) { + s << " "; + if (num_dest > 0) { + s << "(d: "; + for(int8_t i=0;iget(reg).get(); + if(i < (num_dest-1)) { + s << " "; + } + } + s << ") "; + } + if (num_src > 0) { + s << "(s: "; + for(int8_t i=0;iget(reg).get(); + if(i < (num_src-1)) { + s << " "; + } + } + s << ") "; + } + } + s << std::endl; + *traceFile_ << s.str(); + traceFile_->flush(); //Helps with debugging sometimes as all the state of previous committed instr is written to file. 
+ } +} +archConstants Architecture::getConstants() const { return constants_; } + } // namespace riscv } // namespace arch } // namespace simeng diff --git a/src/lib/arch/riscv/ExceptionHandler.cc b/src/lib/arch/riscv/ExceptionHandler.cc index 8f76c4cc3a..c88448048d 100644 --- a/src/lib/arch/riscv/ExceptionHandler.cc +++ b/src/lib/arch/riscv/ExceptionHandler.cc @@ -97,7 +97,8 @@ bool ExceptionHandler::init() { } case 57: { // close int64_t fd = registerFileSet.get(R0).get(); - stateChange = {ChangeType::REPLACEMENT, {R0}, {linux_.close(fd)}}; + stateChange = {ChangeType::REPLACEMENT, {R0}}; + stateChange.modifiedRegisterValues.push_back(RegisterValue(linux_.close(fd), instruction_.getArchRegWidth())); break; } case 61: { // getdents64 @@ -185,7 +186,8 @@ bool ExceptionHandler::init() { return readBufferThen(bufPtr, count, [=]() { int64_t retval = linux_.write(fd, dataBuffer.data(), count); ProcessStateChange stateChange = { - ChangeType::REPLACEMENT, {R0}, {retval}}; + ChangeType::REPLACEMENT, {R0}}; + stateChange.modifiedRegisterValues.push_back(RegisterValue(retval, instruction_.getArchRegWidth())); return concludeSyscall(stateChange); }); } @@ -354,7 +356,8 @@ bool ExceptionHandler::init() { kernel::stat statOut; stateChange = { - ChangeType::REPLACEMENT, {R0}, {linux_.fstat(fd, statOut)}}; + ChangeType::REPLACEMENT, {R0}}; + stateChange.modifiedRegisterValues.push_back(RegisterValue(linux_.fstat(fd, statOut), instruction_.getArchRegWidth())); stateChange.memoryAddresses.push_back({statbufPtr, sizeof(statOut)}); stateChange.memoryAddressValues.push_back(statOut); break; @@ -554,7 +557,8 @@ bool ExceptionHandler::init() { case 214: { // brk auto result = linux_.brk(registerFileSet.get(R0).get()); stateChange = { - ChangeType::REPLACEMENT, {R0}, {static_cast(result)}}; + ChangeType::REPLACEMENT, {R0}}; + stateChange.modifiedRegisterValues.push_back(RegisterValue(static_cast(result), instruction_.getArchRegWidth())); break; } case 215: { // munmap @@ -823,6 +827,9 @@ void ExceptionHandler::printException(const Instruction& insn) const { case InstructionException::NoAvailablePort: std::cout << "unsupported execution port"; break; + case InstructionException::UnmappedSysReg: + std::cout << "unmapped system register"; + break; default: std::cout << "unknown (id: " << static_cast(exception) << ")"; @@ -835,9 +842,9 @@ void ExceptionHandler::printException(const Instruction& insn) const { << insn.getInstructionAddress() << ": "; auto& metadata = insn.getMetadata(); - for (uint8_t byte : metadata.encoding) { + for (int8_t i = metadata.lenBytes; i > 0; i--) { std::cout << std::setfill('0') << std::setw(2) - << static_cast(byte) << " "; + << static_cast(metadata.encoding[i-1]); } std::cout << std::dec << " "; if (exception == InstructionException::EncodingUnallocated) { diff --git a/src/lib/arch/riscv/Instruction.cc b/src/lib/arch/riscv/Instruction.cc index 530890e9a6..6cfc173b9d 100644 --- a/src/lib/arch/riscv/Instruction.cc +++ b/src/lib/arch/riscv/Instruction.cc @@ -11,6 +11,8 @@ namespace arch { namespace riscv { const Register Instruction::ZERO_REGISTER = {RegisterType::GENERAL, 0}; +const Register Instruction::RA_REGISTER = {RegisterType::GENERAL, 1}; +const Register Instruction::SP_REGISTER = {RegisterType::GENERAL, 2}; Instruction::Instruction(const Architecture& architecture, const InstructionMetadata& metadata) @@ -165,6 +167,10 @@ const std::vector& Instruction::getSupportedPorts() { const InstructionMetadata& Instruction::getMetadata() const { return metadata; } +void 
Instruction::setArchRegWidth(uint8_t len) { archRegWidth_ = len; } + +uint8_t Instruction::getArchRegWidth() const { return archRegWidth_; } + } // namespace riscv } // namespace arch } // namespace simeng diff --git a/src/lib/arch/riscv/InstructionMetadata.cc b/src/lib/arch/riscv/InstructionMetadata.cc index 595f5f6ece..f2b5a9b736 100644 --- a/src/lib/arch/riscv/InstructionMetadata.cc +++ b/src/lib/arch/riscv/InstructionMetadata.cc @@ -14,7 +14,9 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) implicitSourceCount(insn.detail->regs_read_count), implicitDestinationCount(insn.detail->regs_write_count), operandCount(insn.detail->riscv.op_count) { - std::memcpy(encoding, insn.bytes, sizeof(encoding)); + setLength(insn.size); + std::memset(encoding, 0, 4); + std::memcpy(encoding, insn.bytes, insn.size); // Copy printed output std::strncpy(mnemonic, insn.mnemonic, CS_MNEMONIC_SIZE); operandStr = std::string(insn.op_str); @@ -36,7 +38,7 @@ InstructionMetadata::InstructionMetadata(const uint8_t* invalidEncoding, opcode(Opcode::RISCV_INSTRUCTION_LIST_END), implicitSourceCount(0), implicitDestinationCount(0), - operandCount(0) { + operandCount(0), len(IL_INVALID) { assert(bytes <= sizeof(encoding)); std::memcpy(encoding, invalidEncoding, bytes); mnemonic[0] = '\0'; @@ -252,6 +254,28 @@ void InstructionMetadata::alterPseudoInstructions(const cs_insn& insn) { } break; } + case Opcode::RISCV_CSRRW: + case Opcode::RISCV_CSRRS: + case Opcode::RISCV_CSRRC: + case Opcode::RISCV_CSRRWI: + case Opcode::RISCV_CSRRSI: + case Opcode::RISCV_CSRRCI: { + //Extract CSR info + csr = ((uint32_t)encoding[3] << 4) | ((uint32_t)encoding[2] >> 4); + //If there are less than 2 operands provided add necessary x0 operand + if(operandCount == 1) { + if(strcmp(mnemonic, "csrr") == 0) { //csrrs rd,csr,x0 + operands[1].type = RISCV_OP_REG; + operands[1].reg = 1; + } else { //csrrxx x0,csr,rs/imm + operands[1] = operands[0]; + operands[0].type = RISCV_OP_REG; + operands[0].reg = 1; + } + operandCount = 2; + } + break; + } } } @@ -278,6 +302,16 @@ void InstructionMetadata::includeZeroRegisterPosZero() { operandCount = 3; } + +void InstructionMetadata::setLength(uint8_t size) { + lenBytes = size; + switch(size) { + case 2: len = IL_16B; break; + case 4: len = IL_32B; break; + default: len = IL_INVALID; + } +} + } // namespace riscv } // namespace arch } // namespace simeng \ No newline at end of file diff --git a/src/lib/arch/riscv/InstructionMetadata.hh b/src/lib/arch/riscv/InstructionMetadata.hh index af5bebf815..4ce164a346 100644 --- a/src/lib/arch/riscv/InstructionMetadata.hh +++ b/src/lib/arch/riscv/InstructionMetadata.hh @@ -14,6 +14,12 @@ namespace Opcode { #include "RISCVGenInstrInfo.inc" } // namespace Opcode +enum INSTR_LENGTH { + IL_16B, + IL_32B, + IL_INVALID +}; + /** A simplified RISC-V-only version of the Capstone instruction structure. */ struct InstructionMetadata { public: @@ -70,6 +76,13 @@ struct InstructionMetadata { /** The number of explicit operands. */ uint8_t operandCount; + /** The instruction length for variable instruction length support. */ + INSTR_LENGTH len; + uint8_t lenBytes; + + /** RISC-V CSR encoding */ + uint32_t csr = 0; + private: /** Detect instruction aliases and update metadata to match the de-aliased * instruction. 
*/ @@ -85,6 +98,9 @@ struct InstructionMetadata { /** RISC-V helper function * Use register zero as operands[0] and immediate value as operands[2] */ void includeZeroRegisterPosZero(); + + /** Set the byte length of instruction */ + void setLength(uint8_t size); }; } // namespace riscv diff --git a/src/lib/arch/riscv/Instruction_address.cc b/src/lib/arch/riscv/Instruction_address.cc index e893ce3644..52ee7484c0 100644 --- a/src/lib/arch/riscv/Instruction_address.cc +++ b/src/lib/arch/riscv/Instruction_address.cc @@ -31,7 +31,7 @@ span Instruction::generateAddresses() { setMemoryAddresses({{address, 4}}); } else { // Double - setMemoryAddresses({{address, 8}}); + setMemoryAddresses({{address, archRegWidth_}}); } return getGeneratedAddresses(); } @@ -40,7 +40,7 @@ span Instruction::generateAddresses() { case Opcode::RISCV_SD: [[fallthrough]]; case Opcode::RISCV_LD: { - setMemoryAddresses({{address, 8}}); + setMemoryAddresses({{address, archRegWidth_}}); break; } case Opcode::RISCV_SW: @@ -86,7 +86,7 @@ span Instruction::generateAddresses() { case Opcode::RISCV_LR_D_RL: [[fallthrough]]; case Opcode::RISCV_LR_D_AQ_RL: { - setMemoryAddresses({{operands[0].get(), 8}}); + setMemoryAddresses({{operands[0].get(), archRegWidth_}}); break; } case Opcode::RISCV_SC_W: @@ -106,9 +106,22 @@ span Instruction::generateAddresses() { case Opcode::RISCV_SC_D_RL: [[fallthrough]]; case Opcode::RISCV_SC_D_AQ_RL: { - setMemoryAddresses({{operands[1].get(), 8}}); + setMemoryAddresses({{operands[1].get(), archRegWidth_}}); break; } + case Opcode::RISCV_C_LW: + case Opcode::RISCV_C_FLWSP: + case Opcode::RISCV_C_LWSP: { + setMemoryAddresses({{operands[0].get() + c_imm, 4}}); + break; + } + case Opcode::RISCV_C_SW: + case Opcode::RISCV_C_FSWSP: + case Opcode::RISCV_C_SWSP: { + setMemoryAddresses({{operands[1].get() + c_imm, 4}}); + break; + } + default: exceptionEncountered_ = true; exception_ = InstructionException::ExecutionNotYetImplemented; diff --git a/src/lib/arch/riscv/Instruction_decode.cc b/src/lib/arch/riscv/Instruction_decode.cc index 6db263796b..8bdd5041eb 100644 --- a/src/lib/arch/riscv/Instruction_decode.cc +++ b/src/lib/arch/riscv/Instruction_decode.cc @@ -60,6 +60,15 @@ void Instruction::invalidateIfNotImplemented() { return; if (metadata.opcode == Opcode::RISCV_FENCE) return; + //C Extention + if (metadata.opcode >= Opcode::RISCV_C_ADD && + metadata.opcode <= Opcode::RISCV_C_XOR) + return; + //CSR operations + if (metadata.opcode >= Opcode::RISCV_CSRRC && + metadata.opcode <= Opcode::RISCV_CSRRWI) + return; + exception_ = InstructionException::EncodingUnallocated; exceptionEncountered_ = true; return; @@ -77,6 +86,15 @@ void Instruction::decode() { return; } + //Handle Compressed instruction separately for now. 
+ if (decode16()) { + return; + } + + if (decodeCsr()) { + return; + } + // Identify branches switch (metadata.opcode) { case Opcode::RISCV_BEQ: @@ -153,7 +171,7 @@ void Instruction::decode() { if (sourceRegisters[sourceRegisterCount] == Instruction::ZERO_REGISTER) { // Catch zero register references and pre-complete those operands - operands[sourceRegisterCount] = RegisterValue(0, 8); + operands[sourceRegisterCount] = RegisterValue(0, architecture_.getConstants().regWidth); } else { operandsPending++; } @@ -195,7 +213,7 @@ void Instruction::decode() { if (sourceRegisters[sourceRegisterCount] == Instruction::ZERO_REGISTER) { // Catch zero register references and pre-complete those operands - operands[sourceRegisterCount] = RegisterValue(0, 8); + operands[sourceRegisterCount] = RegisterValue(0, architecture_.getConstants().regWidth); } else { operandsPending++; } @@ -258,6 +276,297 @@ void Instruction::decode() { } } +bool Instruction::decode16() { + if (metadata.len != IL_16B) { + return false; + } + + switch (metadata.opcode) { + case Opcode::RISCV_C_JR: + case Opcode::RISCV_C_JALR: + isBranch_ = true; + instFormat_ = CIF_CR; + assert(metadata.operandCount==1 && + metadata.operands[0].type == RISCV_OP_REG && + csRegToRegister(metadata.operands[0].reg) != Instruction::ZERO_REGISTER && + "Invalid operand for JR,JALR:- CR instructions"); + sourceRegisters[sourceRegisterCount++] = csRegToRegister(metadata.operands[0].reg); + operandsPending++; + if (metadata.opcode == Opcode::RISCV_C_JALR) { + destinationRegisters[destinationRegisterCount++] = Instruction::RA_REGISTER; + } + branchType_ = BranchType::Unconditional; + break; + case Opcode::RISCV_C_MV: + instFormat_ = CIF_CR; + assert(metadata.operandCount==2 && + metadata.operands[0].type == RISCV_OP_REG && + metadata.operands[1].type == RISCV_OP_REG && + csRegToRegister(metadata.operands[0].reg) != Instruction::ZERO_REGISTER && + csRegToRegister(metadata.operands[1].reg) != Instruction::ZERO_REGISTER && + "Invalid operand for MV:- CR instructions"); + destinationRegisters[destinationRegisterCount++] = csRegToRegister(metadata.operands[0].reg); + sourceRegisters[sourceRegisterCount++] = csRegToRegister(metadata.operands[1].reg); + operandsPending++; + break; + case Opcode::RISCV_C_EBREAK://TODO + instFormat_ = CIF_CR; + break; + case Opcode::RISCV_C_ADD: + instFormat_ = CIF_CR; + assert(metadata.operandCount==2 && + metadata.operands[0].type == RISCV_OP_REG && + metadata.operands[1].type == RISCV_OP_REG && + csRegToRegister(metadata.operands[0].reg) != Instruction::ZERO_REGISTER && + csRegToRegister(metadata.operands[1].reg) != Instruction::ZERO_REGISTER && + "Invalid operand for MV:- CR instructions"); + destinationRegisters[destinationRegisterCount++] = csRegToRegister(metadata.operands[0].reg); + sourceRegisters[sourceRegisterCount++] = csRegToRegister(metadata.operands[0].reg); + operandsPending++; + sourceRegisters[sourceRegisterCount++] = csRegToRegister(metadata.operands[1].reg); + operandsPending++; + break; + case Opcode::RISCV_C_NOP://TODO + instFormat_ = CIF_CI; + break; + case Opcode::RISCV_C_ADDI: + //case Opcode::RISCV_C_ADDIW: + case Opcode::RISCV_C_LI: + case Opcode::RISCV_C_ADDI16SP: + case Opcode::RISCV_C_LUI: + case Opcode::RISCV_C_SLLI: + instFormat_ = CIF_CI; + assert(metadata.operandCount==2 && + metadata.operands[0].type == RISCV_OP_REG && + metadata.operands[1].type == RISCV_OP_IMM && + csRegToRegister(metadata.operands[0].reg) != Instruction::ZERO_REGISTER && + !(metadata.opcode == Opcode::RISCV_C_LUI && 
csRegToRegister(metadata.operands[0].reg) == Instruction::SP_REGISTER) && + !(metadata.opcode == Opcode::RISCV_C_ADDI16SP && csRegToRegister(metadata.operands[0].reg) != Instruction::SP_REGISTER) && + "Invalid operand for CI instructions"); + if (metadata.opcode != Opcode::RISCV_C_LUI && metadata.opcode != Opcode::RISCV_C_LI ) { + sourceRegisters[sourceRegisterCount++] = csRegToRegister(metadata.operands[0].reg); + operandsPending++; + } + destinationRegisters[destinationRegisterCount++] = csRegToRegister(metadata.operands[0].reg); + c_imm = metadata.operands[1].imm; + break; + case Opcode::RISCV_C_ADDI4SPN: + instFormat_ = CIF_CIW; + assert(metadata.operandCount==3 && + metadata.operands[0].type == RISCV_OP_REG && + metadata.operands[1].type == RISCV_OP_REG && + metadata.operands[2].type == RISCV_OP_IMM && + metadata.operands[2].imm != 0 && + metadata.operands[1].reg == 0x3 && + "Invalid operand for CIW instructions"); + sourceRegisters[sourceRegisterCount++] = Instruction::SP_REGISTER; + operandsPending++; + c_imm = metadata.operands[2].imm; + destinationRegisters[destinationRegisterCount++] = csRegToRegister(metadata.operands[0].reg); + break; + case Opcode::RISCV_C_SUB: + case Opcode::RISCV_C_XOR: + case Opcode::RISCV_C_OR: + case Opcode::RISCV_C_AND: + //case Opcode::RISCV_C_SUBW: + //case Opcode::RISCV_C_ADDW: + instFormat_ = CIF_CA; + assert(metadata.operandCount==2 && + metadata.operands[0].type == RISCV_OP_REG && + metadata.operands[1].type == RISCV_OP_REG && + metadata.operands[0].reg > 8 && + metadata.operands[1].reg > 8 && + "Invalid operand for CA instructions"); + destinationRegisters[destinationRegisterCount++] = csRegToRegister(metadata.operands[0].reg); + sourceRegisters[sourceRegisterCount++] = csRegToRegister(metadata.operands[0].reg); + operandsPending++; + sourceRegisters[sourceRegisterCount++] = csRegToRegister(metadata.operands[1].reg); + operandsPending++; + break; + case Opcode::RISCV_C_SRAI: + case Opcode::RISCV_C_SRLI: + case Opcode::RISCV_C_ANDI: + instFormat_ = CIF_CB; + assert(metadata.operandCount==2 && + metadata.operands[0].type == RISCV_OP_REG && + metadata.operands[1].type == RISCV_OP_IMM && + metadata.operands[0].reg > 8 && + "Invalid operand for CI instructions"); + sourceRegisters[sourceRegisterCount++] = csRegToRegister(metadata.operands[0].reg); + operandsPending++; + destinationRegisters[destinationRegisterCount++] = csRegToRegister(metadata.operands[0].reg); + c_imm = metadata.operands[1].imm; + break; + case Opcode::RISCV_C_BEQZ: + case Opcode::RISCV_C_BNEZ: + isBranch_ = true; + instFormat_ = CIF_CB; + assert(metadata.operandCount==2 && + metadata.operands[0].type == RISCV_OP_REG && + metadata.operands[1].type == RISCV_OP_IMM && + "Invalid operand for CB instructions"); + sourceRegisters[sourceRegisterCount++] = csRegToRegister(metadata.operands[0].reg); + //No zero register check required. 
can assert for register >=X8 + operandsPending++; + c_imm = metadata.operands[1].imm; + branchType_ = BranchType::Conditional; + knownTarget_ = instructionAddress_ + metadata.operands[1].imm; + break; + case Opcode::RISCV_C_FLD: + case Opcode::RISCV_C_FLW: + case Opcode::RISCV_C_LD: + case Opcode::RISCV_C_LW: + instFormat_ = CIF_CL; + isLoad_ = true; + assert(metadata.operandCount==3 && + metadata.operands[0].type == RISCV_OP_REG && + metadata.operands[1].type == RISCV_OP_IMM && + metadata.operands[2].type == RISCV_OP_REG && + metadata.operands[0].reg > 8 && + metadata.operands[2].reg > 8 && + "Invalid operand for CL instructions"); + destinationRegisters[destinationRegisterCount++] = csRegToRegister(metadata.operands[0].reg); + c_imm = metadata.operands[1].imm; + sourceRegisters[sourceRegisterCount++] = csRegToRegister(metadata.operands[2].reg); + operandsPending++; + break; + //case Opcode::RISCV_C_FLDSP: + //case Opcode::RISCV_C_FLWSP: + case Opcode::RISCV_C_LWSP: + //case Opcode::RISCV_C_LDSP: + instFormat_ = CIF_CI; + isLoad_ = true; + assert(metadata.operandCount==3 && + metadata.operands[0].type == RISCV_OP_REG && + metadata.operands[1].type == RISCV_OP_IMM && + metadata.operands[2].type == RISCV_OP_REG && + csRegToRegister(metadata.operands[0].reg) != Instruction::ZERO_REGISTER && + metadata.operands[2].reg == 0x3 && + "Invalid operand for CI instructions"); + destinationRegisters[destinationRegisterCount++] = csRegToRegister(metadata.operands[0].reg); + c_imm = metadata.operands[1].imm; + sourceRegisters[sourceRegisterCount++] = csRegToRegister(metadata.operands[2].reg); + operandsPending++; + break; + case Opcode::RISCV_C_FSD: + case Opcode::RISCV_C_FSW: + case Opcode::RISCV_C_SW: + case Opcode::RISCV_C_SD: + instFormat_ = CIF_CS; + isStore_ = true; + assert(metadata.operandCount==3 && + metadata.operands[0].type == RISCV_OP_REG && + metadata.operands[1].type == RISCV_OP_IMM && + metadata.operands[2].type == RISCV_OP_REG && + metadata.operands[0].reg > 8 && + metadata.operands[2].reg > 8 && + "Invalid operand for CS instructions"); + sourceRegisters[sourceRegisterCount++] = csRegToRegister(metadata.operands[0].reg); + operandsPending++; + c_imm = metadata.operands[1].imm; + sourceRegisters[sourceRegisterCount++] = csRegToRegister(metadata.operands[2].reg); + operandsPending++; + break; + //case Opcode::RISCV_C_FSDSP: + //case Opcode::RISCV_C_FSWSP: + case Opcode::RISCV_C_SWSP: + //case Opcode::RISCV_C_SDSP: + instFormat_ = CIF_CSS; + isStore_ = true; + assert(metadata.operandCount==3 && + metadata.operands[0].type == RISCV_OP_REG && + metadata.operands[1].type == RISCV_OP_IMM && + metadata.operands[2].type == RISCV_OP_REG && + metadata.operands[2].reg == 0x3 && + "Invalid operand for CSS instructions"); + sourceRegisters[sourceRegisterCount] = csRegToRegister(metadata.operands[0].reg); + if (sourceRegisters[sourceRegisterCount] == + Instruction::ZERO_REGISTER) { + // Catch zero register references and pre-complete those operands + operands[sourceRegisterCount] = RegisterValue(0, 4); + } else { + operandsPending++; + } + sourceRegisterCount++; + c_imm = metadata.operands[1].imm; + sourceRegisters[sourceRegisterCount] = csRegToRegister(metadata.operands[2].reg); + sourceRegisterCount++; + operandsPending++; + break; + case Opcode::RISCV_C_J: + case Opcode::RISCV_C_JAL: + instFormat_ = CIF_CJ; + isBranch_ = true; + //Add assertion when first operand is not of type imm + assert(metadata.operandCount==1 && + metadata.operands[0].type == RISCV_OP_IMM && "Invalid operand for CJ 
instructions"); + c_imm = metadata.operands[0].imm; + if (metadata.opcode == Opcode::RISCV_C_JAL) { + destinationRegisters[destinationRegisterCount++] = Instruction::RA_REGISTER; + } + branchType_ = BranchType::Unconditional; + knownTarget_ = instructionAddress_ + metadata.operands[0].imm; + break; + case Opcode::RISCV_C_UNIMP: + break; + } + + assert(instFormat_!= CIF_INVALID && "Invalid format defined for a RISCV compressed instruction"); + return true; +} + +bool Instruction::decodeCsr() { + //CSR operations + if (!(metadata.opcode >= Opcode::RISCV_CSRRC && + metadata.opcode <= Opcode::RISCV_CSRRWI)) { + return false; + } + + isCsr_ = true; + uint32_t sysRegTag = architecture_.getSystemRegisterTag(metadata.csr); + if (sysRegTag == -1) { + exceptionEncountered_ = true; + exception_ = InstructionException::UnmappedSysReg; + sourceRegisterCount = 0; + destinationRegisterCount = 0; + return true; + } + + // CSR becomes first source and destination + sourceRegisters[sourceRegisterCount++] = { + RegisterType::SYSTEM, static_cast(sysRegTag)}; + operandsPending++; + destinationRegisters[destinationRegisterCount++] = { + RegisterType::SYSTEM, static_cast(sysRegTag)}; + + // First operand from metadata is rd, second operand from metadata is rs1 + if (csRegToRegister(metadata.operands[1].reg) != Instruction::ZERO_REGISTER) { + destinationRegisters[destinationRegisterCount++] = + csRegToRegister(metadata.operands[1].reg); + } + + if(metadata.operands[0].type == RISCV_OP_IMM) { + c_imm = metadata.operands[0].imm; + } else if (metadata.operands[0].type == RISCV_OP_REG) { + sourceRegisters[sourceRegisterCount] = csRegToRegister(metadata.operands[0].reg); + if (sourceRegisters[sourceRegisterCount] == + Instruction::ZERO_REGISTER) { + // Catch zero register references and pre-complete those operands + operands[sourceRegisterCount] = RegisterValue(0, 4); + } else { + operandsPending++; + } + sourceRegisterCount++; + } else { + exceptionEncountered_ = true; + exception_ = InstructionException::EncodingNotYetImplemented; + sourceRegisterCount = 0; + destinationRegisterCount = 0; + } + + return true; +} + } // namespace riscv } // namespace arch } // namespace simeng \ No newline at end of file diff --git a/src/lib/arch/riscv/Instruction_execute.cc b/src/lib/arch/riscv/Instruction_execute.cc index 005982a9fc..b7a4a822b4 100644 --- a/src/lib/arch/riscv/Instruction_execute.cc +++ b/src/lib/arch/riscv/Instruction_execute.cc @@ -62,6 +62,10 @@ uint64_t zeroExtend(uint64_t bits, uint64_t msb) { return rightShift; } +inline int64_t Instruction::getSignedInt(RegisterValue& value) const { + return (archRegWidth_ == 4) ? 
(int64_t)value.get() : value.get(); +} + void Instruction::executionNYI() { exceptionEncountered_ = true; exception_ = InstructionException::ExecutionNotYetImplemented; @@ -79,32 +83,38 @@ void Instruction::execute() { executed_ = true; switch (metadata.opcode) { case Opcode::RISCV_LB: { // LB rd,rs1,imm - results[0] = RegisterValue(bitExtend(memoryData[0].get(), 8), 8); + results[0] = RegisterValue(bitExtend(memoryData[0].get(), 8), + archRegWidth_); break; } case Opcode::RISCV_LBU: { // LBU rd,rs1,imm results[0] = - RegisterValue(zeroExtend(memoryData[0].get(), 8), 8); + RegisterValue(zeroExtend(memoryData[0].get(), 8), + archRegWidth_); break; } case Opcode::RISCV_LH: { // LH rd,rs1,imm results[0] = - RegisterValue(bitExtend(memoryData[0].get(), 16), 8); + RegisterValue(bitExtend(memoryData[0].get(), 16), + archRegWidth_); break; } case Opcode::RISCV_LHU: { // LHU rd,rs1,imm results[0] = - RegisterValue(zeroExtend(memoryData[0].get(), 16), 8); + RegisterValue(zeroExtend(memoryData[0].get(), 16), + archRegWidth_); break; } case Opcode::RISCV_LW: { // LW rd,rs1,imm results[0] = - RegisterValue(bitExtend(memoryData[0].get(), 32), 8); + RegisterValue(bitExtend(memoryData[0].get(), 32), + archRegWidth_); break; } case Opcode::RISCV_LWU: { // LWU rd,rs1,imm results[0] = - RegisterValue(zeroExtend(memoryData[0].get(), 32), 8); + RegisterValue(zeroExtend(memoryData[0].get(), 32), + archRegWidth_); break; } case Opcode::RISCV_LD: { // LD rd,rs1,imm @@ -123,19 +133,19 @@ void Instruction::execute() { break; } case Opcode::RISCV_SLL: { // SLL rd,rs1,rs2 - const int64_t rs1 = operands[0].get(); + const int64_t rs1 = getSignedInt(operands[0]); const int64_t rs2 = - operands[1].get() & 63; // Only use lowest 6 bits + getSignedInt(operands[1]) & 63; // Only use lowest 6 bits int64_t out = static_cast(rs1 << rs2); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_SLLI: { // SLLI rd,rs1,shamt - const int64_t rs1 = operands[0].get(); + const int64_t rs1 = getSignedInt(operands[0]); const int64_t shamt = metadata.operands[2].imm & 63; // Only use lowest 6 bits int64_t out = static_cast(rs1 << shamt); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_SLLW: { // SLLW rd,rs1,rs2 @@ -143,7 +153,7 @@ void Instruction::execute() { const int32_t rs2 = operands[1].get() & 63; // Only use lowest 6 bits int64_t out = signExtendW(static_cast(rs1 << rs2)); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_SLLIW: { // SLLIW rd,rs1,shamt @@ -151,7 +161,7 @@ void Instruction::execute() { const int32_t shamt = metadata.operands[2].imm & 63; // Only use lowest 6 bits uint64_t out = signExtendW(static_cast(rs1 << shamt)); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_SRL: { // SRL rd,rs1,rs2 @@ -159,7 +169,7 @@ void Instruction::execute() { const uint64_t rs2 = operands[1].get() & 63; // Only use lowest 6 bits uint64_t out = static_cast(rs1 >> rs2); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_SRLI: { // SRLI rd,rs1,shamt @@ -167,7 +177,7 @@ void Instruction::execute() { const uint64_t shamt = metadata.operands[2].imm & 63; // Only use lowest 6 bits uint64_t out = static_cast(rs1 >> shamt); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case 
Opcode::RISCV_SRLW: { // SRLW rd,rs1,rs2 @@ -175,7 +185,7 @@ void Instruction::execute() { const uint32_t rs2 = operands[1].get() & 63; // Only use lowest 6 bits uint64_t out = signExtendW(static_cast(rs1 >> rs2)); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_SRLIW: { // SRLIW rd,rs1,shamt @@ -183,23 +193,23 @@ void Instruction::execute() { const uint32_t shamt = metadata.operands[2].imm & 63; // Only use lowest 6 bits uint64_t out = signExtendW(static_cast(rs1 >> shamt)); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_SRA: { // SRA rd,rs1,rs2 - const int64_t rs1 = operands[0].get(); + const int64_t rs1 = getSignedInt(operands[0]); const int64_t rs2 = - operands[1].get() & 63; // Only use lowest 6 bits + getSignedInt(operands[1]) & 63; // Only use lowest 6 bits int64_t out = static_cast(rs1 >> rs2); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_SRAI: { // SRAI rd,rs1,shamt - const int64_t rs1 = operands[0].get(); + const int64_t rs1 = getSignedInt(operands[0]); const int64_t shamt = metadata.operands[2].imm & 63; // Only use lowest 6 bits int64_t out = static_cast(rs1 >> shamt); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_SRAW: { // SRAW rd,rs1,rs2 @@ -207,7 +217,7 @@ void Instruction::execute() { const int32_t rs2 = operands[1].get() & 63; // Only use lowest 6 bits int64_t out = static_cast(rs1 >> rs2); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_SRAIW: { // SRAIW rd,rs1,shamt @@ -215,55 +225,55 @@ void Instruction::execute() { const int32_t shamt = metadata.operands[2].imm & 63; // Only use lowest 6 bits int64_t out = static_cast(rs1 >> shamt); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_ADD: { // ADD rd,rs1,rs2 const uint64_t rs1 = operands[0].get(); const uint64_t rs2 = operands[1].get(); uint64_t out = static_cast(rs1 + rs2); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_ADDW: { // ADDW rd,rs1,rs2 const int32_t rs1 = operands[0].get(); const int32_t rs2 = operands[1].get(); int64_t out = static_cast(static_cast(rs1 + rs2)); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_ADDI: { // ADDI rd,rs1,imm const uint64_t rs1 = operands[0].get(); const uint64_t rs2 = metadata.operands[2].imm; uint64_t out = static_cast(rs1 + rs2); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_ADDIW: { // ADDIW rd,rs1,imm const int32_t rs1 = operands[0].get(); const int32_t imm = metadata.operands[2].imm; uint64_t out = signExtendW(rs1 + imm); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_SUB: { // SUB rd,rs1,rs2 const uint64_t rs1 = operands[0].get(); const uint64_t rs2 = operands[1].get(); uint64_t out = static_cast(rs1 - rs2); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_SUBW: { // SUBW rd,rs1,rs2 const int32_t rs1 = operands[0].get(); const int32_t rs2 = operands[1].get(); int64_t out = static_cast(static_cast(rs1 - rs2)); - results[0] = 
RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_LUI: { // LUI rd,imm uint64_t out = signExtendW(metadata.operands[1].imm << 12); // Shift into upper 20 bits - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_AUIPC: { // AUIPC rd,imm @@ -271,58 +281,58 @@ void Instruction::execute() { const int64_t uimm = signExtendW(metadata.operands[1].imm << 12); // Shift into upper 20 bits uint64_t out = static_cast(pc + uimm); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_XOR: { // XOR rd,rs1,rs2 const uint64_t rs1 = operands[0].get(); const uint64_t rs2 = operands[1].get(); uint64_t out = static_cast(rs1 ^ rs2); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_XORI: { // XORI rd,rs1,imm const uint64_t rs1 = operands[0].get(); const uint64_t imm = metadata.operands[2].imm; uint64_t out = static_cast(rs1 ^ imm); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_OR: { // OR rd,rs1,rs2 const uint64_t rs1 = operands[0].get(); const uint64_t rs2 = operands[1].get(); uint64_t out = static_cast(rs1 | rs2); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_ORI: { // ORI rd,rs1,imm const uint64_t rs1 = operands[0].get(); const uint64_t imm = metadata.operands[2].imm; uint64_t out = static_cast(rs1 | imm); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_AND: { // AND rd,rs1,rs2 const uint64_t rs1 = operands[0].get(); const uint64_t rs2 = operands[1].get(); uint64_t out = static_cast(rs1 & rs2); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_ANDI: { // ANDI rd,rs1,imm const uint64_t rs1 = operands[0].get(); const uint64_t imm = metadata.operands[2].imm; uint64_t out = static_cast(rs1 & imm); - results[0] = RegisterValue(out, 8); + results[0] = RegisterValue(out, archRegWidth_); break; } case Opcode::RISCV_SLT: { // SLT rd,rs1,rs2 - const int64_t rs1 = operands[0].get(); - const int64_t rs2 = operands[1].get(); + const int64_t rs1 = getSignedInt(operands[0]); + const int64_t rs2 = getSignedInt(operands[1]); if (rs1 < rs2) { - results[0] = RegisterValue(static_cast(1), 8); + results[0] = RegisterValue(static_cast(1), archRegWidth_); } else { - results[0] = RegisterValue(static_cast(0), 8); + results[0] = RegisterValue(static_cast(0), archRegWidth_); } break; } @@ -330,19 +340,19 @@ void Instruction::execute() { const uint64_t rs1 = operands[0].get(); const uint64_t rs2 = operands[1].get(); if (rs1 < rs2) { - results[0] = RegisterValue(static_cast(1), 8); + results[0] = RegisterValue(static_cast(1), archRegWidth_); } else { - results[0] = RegisterValue(static_cast(0), 8); + results[0] = RegisterValue(static_cast(0), archRegWidth_); } break; } case Opcode::RISCV_SLTI: { // SLTI rd,rs1,imm - const int64_t rs1 = operands[0].get(); + const int64_t rs1 = getSignedInt(operands[0]); const int64_t imm = metadata.operands[2].imm; if (rs1 < imm) { - results[0] = RegisterValue(static_cast(1), 8); + results[0] = RegisterValue(static_cast(1), archRegWidth_); } else { - results[0] = RegisterValue(static_cast(0), 8); + results[0] = RegisterValue(static_cast(0), archRegWidth_); } break; } @@ -350,9 +360,9 @@ void 
Instruction::execute() { const uint64_t rs1 = operands[0].get(); const uint64_t imm = static_cast(metadata.operands[2].imm); if (rs1 < imm) { - results[0] = RegisterValue(static_cast(1), 8); + results[0] = RegisterValue(static_cast(1), archRegWidth_); } else { - results[0] = RegisterValue(static_cast(0), 8); + results[0] = RegisterValue(static_cast(0), archRegWidth_); } break; } @@ -383,8 +393,8 @@ void Instruction::execute() { break; } case Opcode::RISCV_BLT: { // BLT rs1,rs2,imm - const int64_t rs1 = operands[0].get(); - const int64_t rs2 = operands[1].get(); + const int64_t rs1 = getSignedInt(operands[0]); + const int64_t rs2 = getSignedInt(operands[1]); if (rs1 < rs2) { branchAddress_ = instructionAddress_ + metadata.operands[2].imm; // Set LSB of result to 0 @@ -409,8 +419,9 @@ void Instruction::execute() { break; } case Opcode::RISCV_BGE: { // BGE rs1,rs2,imm - const int64_t rs1 = operands[0].get(); - const int64_t rs2 = operands[1].get(); + const int64_t rs1 = getSignedInt(operands[0]); + const int64_t rs2 = getSignedInt(operands[1]); + if (rs1 >= rs2) { branchAddress_ = instructionAddress_ + metadata.operands[2].imm; // Set LSB of result to 0 @@ -438,7 +449,7 @@ void Instruction::execute() { branchAddress_ = instructionAddress_ + metadata.operands[1].imm; // Set LSB of result to 0 branchTaken_ = true; - results[0] = RegisterValue(instructionAddress_ + 4, 8); + results[0] = RegisterValue(instructionAddress_ + 4, archRegWidth_); break; } case Opcode::RISCV_JALR: { // JALR rd,rs1,imm @@ -446,7 +457,7 @@ void Instruction::execute() { (operands[0].get() + metadata.operands[2].imm) & ~1; // Set LSB of result to 0 branchTaken_ = true; - results[0] = RegisterValue(instructionAddress_ + 4, 8); + results[0] = RegisterValue(instructionAddress_ + 4, archRegWidth_); break; } // TODO EBREAK @@ -481,14 +492,15 @@ void Instruction::execute() { // TODO use aq and rl bits to prevent reordering with other memory // operations results[0] = - RegisterValue(bitExtend(memoryData[0].get(), 32), 8); + RegisterValue(bitExtend(memoryData[0].get(), 32), + archRegWidth_); break; } case Opcode::RISCV_LR_D: // LR.D rd,rs1 case Opcode::RISCV_LR_D_AQ: case Opcode::RISCV_LR_D_RL: case Opcode::RISCV_LR_D_AQ_RL: { - results[0] = RegisterValue(memoryData[0].get(), 8); + results[0] = RegisterValue(memoryData[0].get(), archRegWidth_); break; } case Opcode::RISCV_SC_W: // SC.W rd,rs1,rs2 @@ -507,7 +519,7 @@ void Instruction::execute() { // TODO use aq and rl bits to prevent reordering with other memory // operations memoryData[0] = operands[0]; - results[0] = RegisterValue(static_cast(0), 8); + results[0] = RegisterValue(static_cast(0), archRegWidth_); break; } case Opcode::RISCV_AMOSWAP_W: // AMOSWAP.W rd,rs1,rs2 @@ -521,7 +533,7 @@ void Instruction::execute() { // TODO account for AQ and RL bits int64_t rd = signExtendW(memoryData[0].get()); int32_t rs2 = operands[0].get(); - results[0] = RegisterValue(rd, 8); + results[0] = RegisterValue(rd, archRegWidth_); memoryData[0] = rs2; break; } @@ -531,7 +543,7 @@ void Instruction::execute() { case Opcode::RISCV_AMOSWAP_D_AQ_RL: { uint64_t rd = memoryData[0].get(); uint64_t rs2 = operands[0].get(); - results[0] = RegisterValue(rd, 8); + results[0] = RegisterValue(rd, archRegWidth_); memoryData[0] = rs2; break; } @@ -540,7 +552,7 @@ void Instruction::execute() { case Opcode::RISCV_AMOADD_W_RL: case Opcode::RISCV_AMOADD_W_AQ_RL: { int64_t rd = signExtendW(memoryData[0].get()); - results[0] = RegisterValue(rd, 8); + results[0] = RegisterValue(rd, archRegWidth_); memoryData[0] = 
static_cast(rd + operands[0].get()); break; } @@ -549,7 +561,7 @@ void Instruction::execute() { case Opcode::RISCV_AMOADD_D_RL: case Opcode::RISCV_AMOADD_D_AQ_RL: { int64_t rd = memoryData[0].get(); - results[0] = RegisterValue(rd, 8); + results[0] = RegisterValue(rd, archRegWidth_); memoryData[0] = static_cast(rd + operands[0].get()); break; } @@ -558,7 +570,7 @@ void Instruction::execute() { case Opcode::RISCV_AMOAND_W_RL: case Opcode::RISCV_AMOAND_W_AQ_RL: { int64_t rd = signExtendW(memoryData[0].get()); - results[0] = RegisterValue(rd, 8); + results[0] = RegisterValue(rd, archRegWidth_); memoryData[0] = static_cast(rd & operands[0].get()); break; } @@ -567,7 +579,7 @@ void Instruction::execute() { case Opcode::RISCV_AMOAND_D_RL: case Opcode::RISCV_AMOAND_D_AQ_RL: { int64_t rd = memoryData[0].get(); - results[0] = RegisterValue(rd, 8); + results[0] = RegisterValue(rd, archRegWidth_); memoryData[0] = static_cast(rd & operands[0].get()); break; } @@ -576,7 +588,7 @@ void Instruction::execute() { case Opcode::RISCV_AMOOR_W_RL: case Opcode::RISCV_AMOOR_W_AQ_RL: { int64_t rd = signExtendW(memoryData[0].get()); - results[0] = RegisterValue(rd, 8); + results[0] = RegisterValue(rd, archRegWidth_); memoryData[0] = static_cast(rd | operands[0].get()); break; } @@ -585,7 +597,7 @@ void Instruction::execute() { case Opcode::RISCV_AMOOR_D_RL: case Opcode::RISCV_AMOOR_D_AQ_RL: { int64_t rd = memoryData[0].get(); - results[0] = RegisterValue(rd, 8); + results[0] = RegisterValue(rd, archRegWidth_); memoryData[0] = static_cast(rd | operands[0].get()); break; } @@ -594,7 +606,7 @@ void Instruction::execute() { case Opcode::RISCV_AMOXOR_W_RL: case Opcode::RISCV_AMOXOR_W_AQ_RL: { int64_t rd = signExtendW(memoryData[0].get()); - results[0] = RegisterValue(rd, 8); + results[0] = RegisterValue(rd, archRegWidth_); memoryData[0] = static_cast(rd ^ operands[0].get()); break; } @@ -603,7 +615,7 @@ void Instruction::execute() { case Opcode::RISCV_AMOXOR_D_RL: case Opcode::RISCV_AMOXOR_D_AQ_RL: { int64_t rd = memoryData[0].get(); - results[0] = RegisterValue(rd, 8); + results[0] = RegisterValue(rd, archRegWidth_); memoryData[0] = static_cast(rd ^ operands[0].get()); break; } @@ -612,7 +624,8 @@ void Instruction::execute() { case Opcode::RISCV_AMOMIN_W_AQ: case Opcode::RISCV_AMOMIN_W_RL: case Opcode::RISCV_AMOMIN_W_AQ_RL: { - results[0] = RegisterValue(signExtendW(memoryData[0].get()), 8); + results[0] = RegisterValue(signExtendW(memoryData[0].get()), + archRegWidth_); memoryData[0] = std::min(memoryData[0].get(), operands[0].get()); break; @@ -622,7 +635,7 @@ void Instruction::execute() { case Opcode::RISCV_AMOMIN_D_RL: case Opcode::RISCV_AMOMIN_D_AQ_RL: { int64_t rd = memoryData[0].get(); - results[0] = RegisterValue(rd, 8); + results[0] = RegisterValue(rd, archRegWidth_); memoryData[0] = static_cast(std::min(rd, operands[0].get())); break; @@ -631,7 +644,8 @@ void Instruction::execute() { case Opcode::RISCV_AMOMINU_W_AQ: case Opcode::RISCV_AMOMINU_W_RL: case Opcode::RISCV_AMOMINU_W_AQ_RL: { - results[0] = RegisterValue(signExtendW(memoryData[0].get()), 8); + results[0] = RegisterValue(signExtendW(memoryData[0].get()), + archRegWidth_); memoryData[0] = std::min(memoryData[0].get(), operands[0].get()); break; @@ -641,7 +655,7 @@ void Instruction::execute() { case Opcode::RISCV_AMOMINU_D_RL: case Opcode::RISCV_AMOMINU_D_AQ_RL: { uint64_t rd = memoryData[0].get(); - results[0] = RegisterValue(rd, 8); + results[0] = RegisterValue(rd, archRegWidth_); memoryData[0] = static_cast(std::min(rd, operands[0].get())); break; 
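// The hunks above and below replace every hard-coded 8-byte result width with
// archRegWidth_ and read signed operands through getSignedInt(). A minimal
// illustration of why that narrowing matters (a sketch only; it assumes
// archRegWidth_ is 4 when running in rv32 mode, as this patch configures):
//
//   uint64_t raw = 0xFFFFFFFFull;  // an rv32 register holding -1
//   int64_t asIs = static_cast<int64_t>(raw);                           // 4294967295
//   int64_t resigned = static_cast<int64_t>(static_cast<int32_t>(raw)); // -1
//
// Without the int32_t step, rv32 SLT/BLT/DIV/REM would treat negative
// operands as large unsigned values.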
@@ -651,7 +665,8 @@ void Instruction::execute() { case Opcode::RISCV_AMOMAX_W_AQ: case Opcode::RISCV_AMOMAX_W_RL: case Opcode::RISCV_AMOMAX_W_AQ_RL: { - results[0] = RegisterValue(signExtendW(memoryData[0].get()), 8); + results[0] = RegisterValue(signExtendW(memoryData[0].get()), + archRegWidth_); memoryData[0] = std::max(memoryData[0].get(), operands[0].get()); break; @@ -661,7 +676,7 @@ void Instruction::execute() { case Opcode::RISCV_AMOMAX_D_RL: case Opcode::RISCV_AMOMAX_D_AQ_RL: { int64_t rd = memoryData[0].get(); - results[0] = RegisterValue(rd, 8); + results[0] = RegisterValue(rd, archRegWidth_); memoryData[0] = static_cast(std::max(rd, operands[0].get())); break; @@ -670,7 +685,8 @@ void Instruction::execute() { case Opcode::RISCV_AMOMAXU_W_AQ: case Opcode::RISCV_AMOMAXU_W_RL: case Opcode::RISCV_AMOMAXU_W_AQ_RL: { - results[0] = RegisterValue(signExtendW(memoryData[0].get()), 8); + results[0] = RegisterValue(signExtendW(memoryData[0].get()), + archRegWidth_); memoryData[0] = std::max(memoryData[0].get(), operands[0].get()); break; @@ -680,7 +696,7 @@ void Instruction::execute() { case Opcode::RISCV_AMOMAXU_D_RL: case Opcode::RISCV_AMOMAXU_D_AQ_RL: { uint64_t rd = memoryData[0].get(); - results[0] = RegisterValue(rd, 8); + results[0] = RegisterValue(rd, archRegWidth_); memoryData[0] = static_cast(std::max(rd, operands[0].get())); break; @@ -688,9 +704,9 @@ void Instruction::execute() { // Integer multiplication division extension (M) case Opcode::RISCV_MUL: { // MUL rd,rs1,rs2 - const int64_t rs1 = operands[0].get(); - const int64_t rs2 = operands[1].get(); - results[0] = RegisterValue(static_cast(rs1 * rs2), 8); + const int64_t rs1 = getSignedInt(operands[0]); + const int64_t rs2 = getSignedInt(operands[1]); + results[0] = RegisterValue(static_cast(rs1 * rs2), archRegWidth_); break; } // case Opcode::RISCV_MULH: {//MULH rd,rs1,rs2 @@ -704,7 +720,7 @@ void Instruction::execute() { case Opcode::RISCV_MULHU: { // MULHU rd,rs1,rs2 const uint64_t rs1 = operands[0].get(); const uint64_t rs2 = operands[1].get(); - results[0] = RegisterValue(mulhiuu(rs1, rs2), 8); + results[0] = RegisterValue(mulhiuu(rs1, rs2), archRegWidth_); break; } // case Opcode::RISCV_MULHSU: {//MULHSU rd,rs1,rs2 @@ -718,21 +734,21 @@ void Instruction::execute() { case Opcode::RISCV_MULW: { // MULW rd,rs1,rs2 const uint32_t rs1 = operands[0].get(); const uint32_t rs2 = operands[1].get(); - results[0] = RegisterValue(signExtendW(rs1 * rs2), 8); + results[0] = RegisterValue(signExtendW(rs1 * rs2), archRegWidth_); break; } case Opcode::RISCV_DIV: { // DIV rd,rs1,rs2 - const int64_t rs1 = operands[0].get(); - const int64_t rs2 = operands[1].get(); + const int64_t rs1 = getSignedInt(operands[0]); + const int64_t rs2 = getSignedInt(operands[1]); if (rs2 == 0) { // divide by zero - results[0] = RegisterValue(static_cast(-1), 8); + results[0] = RegisterValue(static_cast(-1), archRegWidth_); } else if (rs1 == static_cast(0x8000000000000000) && rs2 == -1) { // division overflow - results[0] = RegisterValue(rs1, 8); + results[0] = RegisterValue(rs1, archRegWidth_); } else { - results[0] = RegisterValue(static_cast(rs1 / rs2), 8); + results[0] = RegisterValue(static_cast(rs1 / rs2), archRegWidth_); } break; } @@ -741,13 +757,15 @@ void Instruction::execute() { const int32_t rs2 = operands[1].get(); if (rs2 == 0) { // divide by zero - results[0] = RegisterValue(static_cast(-1), 8); + results[0] = RegisterValue(static_cast(-1), archRegWidth_); } else if (rs1 == static_cast(0x80000000) && rs2 == -1) { // division overflow - 
results[0] = RegisterValue(static_cast(signExtendW(rs1)), 8); + results[0] = RegisterValue(static_cast(signExtendW(rs1)), + archRegWidth_); } else { results[0] = - RegisterValue(static_cast(signExtendW(rs1 / rs2)), 8); + RegisterValue(static_cast(signExtendW(rs1 / rs2)), + archRegWidth_); } break; } @@ -756,9 +774,9 @@ void Instruction::execute() { const uint64_t rs2 = operands[1].get(); if (rs2 == 0) { // divide by zero - results[0] = RegisterValue(static_cast(-1), 8); + results[0] = RegisterValue(static_cast(-1), archRegWidth_); } else { - results[0] = RegisterValue(static_cast(rs1 / rs2), 8); + results[0] = RegisterValue(static_cast(rs1 / rs2), archRegWidth_); } break; } @@ -767,24 +785,24 @@ void Instruction::execute() { const uint32_t rs2 = operands[1].get(); if (rs2 == 0) { // divide by zero - results[0] = RegisterValue(static_cast(-1), 8); + results[0] = RegisterValue(static_cast(-1), archRegWidth_); } else { results[0] = - RegisterValue(static_cast(signExtendW(rs1 / rs2)), 8); + RegisterValue(static_cast(signExtendW(rs1 / rs2)), archRegWidth_); } break; } case Opcode::RISCV_REM: { // REM rd,rs1,rs2 - const int64_t rs1 = operands[0].get(); - const int64_t rs2 = operands[1].get(); + const int64_t rs1 = getSignedInt(operands[0]); + const int64_t rs2 = getSignedInt(operands[1]); if (rs2 == 0) { // divide by zero - results[0] = RegisterValue(static_cast(rs1), 8); + results[0] = RegisterValue(static_cast(rs1), archRegWidth_); } else if (rs1 == static_cast(0x8000000000000000) && rs2 == -1) { // division overflow - results[0] = RegisterValue(static_cast(0), 8); + results[0] = RegisterValue(static_cast(0), archRegWidth_); } else { - results[0] = RegisterValue(static_cast(rs1 % rs2), 8); + results[0] = RegisterValue(static_cast(rs1 % rs2), archRegWidth_); } break; } @@ -793,13 +811,15 @@ void Instruction::execute() { const int32_t rs2 = operands[1].get(); if (rs2 == 0) { // divide by zero - results[0] = RegisterValue(static_cast(signExtendW(rs1)), 8); + results[0] = RegisterValue(static_cast(signExtendW(rs1)), + archRegWidth_); } else if (rs1 == static_cast(0x80000000) && rs2 == -1) { // division overflow - results[0] = RegisterValue(static_cast(0), 8); + results[0] = RegisterValue(static_cast(0), archRegWidth_); } else { results[0] = - RegisterValue(static_cast(signExtendW(rs1 % rs2)), 8); + RegisterValue(static_cast(signExtendW(rs1 % rs2)), + archRegWidth_); } break; } @@ -808,9 +828,9 @@ void Instruction::execute() { const uint64_t rs2 = operands[1].get(); if (rs2 == 0) { // divide by zero - results[0] = RegisterValue(rs1, 8); + results[0] = RegisterValue(rs1, archRegWidth_); } else { - results[0] = RegisterValue(static_cast(rs1 % rs2), 8); + results[0] = RegisterValue(static_cast(rs1 % rs2), archRegWidth_); } break; } @@ -819,13 +839,214 @@ void Instruction::execute() { const uint32_t rs2 = operands[1].get(); if (rs2 == 0) { // divide by zero - results[0] = RegisterValue(static_cast(signExtendW(rs1)), 8); + results[0] = RegisterValue(static_cast(signExtendW(rs1)), + archRegWidth_); } else { results[0] = - RegisterValue(static_cast(signExtendW(rs1 % rs2)), 8); + RegisterValue(static_cast(signExtendW(rs1 % rs2)), archRegWidth_); } break; } + case Opcode::RISCV_CSRRC: { + uint32_t old_csr_value = operands[0].get(); + uint32_t new_csr_value = old_csr_value & ~(operands[1].get()); + results[0] = RegisterValue(new_csr_value, 4); + results[1] = RegisterValue(old_csr_value, 4); + } + case Opcode::RISCV_CSRRCI: { + uint32_t old_csr_value = operands[0].get(); + uint32_t new_csr_value = 
old_csr_value & ~(c_imm); + results[0] = RegisterValue(new_csr_value, 4); + results[1] = RegisterValue(old_csr_value, 4); + } + case Opcode::RISCV_CSRRS: { + uint32_t old_csr_value = operands[0].get(); + uint32_t new_csr_value = old_csr_value | (operands[1].get()); + results[0] = RegisterValue(new_csr_value, 4); + results[1] = RegisterValue(old_csr_value, 4); + break; + } + case Opcode::RISCV_CSRRSI: { + uint32_t old_csr_value = operands[0].get(); + uint32_t new_csr_value = old_csr_value | (c_imm); + results[0] = RegisterValue(new_csr_value, 4); + results[1] = RegisterValue(old_csr_value, 4); + break; + } + case Opcode::RISCV_CSRRW: { + uint32_t old_csr_value = operands[0].get(); + uint32_t new_csr_value = operands[1].get(); + results[0] = RegisterValue(new_csr_value, 4); + results[1] = RegisterValue(old_csr_value, 4); + break; + } + case Opcode::RISCV_CSRRWI: { + uint32_t old_csr_value = operands[0].get(); + uint32_t new_csr_value = c_imm; + results[0] = RegisterValue(new_csr_value, 4); + results[1] = RegisterValue(old_csr_value, 4); + break; + } + case Opcode::RISCV_C_ADD: { + const uint32_t rs1 = operands[0].get(); + const uint32_t rs2 = operands[1].get(); + uint32_t out = (rs1 + rs2); + results[0] = RegisterValue(out, 4); + break; + } + case Opcode::RISCV_C_ADDI4SPN: + case Opcode::RISCV_C_ADDI16SP: + case Opcode::RISCV_C_ADDI: { + uint32_t out = (operands[0].get() + c_imm); + results[0] = RegisterValue(out, 4); + break; + } + //case Opcode::RISCV_C_ADDIW: + //case Opcode::RISCV_C_ADDW: + case Opcode::RISCV_C_AND: { + const uint32_t rs1 = operands[0].get(); + const uint32_t rs2 = operands[1].get(); + uint32_t out = (rs1 & rs2); + results[0] = RegisterValue(out, 4); + break; + } + case Opcode::RISCV_C_ANDI: { + const uint32_t rs1 = operands[0].get(); + uint32_t out = (rs1 & c_imm); + results[0] = RegisterValue(out, 4); + break; + } + case Opcode::RISCV_C_BEQZ: { + const uint32_t rs1 = operands[0].get(); + if (rs1 == 0) { + branchAddress_ = (uint32_t)(instructionAddress_ + c_imm); // Set LSB of result to 0 + branchTaken_ = true; + } else { + branchAddress_ = instructionAddress_ + 2; + branchTaken_ = false; + } + break; + } + case Opcode::RISCV_C_BNEZ: { + const uint32_t rs1 = operands[0].get(); + if (rs1 != 0) { + branchAddress_ = (uint32_t)(instructionAddress_ + c_imm); + branchTaken_ = true; + } else { + branchAddress_ = instructionAddress_ + 2; + branchTaken_ = false; + } + break; + } + case Opcode::RISCV_C_EBREAK: + break; + case Opcode::RISCV_C_FLD: + break; + case Opcode::RISCV_C_FLDSP: + break; + case Opcode::RISCV_C_FLW: + break; + case Opcode::RISCV_C_FLWSP: + break; + case Opcode::RISCV_C_FSD: + break; + case Opcode::RISCV_C_FSDSP: + break; + case Opcode::RISCV_C_FSW: + break; + case Opcode::RISCV_C_J: + case Opcode::RISCV_C_JAL: { + branchAddress_ = (uint32_t)(instructionAddress_ + c_imm); + branchTaken_ = true; + results[0] = RegisterValue(static_cast(instructionAddress_ + 2), 4); + break; + } + case Opcode::RISCV_C_JR: + case Opcode::RISCV_C_JALR: { + branchAddress_ = (operands[0].get()) & ~1;// Set LSB of result to 0 + branchTaken_ = true; + results[0] = RegisterValue(static_cast(instructionAddress_ + 2), 4); + break; + } + case Opcode::RISCV_C_LD: + case Opcode::RISCV_C_LDSP: + break; + case Opcode::RISCV_C_LI: { + uint32_t out = signExtendW(metadata.operands[1].imm); + results[0] = RegisterValue(static_cast(out), 4); + break; + } + case Opcode::RISCV_C_LUI: { + uint32_t out = signExtendW(metadata.operands[1].imm + << 12); // Shift into upper 20 bits + results[0] = 
RegisterValue(static_cast(out), 4); + break; + } + case Opcode::RISCV_C_LW: + case Opcode::RISCV_C_LWSP: { + results[0] = RegisterValue(bitExtend(memoryData[0].get(), 32), 4); + break; + } + case Opcode::RISCV_C_MV: { + results[0] = RegisterValue(operands[0].get(), 4); + break; + } + case Opcode::RISCV_C_NOP: + break; + case Opcode::RISCV_C_OR: { + const uint32_t rs1 = operands[0].get(); + const uint32_t rs2 = operands[1].get(); + uint32_t out = (rs1 | rs2); + results[0] = RegisterValue(out, 4); + break; + } + case Opcode::RISCV_C_SD: + break; + case Opcode::RISCV_C_SDSP: + break; + case Opcode::RISCV_C_SLLI: { + const int32_t rs1 = operands[0].get(); + const int32_t shamt = c_imm & 63; // Only use lowest 6 bits + int32_t out = (rs1 << shamt); + results[0] = RegisterValue(out, 4); + break; + } + case Opcode::RISCV_C_SRAI: { + const int32_t rs1 = operands[0].get(); + int32_t out = (rs1 >> (c_imm & 63)); + results[0] = RegisterValue(out, 4); + break; + } + case Opcode::RISCV_C_SRLI: { + const uint32_t rs1 = operands[0].get(); + uint32_t out = (rs1 >> (c_imm & 63)); + results[0] = RegisterValue(out, 4); + break; + } + case Opcode::RISCV_C_SUB: { + const uint32_t rs1 = operands[0].get(); + const uint32_t rs2 = operands[1].get(); + uint32_t out = (rs1 - rs2); + results[0] = RegisterValue(out, 4); + break; + } + case Opcode::RISCV_C_SUBW: + break; + case Opcode::RISCV_C_SW: + case Opcode::RISCV_C_FSWSP: + case Opcode::RISCV_C_SWSP: { + memoryData[0] = operands[0]; + break; + } + case Opcode::RISCV_C_UNIMP: + break; + case Opcode::RISCV_C_XOR: { + const uint32_t rs1 = operands[0].get(); + const uint32_t rs2 = operands[1].get(); + uint32_t out = (rs1 ^ rs2); + results[0] = RegisterValue(out, 4); + break; + } default: return executionNYI(); diff --git a/src/lib/models/emulation/Core.cc b/src/lib/models/emulation/Core.cc index 1d572ee160..0eff31d5a5 100644 --- a/src/lib/models/emulation/Core.cc +++ b/src/lib/models/emulation/Core.cc @@ -44,27 +44,6 @@ void Core::tick() { return; } - if (pendingReads_ > 0) { - // Handle pending reads to a uop - auto& uop = microOps_.front(); - - const auto& completedReads = dataMemory_.getCompletedReads(); - for (const auto& response : completedReads) { - assert(pendingReads_ > 0); - uop->supplyData(response.target.address, response.data); - pendingReads_--; - } - dataMemory_.clearCompletedReads(); - - if (pendingReads_ == 0) { - // Load complete: resume execution - execute(uop); - } - - // More data pending, end cycle early - return; - } - // Fetch // Determine if new uops are needed to be fetched @@ -130,7 +109,13 @@ void Core::tick() { previousAddresses_.push_back(target); } pendingReads_ = addresses.size(); - return; + const auto& completedReads = dataMemory_.getCompletedReads(); + for (const auto& response : completedReads) { + assert(pendingReads_ > 0); + uop->supplyData(response.target.address, response.data); + pendingReads_--; + } + dataMemory_.clearCompletedReads(); } else { // Early execution due to lacking addresses execute(uop); @@ -166,6 +151,8 @@ void Core::execute(std::shared_ptr& uop) { uop->execute(); if (uop->exceptionEncountered()) { + instructionsExecuted_++; + isa_.updateInstrTrace(uop, ®isterFileSet_, ticks_); // Handle ECALL into trace here handleException(uop); return; } @@ -197,7 +184,19 @@ void Core::execute(std::shared_ptr& uop) { } } - if (uop->isLastMicroOp()) instructionsExecuted_++; + if (uop->isLastMicroOp()) { + instructionsExecuted_++; + // TODO: This is architecture-specific. 
It's here for the reference and should(will) be refactored later + uint16_t sysreg_instrret = isa_.getSystemRegisterTag(arch::riscv::riscv_sysreg::SYSREG_INSTRRET); + uint16_t sysreg_cycle = isa_.getSystemRegisterTag(arch::riscv::riscv_sysreg::SYSREG_CYCLE); + // NOTE: 64-bit system registers are not implemented yet + //TODO: Maybe make use of byteLength and remove is32BitMode() function? + if (isa_.is32BitMode()) { + registerFileSet_.set(Register{0x2, sysreg_instrret}, RegisterValue(instructionsExecuted_, 4)); + registerFileSet_.set(Register{0x2, sysreg_cycle}, RegisterValue(ticks_, 4)); + } + isa_.updateInstrTrace(uop, ®isterFileSet_, ticks_); + } // Fetch memory for next cycle instructionMemory_.requestRead({pc_, FETCH_SIZE}); From 1e2ab32367f115e131a25de077a54f60eeb23671 Mon Sep 17 00:00:00 2001 From: dANW34V3R Date: Fri, 12 May 2023 14:01:07 +0100 Subject: [PATCH 2/5] Clang format --- src/include/simeng/models/emulation/Core.hh | 3 ++- .../simeng/pipeline/PipelineBuffer1.hh | 19 +++++++++------ src/lib/Elf.cc | 17 ++++++++------ src/lib/arch/riscv/ExceptionHandler.cc | 23 ++++++++++--------- src/lib/arch/riscv/InstructionMetadata.hh | 6 +---- src/lib/models/emulation/Core.cc | 20 ++++++++++------ 6 files changed, 50 insertions(+), 38 deletions(-) diff --git a/src/include/simeng/models/emulation/Core.hh b/src/include/simeng/models/emulation/Core.hh index c4a4acc453..2c94356d72 100644 --- a/src/include/simeng/models/emulation/Core.hh +++ b/src/include/simeng/models/emulation/Core.hh @@ -11,7 +11,8 @@ #include "simeng/arch/Architecture.hh" #include "simeng/span.hh" -// TODO: This is architecture-specific, need to be refactored later. See comments in Core.cc +// TODO: This is architecture-specific, need to be refactored later. See +// comments in Core.cc #include "simeng/arch/riscv/Architecture.hh" namespace simeng { diff --git a/src/include/simeng/pipeline/PipelineBuffer1.hh b/src/include/simeng/pipeline/PipelineBuffer1.hh index dd2ed70ce7..e677645fdf 100644 --- a/src/include/simeng/pipeline/PipelineBuffer1.hh +++ b/src/include/simeng/pipeline/PipelineBuffer1.hh @@ -15,13 +15,18 @@ class PipelineBuffer { /** Construct a pipeline buffer of width `width`, and fill all slots with * `initialValue`. */ PipelineBuffer(int width, const T& initialValue) - : width(width), buffer(width * defaultLength_, initialValue), - length_(defaultLength_), headIndex_(defaultLength_-1), + : width(width), + buffer(width * defaultLength_, initialValue), + length_(defaultLength_), + headIndex_(defaultLength_ - 1), tailIndex_(0) {} PipelineBuffer(int width, const T& initialValue, int length) - : width(width), buffer(width * length, initialValue), length_(length), - headIndex_(length_-1), tailIndex_(0) { + : width(width), + buffer(width * length, initialValue), + length_(length), + headIndex_(length_ - 1), + tailIndex_(0) { assert(length_ != 0 && "Pipeline buffer length cannot be 0"); } @@ -30,14 +35,14 @@ class PipelineBuffer { void tick() { if (isStalled_) return; - //length ==1 shortcut? condition check cost + // length ==1 shortcut? 
condition check cost - if (headIndex_) { // when headIndex != 0 + if (headIndex_) { // when headIndex != 0 headIndex_--; } else { headIndex_ = length_ - 1; } - if (tailIndex_) { // when tailIndex != 0 + if (tailIndex_) { // when tailIndex != 0 tailIndex_--; } else { tailIndex_ = length_ - 1; diff --git a/src/lib/Elf.cc b/src/lib/Elf.cc index 6281598403..be11a2c753 100644 --- a/src/lib/Elf.cc +++ b/src/lib/Elf.cc @@ -47,7 +47,8 @@ Elf::Elf(std::string path, char** imagePointer) { // Check whether this is a 32 or 64-bit executable char bitFormat; file.read(&bitFormat, sizeof(bitFormat)); - if (bitFormat != ElfBitFormat::Format32 && bitFormat != ElfBitFormat::Format64) { + if (bitFormat != ElfBitFormat::Format32 && + bitFormat != ElfBitFormat::Format64) { return; } @@ -92,7 +93,8 @@ Elf::Elf(std::string path, char** imagePointer) { // Seek to the byte representing header entry size. file.seekg(0x36); uint16_t headerEntrySize; - file.read(reinterpret_cast(&headerEntrySize), sizeof(headerEntrySize)); + file.read(reinterpret_cast(&headerEntrySize), + sizeof(headerEntrySize)); uint16_t headerEntries; file.read(reinterpret_cast(&headerEntries), sizeof(headerEntries)); @@ -168,8 +170,8 @@ Elf::Elf(std::string path, char** imagePointer) { for (const auto& header : headers_) { if (header.type == 1) { // LOAD file.seekg(header.offset); - // Read `fileSize` bytes from `file` into the appropriate place in process - // memory + // Read `fileSize` bytes from `file` into the appropriate place in + // process memory file.read(*imagePointer + header.virtualAddress, header.fileSize); } } @@ -210,7 +212,8 @@ Elf::Elf(std::string path, char** imagePointer) { // Seek to the byte representing header entry size. file.seekg(0x2a); uint16_t headerEntrySize; - file.read(reinterpret_cast(&headerEntrySize), sizeof(headerEntrySize)); + file.read(reinterpret_cast(&headerEntrySize), + sizeof(headerEntrySize)); uint16_t headerEntries; file.read(reinterpret_cast(&headerEntries), sizeof(headerEntries)); @@ -285,8 +288,8 @@ Elf::Elf(std::string path, char** imagePointer) { for (const auto& header : headers32_) { if (header.type == 1) { // LOAD file.seekg(header.offset); - // Read `fileSize` bytes from `file` into the appropriate place in process - // memory + // Read `fileSize` bytes from `file` into the appropriate place in + // process memory file.read(*imagePointer + header.virtualAddress, header.fileSize); } } diff --git a/src/lib/arch/riscv/ExceptionHandler.cc b/src/lib/arch/riscv/ExceptionHandler.cc index c88448048d..ffd7895233 100644 --- a/src/lib/arch/riscv/ExceptionHandler.cc +++ b/src/lib/arch/riscv/ExceptionHandler.cc @@ -98,7 +98,8 @@ bool ExceptionHandler::init() { case 57: { // close int64_t fd = registerFileSet.get(R0).get(); stateChange = {ChangeType::REPLACEMENT, {R0}}; - stateChange.modifiedRegisterValues.push_back(RegisterValue(linux_.close(fd), instruction_.getArchRegWidth())); + stateChange.modifiedRegisterValues.push_back( + RegisterValue(linux_.close(fd), instruction_.getArchRegWidth())); break; } case 61: { // getdents64 @@ -185,9 +186,9 @@ bool ExceptionHandler::init() { uint64_t count = registerFileSet.get(R2).get(); return readBufferThen(bufPtr, count, [=]() { int64_t retval = linux_.write(fd, dataBuffer.data(), count); - ProcessStateChange stateChange = { - ChangeType::REPLACEMENT, {R0}}; - stateChange.modifiedRegisterValues.push_back(RegisterValue(retval, instruction_.getArchRegWidth())); + ProcessStateChange stateChange = {ChangeType::REPLACEMENT, {R0}}; + 
stateChange.modifiedRegisterValues.push_back( + RegisterValue(retval, instruction_.getArchRegWidth())); return concludeSyscall(stateChange); }); } @@ -355,9 +356,9 @@ bool ExceptionHandler::init() { uint64_t statbufPtr = registerFileSet.get(R1).get(); kernel::stat statOut; - stateChange = { - ChangeType::REPLACEMENT, {R0}}; - stateChange.modifiedRegisterValues.push_back(RegisterValue(linux_.fstat(fd, statOut), instruction_.getArchRegWidth())); + stateChange = {ChangeType::REPLACEMENT, {R0}}; + stateChange.modifiedRegisterValues.push_back(RegisterValue( + linux_.fstat(fd, statOut), instruction_.getArchRegWidth())); stateChange.memoryAddresses.push_back({statbufPtr, sizeof(statOut)}); stateChange.memoryAddressValues.push_back(statOut); break; @@ -556,9 +557,9 @@ bool ExceptionHandler::init() { } case 214: { // brk auto result = linux_.brk(registerFileSet.get(R0).get()); - stateChange = { - ChangeType::REPLACEMENT, {R0}}; - stateChange.modifiedRegisterValues.push_back(RegisterValue(static_cast(result), instruction_.getArchRegWidth())); + stateChange = {ChangeType::REPLACEMENT, {R0}}; + stateChange.modifiedRegisterValues.push_back(RegisterValue( + static_cast(result), instruction_.getArchRegWidth())); break; } case 215: { // munmap @@ -844,7 +845,7 @@ void ExceptionHandler::printException(const Instruction& insn) const { auto& metadata = insn.getMetadata(); for (int8_t i = metadata.lenBytes; i > 0; i--) { std::cout << std::setfill('0') << std::setw(2) - << static_cast(metadata.encoding[i-1]); + << static_cast(metadata.encoding[i - 1]); } std::cout << std::dec << " "; if (exception == InstructionException::EncodingUnallocated) { diff --git a/src/lib/arch/riscv/InstructionMetadata.hh b/src/lib/arch/riscv/InstructionMetadata.hh index 4ce164a346..796afc96c2 100644 --- a/src/lib/arch/riscv/InstructionMetadata.hh +++ b/src/lib/arch/riscv/InstructionMetadata.hh @@ -14,11 +14,7 @@ namespace Opcode { #include "RISCVGenInstrInfo.inc" } // namespace Opcode -enum INSTR_LENGTH { - IL_16B, - IL_32B, - IL_INVALID -}; +enum INSTR_LENGTH { IL_16B, IL_32B, IL_INVALID }; /** A simplified RISC-V-only version of the Capstone instruction structure. */ struct InstructionMetadata { diff --git a/src/lib/models/emulation/Core.cc b/src/lib/models/emulation/Core.cc index 0eff31d5a5..6357c898d3 100644 --- a/src/lib/models/emulation/Core.cc +++ b/src/lib/models/emulation/Core.cc @@ -152,7 +152,8 @@ void Core::execute(std::shared_ptr& uop) { if (uop->exceptionEncountered()) { instructionsExecuted_++; - isa_.updateInstrTrace(uop, ®isterFileSet_, ticks_); // Handle ECALL into trace here + isa_.updateInstrTrace(uop, ®isterFileSet_, + ticks_); // Handle ECALL into trace here handleException(uop); return; } @@ -186,14 +187,19 @@ void Core::execute(std::shared_ptr& uop) { if (uop->isLastMicroOp()) { instructionsExecuted_++; - // TODO: This is architecture-specific. It's here for the reference and should(will) be refactored later - uint16_t sysreg_instrret = isa_.getSystemRegisterTag(arch::riscv::riscv_sysreg::SYSREG_INSTRRET); - uint16_t sysreg_cycle = isa_.getSystemRegisterTag(arch::riscv::riscv_sysreg::SYSREG_CYCLE); + // TODO: This is architecture-specific. 
It's here for the reference and + // should(will) be refactored later + uint16_t sysreg_instrret = + isa_.getSystemRegisterTag(arch::riscv::riscv_sysreg::SYSREG_INSTRRET); + uint16_t sysreg_cycle = + isa_.getSystemRegisterTag(arch::riscv::riscv_sysreg::SYSREG_CYCLE); // NOTE: 64-bit system registers are not implemented yet - //TODO: Maybe make use of byteLength and remove is32BitMode() function? + // TODO: Maybe make use of byteLength and remove is32BitMode() function? if (isa_.is32BitMode()) { - registerFileSet_.set(Register{0x2, sysreg_instrret}, RegisterValue(instructionsExecuted_, 4)); - registerFileSet_.set(Register{0x2, sysreg_cycle}, RegisterValue(ticks_, 4)); + registerFileSet_.set(Register{0x2, sysreg_instrret}, + RegisterValue(instructionsExecuted_, 4)); + registerFileSet_.set(Register{0x2, sysreg_cycle}, + RegisterValue(ticks_, 4)); } isa_.updateInstrTrace(uop, ®isterFileSet_, ticks_); } From 20e5236c95f8cfad63556833d4a35cc35918fb8a Mon Sep 17 00:00:00 2001 From: dANW34V3R Date: Mon, 15 May 2023 15:49:04 +0100 Subject: [PATCH 3/5] Add Trace config option to node checker --- configs/DEMO_RISCV.yaml | 1 + src/lib/ModelConfig.cc | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/configs/DEMO_RISCV.yaml b/configs/DEMO_RISCV.yaml index e5a11d3c54..d00531d2a2 100644 --- a/configs/DEMO_RISCV.yaml +++ b/configs/DEMO_RISCV.yaml @@ -6,6 +6,7 @@ Core: ISA: rv64 Simulation-Mode: outoforder + Trace: false Clock-Frequency: 2.5 Fetch-Block-Size: 32 Fetch: diff --git a/src/lib/ModelConfig.cc b/src/lib/ModelConfig.cc index 88cc1f7d59..ee804d9afb 100644 --- a/src/lib/ModelConfig.cc +++ b/src/lib/ModelConfig.cc @@ -64,10 +64,12 @@ void ModelConfig::validate() { "Timer-Frequency", "Micro-Operations", "Vector-Length", - "Streaming-Vector-Length"}; + "Streaming-Vector-Length", + "Trace"}; validISA = nodeChecker( configFile_[root][subFields[0]], subFields[0], - std::vector({"AArch64", "rv64", "rv32"}), ExpectedValue::String); + std::vector({"AArch64", "rv64", "rv32"}), + ExpectedValue::String); nodeChecker(configFile_[root][subFields[1]], subFields[1], {"emulation", "inorderpipelined", "outoforder"}, ExpectedValue::String); @@ -86,6 +88,8 @@ void ModelConfig::validate() { {128, 256, 384, 512, 640, 768, 896, 1024, 1152, 1280, 1408, 1536, 1664, 1792, 1920, 2048}, ExpectedValue::UInteger, 512); + nodeChecker(configFile_[root][subFields[7]], subFields[7], + std::vector{false, true}, ExpectedValue::Bool, false); subFields.clear(); // First check that the ISA config option is valid, this protects reads from From 6f0c692ab0719ea508ee0b648e9f9860aecb13a6 Mon Sep 17 00:00:00 2001 From: dANW34V3R Date: Wed, 6 Sep 2023 11:44:35 +0100 Subject: [PATCH 4/5] MCU patch --- CMakeLists.txt | 4 +- Makefile | 33 + README_RV32.md | 18 + configs/DEMO_RISCV.yaml | 5 +- configs/DEMO_RISCV32_mcu.yaml | 145 + configs/DEMO_RISCV32_mcu_sst.yaml | 145 + share_ext/share_sample_mcu_model.patch | 6824 +++++++++++++++++ src/include/simeng/BranchPredictor.hh | 5 + src/include/simeng/CoreInstance.hh | 3 +- src/include/simeng/Elf.hh | 93 +- src/include/simeng/GenericPredictor.hh | 3 + src/include/simeng/Instruction.hh | 23 +- src/include/simeng/arch/Architecture.hh | 5 +- .../simeng/arch/aarch64/Architecture.hh | 5 +- .../simeng/arch/aarch64/Instruction.hh | 2 +- src/include/simeng/arch/riscv/Architecture.hh | 41 +- .../simeng/arch/riscv/ExceptionHandler.hh | 3 + src/include/simeng/arch/riscv/Instruction.hh | 25 +- .../simeng/arch/riscv/SystemRegister.hh | 229 + src/include/simeng/kernel/Linux.hh | 5 + 
src/include/simeng/kernel/LinuxProcess.hh | 5 + src/include/simeng/models/emulation/Core.hh | 3 + src/include/simeng/models/mcu/Core.hh | 181 + src/include/simeng/pipeline_hi/DecodeUnit.hh | 66 + .../simeng/pipeline_hi/DispatchIssueUnit.hh | 150 + src/include/simeng/pipeline_hi/ExecuteUnit.hh | 147 + src/include/simeng/pipeline_hi/FetchUnit.hh | 127 + .../simeng/pipeline_hi/LoadStoreQueue.hh | 235 + .../simeng/pipeline_hi/PipelineBuffer.hh | 107 + .../simeng/pipeline_hi/PipelineBuffer1.hh | 133 + .../simeng/pipeline_hi/PortAllocator.hh | 43 + src/include/simeng/pipeline_hi/RegDepMap.hh | 57 + .../simeng/pipeline_hi/RegisterAliasTable.hh | 69 + .../simeng/pipeline_hi/ReorderBuffer.hh | 136 + .../simeng/pipeline_hi/StaticPredictor.hh | 53 + .../simeng/pipeline_hi/WritebackUnit.hh | 62 + src/lib/CMakeLists.txt | 12 + src/lib/CoreInstance.cc | 13 +- src/lib/Elf.cc | 161 +- src/lib/GenericPredictor.cc | 7 + src/lib/Instruction.cc | 3 + src/lib/ModelConfig.cc | 2 +- src/lib/arch/aarch64/Architecture.cc | 6 +- src/lib/arch/aarch64/Instruction.cc | 4 +- src/lib/arch/aarch64/Instruction_decode.cc | 10 +- src/lib/arch/riscv/Architecture.cc | 78 +- src/lib/arch/riscv/ExceptionHandler.cc | 56 +- src/lib/arch/riscv/Instruction.cc | 6 +- src/lib/arch/riscv/InstructionMetadata.cc | 5 +- src/lib/arch/riscv/Instruction_decode.cc | 78 +- src/lib/arch/riscv/Instruction_execute.cc | 85 +- src/lib/arch/riscv/SystemRegister.cc | 124 + src/lib/kernel/Linux.cc | 10 +- src/lib/kernel/LinuxProcess.cc | 14 +- src/lib/models/emulation/Core.cc | 12 +- src/lib/models/mcu/Core.cc | 515 ++ src/lib/pipeline/FetchUnit.cc | 2 +- src/lib/pipeline_hi/DecodeUnit.cc | 117 + src/lib/pipeline_hi/DispatchIssueUnit.cc | 269 + src/lib/pipeline_hi/ExecuteUnit.cc | 255 + src/lib/pipeline_hi/FetchUnit.cc | 265 + src/lib/pipeline_hi/LoadStoreQueue.cc | 315 + src/lib/pipeline_hi/RegDepMap.cc | 143 + src/lib/pipeline_hi/RegisterAliasTable.cc | 110 + src/lib/pipeline_hi/ReorderBuffer.cc | 206 + src/lib/pipeline_hi/StaticPredictor.cc | 120 + src/lib/pipeline_hi/WritebackUnit.cc | 74 + src/tools/simeng/main.cc | 4 +- sst/SimEngCoreWrapper.cc | 94 +- sst/SimEngMemInterface.cc | 13 +- sst/config/mcu_int_example_config.py | 74 + sst/include/SimEngCoreWrapper.hh | 4 +- sst/include/SimEngMemInterface.hh | 2 +- 73 files changed, 12224 insertions(+), 234 deletions(-) create mode 100644 Makefile create mode 100644 configs/DEMO_RISCV32_mcu.yaml create mode 100644 configs/DEMO_RISCV32_mcu_sst.yaml create mode 100644 share_ext/share_sample_mcu_model.patch create mode 100644 src/include/simeng/arch/riscv/SystemRegister.hh create mode 100644 src/include/simeng/models/mcu/Core.hh create mode 100644 src/include/simeng/pipeline_hi/DecodeUnit.hh create mode 100644 src/include/simeng/pipeline_hi/DispatchIssueUnit.hh create mode 100644 src/include/simeng/pipeline_hi/ExecuteUnit.hh create mode 100644 src/include/simeng/pipeline_hi/FetchUnit.hh create mode 100644 src/include/simeng/pipeline_hi/LoadStoreQueue.hh create mode 100644 src/include/simeng/pipeline_hi/PipelineBuffer.hh create mode 100644 src/include/simeng/pipeline_hi/PipelineBuffer1.hh create mode 100644 src/include/simeng/pipeline_hi/PortAllocator.hh create mode 100644 src/include/simeng/pipeline_hi/RegDepMap.hh create mode 100644 src/include/simeng/pipeline_hi/RegisterAliasTable.hh create mode 100644 src/include/simeng/pipeline_hi/ReorderBuffer.hh create mode 100644 src/include/simeng/pipeline_hi/StaticPredictor.hh create mode 100644 src/include/simeng/pipeline_hi/WritebackUnit.hh create mode 100644 
src/lib/arch/riscv/SystemRegister.cc create mode 100644 src/lib/models/mcu/Core.cc create mode 100644 src/lib/pipeline_hi/DecodeUnit.cc create mode 100644 src/lib/pipeline_hi/DispatchIssueUnit.cc create mode 100644 src/lib/pipeline_hi/ExecuteUnit.cc create mode 100644 src/lib/pipeline_hi/FetchUnit.cc create mode 100644 src/lib/pipeline_hi/LoadStoreQueue.cc create mode 100644 src/lib/pipeline_hi/RegDepMap.cc create mode 100644 src/lib/pipeline_hi/RegisterAliasTable.cc create mode 100644 src/lib/pipeline_hi/ReorderBuffer.cc create mode 100644 src/lib/pipeline_hi/StaticPredictor.cc create mode 100644 src/lib/pipeline_hi/WritebackUnit.cc create mode 100644 sst/config/mcu_int_example_config.py diff --git a/CMakeLists.txt b/CMakeLists.txt index ccbc9074a0..0a95e01796 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,8 +50,8 @@ FetchContent_Declare( FetchContent_Declare( capstone-lib GIT_REPOSITORY https://github.com/UoB-HPC/capstone.git - GIT_TAG next - GIT_PROGRESS TRUE + GIT_TAG next + GIT_PROGRESS TRUE # Old Git tag pre-Armv9.2 # GIT_TAG e7be7d99e718ef9741026b80fc6f5e100fdf4f94 # trunk diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000..0029c8cc61 --- /dev/null +++ b/Makefile @@ -0,0 +1,33 @@ +# Helper setup to build simeng binary + +NPROC ?= 4 +CMAKE ?= /data/tools/cmake/cmake-3.21.0-linux-x86_64/bin/cmake +BUILD_DIR ?= build +TYPE ?= Debug +INSTALLDIR ?= $(shell pwd)/install +TEST_FLAG ?= OFF +SST_FLAG ?= ON +SST_CORE_INSTALLDIR ?= $(SST_CORE_HOME) + +all: configure build install + +configure: clean + $(CMAKE) -B $(BUILD_DIR) -S . -DCMAKE_BUILD_TYPE=$(TYPE) -DCMAKE_INSTALL_PREFIX=$(INSTALLDIR) -DSIMENG_ENABLE_TESTS=$(TEST_FLAG) -DSIMENG_USE_EXTERNAL_LLVM=ON -DSIMENG_ENABLE_SST=$(SST_FLAG) -DSST_INSTALL_DIR=$(SST_CORE_INSTALLDIR) -DLLVM_DIR=/usr/lib/llvm-12/lib/ + +build: + $(CMAKE) --build $(BUILD_DIR) -j $(NPROC) + +test: + $(CMAKE) --build $(BUILD_DIR) -j $(NPROC) --target test + +install: + $(CMAKE) --build $(BUILD_DIR) -j $(NPROC) --target install + +run_sst_example: + sst sst/config/eacf_int_example_config.py + +clean: + rm -rf build + +#.PHONY : all configure build test install run_sst_example clean +.PHONY : * diff --git a/README_RV32.md b/README_RV32.md index fe5942068e..f9587791c2 100644 --- a/README_RV32.md +++ b/README_RV32.md @@ -11,3 +11,21 @@ - Added an alternative implementation of pipeline buffer with variable latency support. - Supports 0 delay that is benefitial for merging pipeline stages if required. - Supports more than 1 cycle delay between pipeline stages. 
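The pipeline-buffer bullets above only describe the behaviour; the variable-latency buffer implementation itself (PipelineBuffer1) is not shown in this excerpt. As a minimal illustrative sketch only (a simple tick-based model; the class name, members and methods below are hypothetical and are not SimEng's PipelineBuffer1 API), a buffer with a configurable visible latency can keep in-flight entries with a per-entry countdown:

```cpp
#include <cstdint>
#include <deque>
#include <utility>

// Illustrative sketch: a pipeline buffer whose visible latency is configurable.
// Latency 0 makes a write visible in the same cycle (useful when merging two
// stages); latency N > 1 models a longer gap between pipeline stages.
template <typename T>
class VariableLatencyBuffer {
 public:
  VariableLatencyBuffer(uint64_t latency, const T& initial)
      : latency_(latency), head_(initial) {}

  // Producer side: submit a value during the current cycle.
  void write(const T& value) {
    if (latency_ == 0) {
      head_ = value;  // Zero delay: the consumer sees the value this cycle.
    } else {
      inFlight_.push_back({value, latency_});
    }
  }

  // Consumer side: the value currently visible at the head of the buffer.
  const T& read() const { return head_; }

  // Advance one cycle: age all in-flight entries and promote those that mature.
  void tick() {
    for (auto& entry : inFlight_) {
      --entry.second;
    }
    while (!inFlight_.empty() && inFlight_.front().second == 0) {
      head_ = inFlight_.front().first;
      inFlight_.pop_front();
    }
  }

 private:
  uint64_t latency_;  // Cycles before a written value becomes visible.
  T head_;            // Value currently visible to the consumer.
  std::deque<std::pair<T, uint64_t>> inFlight_;  // Pending writes + remaining delay.
};
```

In this sketch the zero-latency configuration bypasses the in-flight queue entirely, which is what makes merging two adjacent stages effectively free, while any latency greater than one simply leaves entries queued for more ticks.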
+
+# SimEng Update to share the sample implementation of the MicroController (MCU) class core model using 32-bit RISC-V ISA
+- Small MCU-like three-stage pipeline core model
+- Additional fixed memory support for the LSU in the MCU core
+- Some updates to the ELF loader and to SST image loading into SST memory
+- Makefile to build and run
+- Added support for memory-mapped system registers
+- Used to add a HostTargetInterface for I/O and termination so that spike binaries can run on SimEng
+- Added interrupt support
+- Fixed csrc handling
+- Fixed the 32-bit sltiu instruction
+- Fixed the 32-bit mulh, mulhu and mulhsu instructions
+- Added support for interrupts by flushing the pipe at the execution stage when an interrupt is visible; fixed the iteration count being an int in main.cc
+- Some bug fixes
+
+# Capstone change required for RV32 compressed instruction usage in file include/capstone/capstone.h
+CS_MODE_RISV32GC = CS_MODE_RISCV32 | CS_MODE_RISCVC, ///< RISCV RV32GC
+-
diff --git a/configs/DEMO_RISCV.yaml b/configs/DEMO_RISCV.yaml
index e5a11d3c54..15b61ed5e2 100644
--- a/configs/DEMO_RISCV.yaml
+++ b/configs/DEMO_RISCV.yaml
@@ -5,9 +5,10 @@
 Core:
 ISA: rv64
- Simulation-Mode: outoforder
+ Simulation-Mode: emulation
 Clock-Frequency: 2.5
 Fetch-Block-Size: 32
+ Trace: True
 Fetch:
 Fetch-Block-Size: 32
 Loop-Buffer-Size: 0
@@ -36,7 +37,7 @@ Branch-Predictor:
 Branch-Predictor:
 BTB-bitlength: 16
 L1-Data-Memory:
- Interface-Type: Fixed
+ Interface-Type: Flat
 L1-Instruction-Memory:
 Interface-Type: Flat
 LSQ-L1-Interface:
diff --git a/configs/DEMO_RISCV32_mcu.yaml b/configs/DEMO_RISCV32_mcu.yaml
new file mode 100644
index 0000000000..2e7983e178
--- /dev/null
+++ b/configs/DEMO_RISCV32_mcu.yaml
@@ -0,0 +1,145 @@
+---
+# The following resources where utilised to create the config file and naming schemes:
+# https://en.wikichip.org/wiki/cavium/microarchitectures/vulcan
+
+Core:
+ ISA: rv32
+ Simulation-Mode: mcu
+ Clock-Frequency: 2.5
+ Fetch-Block-Size: 32
+ Trace: True
+ EnableHaltCheck: True
+ MaxStallCycleTimeout: 10000
+ MaxSimCycleTimeout: 1000000000
+ MaxInstrTimeout: 1000000000
+Fetch:
+ Fetch-Block-Size: 32
+ Loop-Buffer-Size: 0
+ Loop-Detection-Threshold: 0
+Process-Image:
+ Heap-Size: 1073741824
+ Stack-Size: 1048576
+Register-Set:
+ GeneralPurpose-Count: 32
+ FloatingPoint-Count: 32
+Pipeline-Widths:
+ Commit: 4
+ Dispatch-Rate: 4
+ FrontEnd: 4
+ LSQ-Completion: 2
+Queue-Sizes:
+ ROB: 180
+ Load: 64
+ Store: 36
+Branch-Predictor:
+ BTB-Tag-Bits: 11
+ Saturating-Count-Bits: 2
+ Global-History-Length: 10
+ RAS-entries: 1 # need change; tmp solution: staticPred header file
+ Fallback-Static-Predictor: "Always-Taken"
+ BTB-bitlength: 16
+ Static-Type: "Always-Taken"
+L1-Data-Memory:
+ Interface-Type: Fixed
+L1-Instruction-Memory:
+ Interface-Type: Flat
+LSQ-L1-Interface:
+ Access-Latency: 1
+ Exclusive: False
+ Load-Bandwidth: 32
+ Store-Bandwidth: 16
+ Permitted-Requests-Per-Cycle: 2
+ Permitted-Loads-Per-Cycle: 2
+ Permitted-Stores-Per-Cycle: 1
+Ports:
+ 0:
+ Portname: Port 0
+ Instruction-Support:
+ - INT_SIMPLE
+ - INT_MUL
+ 1:
+ Portname: Port 1
+ Instruction-Support:
+ - INT
+ 2:
+ Portname: Port 2
+ Instruction-Support:
+ - INT_SIMPLE
+ - INT_MUL
+ - BRANCH
+ 3:
+ Portname: Port 4
+ Instruction-Support:
+ - LOAD
+ 4:
+ Portname: Port 5
+ Instruction-Support:
+ - LOAD
+ 5:
+ Portname: Port 3
+ Instruction-Support:
+ - STORE
+Reservation-Stations:
+ 0:
+ Size: 60
+ Dispatch-Rate: 4
+ Ports:
+ - Port 0
+ - Port 1
+ - Port 2
+ - Port 4
+ - Port 5
+ - Port 3
+Execution-Units:
+ 0:
+ Pipelined: True
+ 1:
+ Pipelined: True
+ 2: + Pipelined: True + 3: + Pipelined: True + 4: + Pipelined: True + 5: + Pipelined: True +Latencies: + 0: + Instruction-Groups: + - INT_SIMPLE_ARTH + - INT_SIMPLE_LOGICAL + Execution-Latency: 1 + Execution-Throughput: 1 + 1: + Instruction-Groups: + - INT_MUL + Execution-Latency: 1 + Execution-Throughput: 1 + 2: + Instruction-Groups: + - INT_DIV + Execution-Latency: 4 + Execution-Throughput: 4 +# CPU-Info mainly used to generate a replica of the special (or system) file directory +# structure +CPU-Info: + # Set Generate-Special-Dir to 'T' to generate the special files directory, or to 'F' to not. + # (Not generating the special files directory may require the user to copy over files manually) + Generate-Special-Dir: true + # Core-Count MUST be 1 as multi-core is not supported at this time. (TX2 true value is 32) + Core-Count: 1 + # Socket-Count MUST be 1 as multi-socket simulations are not supported at this time. (TX2 true value is 2) + Socket-Count: 1 + # SMT MUST be 1 as Simultanious-Multi-Threading is not supported at this time. (TX2 true value is 4) + SMT: 1 + # Below are the values needed to generate /proc/cpuinfo + BogoMIPS: 400.00 + Features: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics cpuid asimdrdm + CPU-Implementer: "0x43" + CPU-Architecture: 8 + CPU-Variant: "0x1" + CPU-Part: "0x0af" + CPU-Revision: 2 + # Package-Count is used to generate + # /sys/devices/system/cpu/cpu{0..Core-Count}/topology/{physical_package_id, core_id} + Package-Count: 1 \ No newline at end of file diff --git a/configs/DEMO_RISCV32_mcu_sst.yaml b/configs/DEMO_RISCV32_mcu_sst.yaml new file mode 100644 index 0000000000..42e4c5d87f --- /dev/null +++ b/configs/DEMO_RISCV32_mcu_sst.yaml @@ -0,0 +1,145 @@ +--- +# The following resources where utilised to create the config file and naming schemes: +# https://en.wikichip.org/wiki/cavium/microarchitectures/vulcan + +Core: + ISA: rv32 + Simulation-Mode: mcu + Clock-Frequency: 2.5 + Fetch-Block-Size: 32 + Trace: True + EnableHaltCheck: False + MaxStallCycleTimeout: 10000 + MaxSimCycleTimeout: 1000000000 + MaxInstrTimeout: 1000000000 +Fetch: + Fetch-Block-Size: 32 + Loop-Buffer-Size: 0 + Loop-Detection-Threshold: 0 +Process-Image: + Heap-Size: 1073741824 + Stack-Size: 1048576 +Register-Set: + GeneralPurpose-Count: 32 + FloatingPoint-Count: 32 +Pipeline-Widths: + Commit: 4 + Dispatch-Rate: 4 + FrontEnd: 4 + LSQ-Completion: 2 +Queue-Sizes: + ROB: 180 + Load: 64 + Store: 36 +Branch-Predictor: + BTB-Tag-Bits: 11 + Saturating-Count-Bits: 2 + Global-History-Length: 10 + RAS-entries: 1 # need change; tmp solution: staticPred header file + Fallback-Static-Predictor: "Always-Taken" + BTB-bitlength: 16 + Static-Type: "Always-Taken" +L1-Data-Memory: + Interface-Type: External +L1-Instruction-Memory: + Interface-Type: Flat +LSQ-L1-Interface: + Access-Latency: 1 + Exclusive: False + Load-Bandwidth: 32 + Store-Bandwidth: 16 + Permitted-Requests-Per-Cycle: 2 + Permitted-Loads-Per-Cycle: 2 + Permitted-Stores-Per-Cycle: 1 +Ports: + 0: + Portname: Port 0 + Instruction-Support: + - INT_SIMPLE + - INT_MUL + 1: + Portname: Port 1 + Instruction-Support: + - INT + 2: + Portname: Port 2 + Instruction-Support: + - INT_SIMPLE + - INT_MUL + - BRANCH + 3: + Portname: Port 4 + Instruction-Support: + - LOAD + 4: + Portname: Port 5 + Instruction-Support: + - LOAD + 5: + Portname: Port 3 + Instruction-Support: + - STORE +Reservation-Stations: + 0: + Size: 60 + Dispatch-Rate: 4 + Ports: + - Port 0 + - Port 1 + - Port 2 + - Port 4 + - Port 5 + - Port 3 +Execution-Units: + 0: + Pipelined: True 
+ 1: + Pipelined: True + 2: + Pipelined: True + 3: + Pipelined: True + 4: + Pipelined: True + 5: + Pipelined: True +Latencies: + 0: + Instruction-Groups: + - INT_SIMPLE_ARTH + - INT_SIMPLE_LOGICAL + Execution-Latency: 1 + Execution-Throughput: 1 + 1: + Instruction-Groups: + - INT_MUL + Execution-Latency: 1 + Execution-Throughput: 1 + 2: + Instruction-Groups: + - INT_DIV + Execution-Latency: 4 + Execution-Throughput: 4 +# CPU-Info mainly used to generate a replica of the special (or system) file directory +# structure +CPU-Info: + # Set Generate-Special-Dir to 'T' to generate the special files directory, or to 'F' to not. + # (Not generating the special files directory may require the user to copy over files manually) + Generate-Special-Dir: true + # Core-Count MUST be 1 as multi-core is not supported at this time. (TX2 true value is 32) + Core-Count: 1 + # Socket-Count MUST be 1 as multi-socket simulations are not supported at this time. (TX2 true value is 2) + Socket-Count: 1 + # SMT MUST be 1 as Simultanious-Multi-Threading is not supported at this time. (TX2 true value is 4) + SMT: 1 + # Below are the values needed to generate /proc/cpuinfo + BogoMIPS: 400.00 + Features: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics cpuid asimdrdm + CPU-Implementer: "0x43" + CPU-Architecture: 8 + CPU-Variant: "0x1" + CPU-Part: "0x0af" + CPU-Revision: 2 + # Package-Count is used to generate + # /sys/devices/system/cpu/cpu{0..Core-Count}/topology/{physical_package_id, core_id} + Package-Count: 1 diff --git a/share_ext/share_sample_mcu_model.patch b/share_ext/share_sample_mcu_model.patch new file mode 100644 index 0000000000..f6cc3acb43 --- /dev/null +++ b/share_ext/share_sample_mcu_model.patch @@ -0,0 +1,6824 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index ccbc9074..0a95e017 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -50,8 +50,8 @@ FetchContent_Declare( + FetchContent_Declare( + capstone-lib + GIT_REPOSITORY https://github.com/UoB-HPC/capstone.git +- GIT_TAG next +- GIT_PROGRESS TRUE ++ GIT_TAG next ++ GIT_PROGRESS TRUE + + # Old Git tag pre-Armv9.2 + # GIT_TAG e7be7d99e718ef9741026b80fc6f5e100fdf4f94 # trunk +diff --git a/Makefile b/Makefile +new file mode 100644 +index 00000000..0029c8cc +--- /dev/null ++++ b/Makefile +@@ -0,0 +1,33 @@ ++# Helper setup to build simeng binary ++ ++NPROC ?= 4 ++CMAKE ?= /data/tools/cmake/cmake-3.21.0-linux-x86_64/bin/cmake ++BUILD_DIR ?= build ++TYPE ?= Debug ++INSTALLDIR ?= $(shell pwd)/install ++TEST_FLAG ?= OFF ++SST_FLAG ?= ON ++SST_CORE_INSTALLDIR ?= $(SST_CORE_HOME) ++ ++all: configure build install ++ ++configure: clean ++ $(CMAKE) -B $(BUILD_DIR) -S . 
-DCMAKE_BUILD_TYPE=$(TYPE) -DCMAKE_INSTALL_PREFIX=$(INSTALLDIR) -DSIMENG_ENABLE_TESTS=$(TEST_FLAG) -DSIMENG_USE_EXTERNAL_LLVM=ON -DSIMENG_ENABLE_SST=$(SST_FLAG) -DSST_INSTALL_DIR=$(SST_CORE_INSTALLDIR) -DLLVM_DIR=/usr/lib/llvm-12/lib/ ++ ++build: ++ $(CMAKE) --build $(BUILD_DIR) -j $(NPROC) ++ ++test: ++ $(CMAKE) --build $(BUILD_DIR) -j $(NPROC) --target test ++ ++install: ++ $(CMAKE) --build $(BUILD_DIR) -j $(NPROC) --target install ++ ++run_sst_example: ++ sst sst/config/eacf_int_example_config.py ++ ++clean: ++ rm -rf build ++ ++#.PHONY : all configure build test install run_sst_example clean ++.PHONY : * +diff --git a/configs/DEMO_RISCV.yaml b/configs/DEMO_RISCV.yaml +index e5a11d3c..15b61ed5 100644 +--- a/configs/DEMO_RISCV.yaml ++++ b/configs/DEMO_RISCV.yaml +@@ -5,9 +5,10 @@ + + Core: + ISA: rv64 +- Simulation-Mode: outoforder ++ Simulation-Mode: emulation + Clock-Frequency: 2.5 + Fetch-Block-Size: 32 ++ Trace: True + Fetch: + Fetch-Block-Size: 32 + Loop-Buffer-Size: 0 +@@ -36,7 +37,7 @@ Branch-Predictor: + Branch-Predictor: + BTB-bitlength: 16 + L1-Data-Memory: +- Interface-Type: Fixed ++ Interface-Type: Flat + L1-Instruction-Memory: + Interface-Type: Flat + LSQ-L1-Interface: +diff --git a/configs/DEMO_RISCV32_mcu.yaml b/configs/DEMO_RISCV32_mcu.yaml +new file mode 100644 +index 00000000..2e7983e1 +--- /dev/null ++++ b/configs/DEMO_RISCV32_mcu.yaml +@@ -0,0 +1,145 @@ ++--- ++# The following resources where utilised to create the config file and naming schemes: ++# https://en.wikichip.org/wiki/cavium/microarchitectures/vulcan ++ ++Core: ++ ISA: rv32 ++ Simulation-Mode: mcu ++ Clock-Frequency: 2.5 ++ Fetch-Block-Size: 32 ++ Trace: True ++ EnableHaltCheck: True ++ MaxStallCycleTimeout: 10000 ++ MaxSimCycleTimeout: 1000000000 ++ MaxInstrTimeout: 1000000000 ++Fetch: ++ Fetch-Block-Size: 32 ++ Loop-Buffer-Size: 0 ++ Loop-Detection-Threshold: 0 ++Process-Image: ++ Heap-Size: 1073741824 ++ Stack-Size: 1048576 ++Register-Set: ++ GeneralPurpose-Count: 32 ++ FloatingPoint-Count: 32 ++Pipeline-Widths: ++ Commit: 4 ++ Dispatch-Rate: 4 ++ FrontEnd: 4 ++ LSQ-Completion: 2 ++Queue-Sizes: ++ ROB: 180 ++ Load: 64 ++ Store: 36 ++Branch-Predictor: ++ BTB-Tag-Bits: 11 ++ Saturating-Count-Bits: 2 ++ Global-History-Length: 10 ++ RAS-entries: 1 # need change; tmp solution: staticPred header file ++ Fallback-Static-Predictor: "Always-Taken" ++ BTB-bitlength: 16 ++ Static-Type: "Always-Taken" ++L1-Data-Memory: ++ Interface-Type: Fixed ++L1-Instruction-Memory: ++ Interface-Type: Flat ++LSQ-L1-Interface: ++ Access-Latency: 1 ++ Exclusive: False ++ Load-Bandwidth: 32 ++ Store-Bandwidth: 16 ++ Permitted-Requests-Per-Cycle: 2 ++ Permitted-Loads-Per-Cycle: 2 ++ Permitted-Stores-Per-Cycle: 1 ++Ports: ++ 0: ++ Portname: Port 0 ++ Instruction-Support: ++ - INT_SIMPLE ++ - INT_MUL ++ 1: ++ Portname: Port 1 ++ Instruction-Support: ++ - INT ++ 2: ++ Portname: Port 2 ++ Instruction-Support: ++ - INT_SIMPLE ++ - INT_MUL ++ - BRANCH ++ 3: ++ Portname: Port 4 ++ Instruction-Support: ++ - LOAD ++ 4: ++ Portname: Port 5 ++ Instruction-Support: ++ - LOAD ++ 5: ++ Portname: Port 3 ++ Instruction-Support: ++ - STORE ++Reservation-Stations: ++ 0: ++ Size: 60 ++ Dispatch-Rate: 4 ++ Ports: ++ - Port 0 ++ - Port 1 ++ - Port 2 ++ - Port 4 ++ - Port 5 ++ - Port 3 ++Execution-Units: ++ 0: ++ Pipelined: True ++ 1: ++ Pipelined: True ++ 2: ++ Pipelined: True ++ 3: ++ Pipelined: True ++ 4: ++ Pipelined: True ++ 5: ++ Pipelined: True ++Latencies: ++ 0: ++ Instruction-Groups: ++ - INT_SIMPLE_ARTH ++ - INT_SIMPLE_LOGICAL ++ 
Execution-Latency: 1 ++ Execution-Throughput: 1 ++ 1: ++ Instruction-Groups: ++ - INT_MUL ++ Execution-Latency: 1 ++ Execution-Throughput: 1 ++ 2: ++ Instruction-Groups: ++ - INT_DIV ++ Execution-Latency: 4 ++ Execution-Throughput: 4 ++# CPU-Info mainly used to generate a replica of the special (or system) file directory ++# structure ++CPU-Info: ++ # Set Generate-Special-Dir to 'T' to generate the special files directory, or to 'F' to not. ++ # (Not generating the special files directory may require the user to copy over files manually) ++ Generate-Special-Dir: true ++ # Core-Count MUST be 1 as multi-core is not supported at this time. (TX2 true value is 32) ++ Core-Count: 1 ++ # Socket-Count MUST be 1 as multi-socket simulations are not supported at this time. (TX2 true value is 2) ++ Socket-Count: 1 ++ # SMT MUST be 1 as Simultanious-Multi-Threading is not supported at this time. (TX2 true value is 4) ++ SMT: 1 ++ # Below are the values needed to generate /proc/cpuinfo ++ BogoMIPS: 400.00 ++ Features: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics cpuid asimdrdm ++ CPU-Implementer: "0x43" ++ CPU-Architecture: 8 ++ CPU-Variant: "0x1" ++ CPU-Part: "0x0af" ++ CPU-Revision: 2 ++ # Package-Count is used to generate ++ # /sys/devices/system/cpu/cpu{0..Core-Count}/topology/{physical_package_id, core_id} ++ Package-Count: 1 +\ No newline at end of file +diff --git a/configs/DEMO_RISCV32_mcu_sst.yaml b/configs/DEMO_RISCV32_mcu_sst.yaml +new file mode 100644 +index 00000000..42e4c5d8 +--- /dev/null ++++ b/configs/DEMO_RISCV32_mcu_sst.yaml +@@ -0,0 +1,145 @@ ++--- ++# The following resources where utilised to create the config file and naming schemes: ++# https://en.wikichip.org/wiki/cavium/microarchitectures/vulcan ++ ++Core: ++ ISA: rv32 ++ Simulation-Mode: mcu ++ Clock-Frequency: 2.5 ++ Fetch-Block-Size: 32 ++ Trace: True ++ EnableHaltCheck: False ++ MaxStallCycleTimeout: 10000 ++ MaxSimCycleTimeout: 1000000000 ++ MaxInstrTimeout: 1000000000 ++Fetch: ++ Fetch-Block-Size: 32 ++ Loop-Buffer-Size: 0 ++ Loop-Detection-Threshold: 0 ++Process-Image: ++ Heap-Size: 1073741824 ++ Stack-Size: 1048576 ++Register-Set: ++ GeneralPurpose-Count: 32 ++ FloatingPoint-Count: 32 ++Pipeline-Widths: ++ Commit: 4 ++ Dispatch-Rate: 4 ++ FrontEnd: 4 ++ LSQ-Completion: 2 ++Queue-Sizes: ++ ROB: 180 ++ Load: 64 ++ Store: 36 ++Branch-Predictor: ++ BTB-Tag-Bits: 11 ++ Saturating-Count-Bits: 2 ++ Global-History-Length: 10 ++ RAS-entries: 1 # need change; tmp solution: staticPred header file ++ Fallback-Static-Predictor: "Always-Taken" ++ BTB-bitlength: 16 ++ Static-Type: "Always-Taken" ++L1-Data-Memory: ++ Interface-Type: External ++L1-Instruction-Memory: ++ Interface-Type: Flat ++LSQ-L1-Interface: ++ Access-Latency: 1 ++ Exclusive: False ++ Load-Bandwidth: 32 ++ Store-Bandwidth: 16 ++ Permitted-Requests-Per-Cycle: 2 ++ Permitted-Loads-Per-Cycle: 2 ++ Permitted-Stores-Per-Cycle: 1 ++Ports: ++ 0: ++ Portname: Port 0 ++ Instruction-Support: ++ - INT_SIMPLE ++ - INT_MUL ++ 1: ++ Portname: Port 1 ++ Instruction-Support: ++ - INT ++ 2: ++ Portname: Port 2 ++ Instruction-Support: ++ - INT_SIMPLE ++ - INT_MUL ++ - BRANCH ++ 3: ++ Portname: Port 4 ++ Instruction-Support: ++ - LOAD ++ 4: ++ Portname: Port 5 ++ Instruction-Support: ++ - LOAD ++ 5: ++ Portname: Port 3 ++ Instruction-Support: ++ - STORE ++Reservation-Stations: ++ 0: ++ Size: 60 ++ Dispatch-Rate: 4 ++ Ports: ++ - Port 0 ++ - Port 1 ++ - Port 2 ++ - Port 4 ++ - Port 5 ++ - Port 3 ++Execution-Units: ++ 0: ++ Pipelined: True ++ 1: ++ Pipelined: True ++ 2: ++ Pipelined: 
True ++ 3: ++ Pipelined: True ++ 4: ++ Pipelined: True ++ 5: ++ Pipelined: True ++Latencies: ++ 0: ++ Instruction-Groups: ++ - INT_SIMPLE_ARTH ++ - INT_SIMPLE_LOGICAL ++ Execution-Latency: 1 ++ Execution-Throughput: 1 ++ 1: ++ Instruction-Groups: ++ - INT_MUL ++ Execution-Latency: 1 ++ Execution-Throughput: 1 ++ 2: ++ Instruction-Groups: ++ - INT_DIV ++ Execution-Latency: 4 ++ Execution-Throughput: 4 ++# CPU-Info mainly used to generate a replica of the special (or system) file directory ++# structure ++CPU-Info: ++ # Set Generate-Special-Dir to 'T' to generate the special files directory, or to 'F' to not. ++ # (Not generating the special files directory may require the user to copy over files manually) ++ Generate-Special-Dir: true ++ # Core-Count MUST be 1 as multi-core is not supported at this time. (TX2 true value is 32) ++ Core-Count: 1 ++ # Socket-Count MUST be 1 as multi-socket simulations are not supported at this time. (TX2 true value is 2) ++ Socket-Count: 1 ++ # SMT MUST be 1 as Simultanious-Multi-Threading is not supported at this time. (TX2 true value is 4) ++ SMT: 1 ++ # Below are the values needed to generate /proc/cpuinfo ++ BogoMIPS: 400.00 ++ Features: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics cpuid asimdrdm ++ CPU-Implementer: "0x43" ++ CPU-Architecture: 8 ++ CPU-Variant: "0x1" ++ CPU-Part: "0x0af" ++ CPU-Revision: 2 ++ # Package-Count is used to generate ++ # /sys/devices/system/cpu/cpu{0..Core-Count}/topology/{physical_package_id, core_id} ++ Package-Count: 1 +diff --git a/src/include/simeng/BranchPredictor.hh b/src/include/simeng/BranchPredictor.hh +index 88be07dd..8d76f087 100644 +--- a/src/include/simeng/BranchPredictor.hh ++++ b/src/include/simeng/BranchPredictor.hh +@@ -46,6 +46,11 @@ class BranchPredictor { + public: + virtual ~BranchPredictor(){}; + ++ /** Overload predict() with more information in parameters */ ++ virtual BranchPrediction predict(uint64_t address, BranchType type, ++ uint64_t knownTarget, uint8_t instByteLength) ++ = 0; ++ + /** Generate a branch prediction for the specified instruction address with a + * branch type and possible known target. */ + virtual BranchPrediction predict(uint64_t address, BranchType type, +diff --git a/src/include/simeng/CoreInstance.hh b/src/include/simeng/CoreInstance.hh +index c8e151e8..e4d5b232 100644 +--- a/src/include/simeng/CoreInstance.hh ++++ b/src/include/simeng/CoreInstance.hh +@@ -16,6 +16,7 @@ + #include "simeng/kernel/Linux.hh" + #include "simeng/models/emulation/Core.hh" + #include "simeng/models/inorder/Core.hh" ++#include "simeng/models/mcu/Core.hh" + #include "simeng/models/outoforder/Core.hh" + #include "simeng/pipeline/A64FXPortAllocator.hh" + #include "simeng/pipeline/BalancedPortAllocator.hh" +@@ -37,7 +38,7 @@ uint32_t hex_[] = { + namespace simeng { + + /** The available modes of simulation. */ +-enum class SimulationMode { Emulation, InOrderPipelined, OutOfOrder }; ++enum class SimulationMode { Emulation, InOrderPipelined, MCU, OutOfOrder }; + + /** A class to create a SimEng core instance from a supplied config. 
*/ + class CoreInstance { +diff --git a/src/include/simeng/Elf.hh b/src/include/simeng/Elf.hh +index 14bcddcb..485debea 100644 +--- a/src/include/simeng/Elf.hh ++++ b/src/include/simeng/Elf.hh +@@ -2,6 +2,7 @@ + + #include + #include ++#include + + #include "simeng/span.hh" + +@@ -30,23 +31,85 @@ struct Elf32Header { + uint32_t memorySize; + }; + ++typedef struct { ++ unsigned char e_ident[16]; ++ uint16_t e_type; ++ uint16_t e_machine; ++ uint32_t e_version; ++ uint32_t e_entry; ++ uint32_t e_phoff; ++ uint32_t e_shoff; ++ uint32_t e_flags; ++ uint16_t e_ehsize; ++ uint16_t e_phentsize; ++ uint16_t e_phnum; ++ uint16_t e_shentsize; ++ uint16_t e_shnum; ++ uint16_t e_shstrndx; ++} Elf32_Ehdr; ++ ++typedef struct { ++ uint32_t p_type; ++ uint32_t p_offset; ++ uint32_t p_vaddr; ++ uint32_t p_paddr; ++ uint32_t p_filesz; ++ uint32_t p_memsz; ++ uint32_t p_flags; ++ uint32_t p_align; ++} Elf32_Phdr; ++ ++typedef struct { ++ uint32_t sh_name; ++ uint32_t sh_type; ++ uint32_t sh_flags; ++ uint32_t sh_addr; ++ uint32_t sh_offset; ++ uint32_t sh_size; ++ uint32_t sh_link; ++ uint32_t sh_info; ++ uint32_t sh_addralign; ++ uint32_t sh_entsize; ++} Elf32_Shdr; ++ ++typedef struct { ++ uint32_t st_name; ++ uint32_t st_value; ++ uint32_t st_size; ++ unsigned char st_info; ++ unsigned char st_other; ++ uint16_t st_shndx; ++} Elf32_Sym; ++ ++enum ElfPhType { ++ PT_NULL, ++ PT_LOAD ++}; ++ ++enum ElfShType { ++ SHT_NULL, ++ SHT_PROGBITS, ++ SHT_SYMTAB, ++ SHT_STRTAB ++}; ++ + /** A processed Executable and Linkable Format (ELF) file. */ + class Elf { +- public: +- Elf(std::string path, char** imagePointer); +- ~Elf(); +- uint64_t getProcessImageSize() const; +- bool isValid() const; +- uint64_t getEntryPoint() const; +- +- private: +- uint64_t entryPoint_; +- std::vector headers_; +- uint32_t entryPoint32_; +- std::vector headers32_; +- bool isValid_ = false; +- uint64_t processImageSize_; +- bool mode32bit_; ++ public: ++ Elf(std::string path, char** imagePointer, std::unordered_map& symbols); ++ ~Elf(); ++ uint64_t getProcessImageSize() const; ++ bool isValid() const; ++ uint64_t getEntryPoint() const; ++ ++ private: ++ uint64_t entryPoint_; ++ std::vector headers_; ++ uint32_t entryPoint32_; ++ std::vector headers32_; ++ bool isValid_ = false; ++ uint64_t processImageSize_; ++ bool mode32bit_; + }; + + } // namespace simeng +diff --git a/src/include/simeng/GenericPredictor.hh b/src/include/simeng/GenericPredictor.hh +index 21df57a4..aff5ade8 100644 +--- a/src/include/simeng/GenericPredictor.hh ++++ b/src/include/simeng/GenericPredictor.hh +@@ -26,6 +26,9 @@ class GenericPredictor : public BranchPredictor { + GenericPredictor(YAML::Node config); + ~GenericPredictor(); + ++ BranchPrediction predict(uint64_t address, BranchType type, ++ uint64_t knownTarget, uint8_t byteLength) override; ++ + /** Generate a branch prediction for the supplied instruction address, a + * branch type, and a known target if not 0. Returns a branch direction and + * branch target address. */ +diff --git a/src/include/simeng/Instruction.hh b/src/include/simeng/Instruction.hh +index 8b1cf2f9..9ffc4a8d 100644 +--- a/src/include/simeng/Instruction.hh ++++ b/src/include/simeng/Instruction.hh +@@ -23,6 +23,9 @@ class Instruction { + * instruction. */ + bool exceptionEncountered() const; + ++ /** Binds an interrupt to this instruction */ ++ virtual void raiseInterrupt(int16_t& interruptId) {} ++ + /** Retrieve the source registers this instruction reads. 
*/ + virtual const span getOperandRegisters() const = 0; + +@@ -99,8 +102,8 @@ class Instruction { + /** Retrieve branch type. */ + virtual BranchType getBranchType() const = 0; + +- /** Retrieve a branch target from the instruction's metadata if known. */ +- virtual uint64_t getKnownTarget() const = 0; ++ /** Retrieve an offset of branch target from the instruction's metadata if known. */ ++ virtual uint64_t getKnownOffset() const = 0; + + /** Is this a store address operation (a subcategory of store operations which + * deal with the generation of store addresses to store data at)? */ +@@ -178,6 +181,12 @@ class Instruction { + /** Get arbitrary micro-operation index. */ + int getMicroOpIndex() const; + ++ bool isDiv() const; ++ ++ bool isMul() const; ++ ++ bool isSysCall() const; ++ + protected: + /** Whether an exception has been encountered. */ + bool exceptionEncountered_ = false; +@@ -208,8 +217,8 @@ class Instruction { + /** What type of branch this instruction is. */ + BranchType branchType_ = BranchType::Unknown; + +- /** If the branch target is known at the time of decode, store it. */ +- uint64_t knownTarget_ = 0; ++ /** If the offset of branch target is known at the time of decode, store it. */ ++ uint64_t knownOffset_ = 0; + + // Flushing + /** This instruction's sequence ID; a higher ID represents a chronologically +@@ -252,6 +261,12 @@ class Instruction { + /** An arbitrary index value for the micro-operation. Its use is based on the + * implementation of specific micro-operations. */ + int microOpIndex_; ++ ++ bool isMul_ = false; ++ ++ bool isDiv_ = false; ++ ++ bool isSysCall_ = false; + }; + + } // namespace simeng +\ No newline at end of file +diff --git a/src/include/simeng/arch/Architecture.hh b/src/include/simeng/arch/Architecture.hh +index edd404c8..29874c6d 100644 +--- a/src/include/simeng/arch/Architecture.hh ++++ b/src/include/simeng/arch/Architecture.hh +@@ -101,6 +101,9 @@ class Architecture { + /** Returns the maximum size of a valid instruction in bytes. */ + virtual uint8_t getMaxInstructionSize() const = 0; + ++ /** Returns the minimum size of a valid instruction in bytes. */ ++ virtual uint8_t getMinInstructionSize() const = 0; ++ + /** Returns the physical register structure as defined within the config + * file + */ +@@ -113,7 +116,7 @@ class Architecture { + YAML::Node config) const = 0; + + /** Updates System registers of any system-based timers. */ +- virtual void updateSystemTimerRegisters(RegisterFileSet* regFile, ++ virtual int16_t updateSystemTimerRegisters(RegisterFileSet* regFile, + const uint64_t iterations) const = 0; + + /** Update trace file */ +diff --git a/src/include/simeng/arch/aarch64/Architecture.hh b/src/include/simeng/arch/aarch64/Architecture.hh +index ad14dc1c..3c1ce27f 100644 +--- a/src/include/simeng/arch/aarch64/Architecture.hh ++++ b/src/include/simeng/arch/aarch64/Architecture.hh +@@ -51,6 +51,9 @@ class Architecture : public arch::Architecture { + /** Returns the maximum size of a valid instruction in bytes. */ + uint8_t getMaxInstructionSize() const override; + ++ /** Returns the minimum size of a valid instruction in bytes. */ ++ uint8_t getMinInstructionSize() const override; ++ + /** Returns the current vector length set by the provided configuration. */ + uint64_t getVectorLength() const; + +@@ -59,7 +62,7 @@ class Architecture : public arch::Architecture { + uint64_t getStreamingVectorLength() const; + + /** Updates System registers of any system-based timers. 
*/ +- void updateSystemTimerRegisters(RegisterFileSet* regFile, ++ int16_t updateSystemTimerRegisters(RegisterFileSet* regFile, + const uint64_t iterations) const override; + + /** Returns the physical register structure as defined within the config file +diff --git a/src/include/simeng/arch/aarch64/Instruction.hh b/src/include/simeng/arch/aarch64/Instruction.hh +index 43d1bd49..bffa3c62 100644 +--- a/src/include/simeng/arch/aarch64/Instruction.hh ++++ b/src/include/simeng/arch/aarch64/Instruction.hh +@@ -301,7 +301,7 @@ class Instruction : public simeng::Instruction { + BranchType getBranchType() const override; + + /** Retrieve a branch target from the instruction's metadata if known. */ +- uint64_t getKnownTarget() const override; ++ uint64_t getKnownOffset() const override; + + /** Is this a store address operation (a subcategory of store operations which + * deal with the generation of store addresses to store data at)? */ +diff --git a/src/include/simeng/arch/riscv/Architecture.hh b/src/include/simeng/arch/riscv/Architecture.hh +index de6c76c7..3bdb6287 100644 +--- a/src/include/simeng/arch/riscv/Architecture.hh ++++ b/src/include/simeng/arch/riscv/Architecture.hh +@@ -6,27 +6,18 @@ + #include + + #include "simeng/arch/Architecture.hh" +-#include "simeng/arch/riscv/ExceptionHandler.hh" ++ + #include "simeng/arch/riscv/Instruction.hh" + #include "simeng/kernel/Linux.hh" + + using csh = size_t; + ++#include "simeng/arch/riscv/SystemRegister.hh" ++#include "simeng/arch/riscv/ExceptionHandler.hh" ++ + namespace simeng { + namespace arch { + namespace riscv { +- +-enum riscv_sysreg { +- SYSREG_MSTATUS = 0x300, +- SYSREG_MSTATUSH = 0x310, +- SYSREG_MEPC = 0x341, +- SYSREG_MCAUSE = 0x342, +- SYSREG_MHARTID = 0xF14, +- SYSREG_CYCLE = 0xC00, +- SYSREG_TIME = 0xC01, +- SYSREG_INSTRRET = 0xC02 +-}; +- + struct constantsPool { + const uint8_t alignMask = 0x3; + const uint8_t alignMaskCompressed = 0x1; +@@ -45,7 +36,7 @@ struct archConstants { + /* A basic RISC-V implementation of the `Architecture` interface. */ + class Architecture : public arch::Architecture { + public: +- Architecture(kernel::Linux& kernel, YAML::Node config); ++ Architecture(kernel::Linux& kernel, YAML::Node config, std::shared_ptr& dataMemory); + ~Architecture(); + /** Pre-decode instruction memory into a macro-op of `Instruction` + * instances. Returns the number of bytes consumed to produce it (always 4), +@@ -60,6 +51,9 @@ class Architecture : public arch::Architecture { + /** Returns a zero-indexed register tag for a system register encoding. */ + int32_t getSystemRegisterTag(uint16_t reg) const override; + ++ /** Returns a System Register index from a system register tag. */ ++ uint16_t getSystemRegisterIdFromTag(int32_t tag) const; ++ + /** Returns the number of system registers that have a mapping. */ + uint16_t getNumSystemRegisters() const override; + +@@ -77,8 +71,11 @@ class Architecture : public arch::Architecture { + /** Returns the maximum size of a valid instruction in bytes. */ + uint8_t getMaxInstructionSize() const override; + +- /** Updates System registers of any system-based timers. */ +- void updateSystemTimerRegisters(RegisterFileSet* regFile, ++ /** Returns the minimum size of a valid instruction in bytes. */ ++ uint8_t getMinInstructionSize() const override; ++ ++ /** Updates System registers of any system-based timers. 
Return +ve id if interrupt occurs */ ++ int16_t updateSystemTimerRegisters(RegisterFileSet* regFile, + const uint64_t iterations) const override; + + /** Returns the physical register structure as defined within the config file +@@ -117,6 +114,18 @@ class Architecture : public arch::Architecture { + /** A mapping from system register encoding to a zero-indexed tag. */ + std::unordered_map systemRegisterMap_; + ++ /** Ordered map of memory mapped system regsiters banks **/ ++ std::map memoryMappedSystemRegisterBlocks; ++ ++ /* Memory Interface through which memory mapped system registers are accessed */ ++ std::shared_ptr systemRegisterMemoryInterface; ++ ++ /* Optional Clint block which replicates that functionality in spike */ ++ std::shared_ptr clint; ++ ++ /* Optional Host Target Interface block which replicates that functionality in spike */ ++ std::shared_ptr htif; ++ + /** A map to hold the relationship between aarch64 instruction groups and + * user-defined execution information. */ + std::unordered_map groupExecutionInfo_; +diff --git a/src/include/simeng/arch/riscv/ExceptionHandler.hh b/src/include/simeng/arch/riscv/ExceptionHandler.hh +index 02d29c93..36cfd5d1 100644 +--- a/src/include/simeng/arch/riscv/ExceptionHandler.hh ++++ b/src/include/simeng/arch/riscv/ExceptionHandler.hh +@@ -57,6 +57,9 @@ class ExceptionHandler : public simeng::arch::ExceptionHandler { + */ + bool readBufferThen(uint64_t ptr, uint64_t length, std::function then, + bool firstCall = true); ++ ++ /** generate system register changes associated with taking an exception **/ ++ void takeException(uint64_t causecode); + + /** A data buffer used for reading data from memory. */ + std::vector dataBuffer; +diff --git a/src/include/simeng/arch/riscv/Instruction.hh b/src/include/simeng/arch/riscv/Instruction.hh +index 3f023d28..60966ce0 100644 +--- a/src/include/simeng/arch/riscv/Instruction.hh ++++ b/src/include/simeng/arch/riscv/Instruction.hh +@@ -48,7 +48,8 @@ enum class InstructionException { + HypervisorCall, + SecureMonitorCall, + UnmappedSysReg, +- NoAvailablePort ++ NoAvailablePort, ++ Interrupt + }; + + enum CInstructionFormat { +@@ -87,6 +88,18 @@ class Instruction : public simeng::Instruction { + * processing this instruction. */ + virtual InstructionException getException() const; + ++ /** Raise an interrupt. */ ++ void raiseInterrupt(int16_t& interruptId) ++ { ++ interruptId_ = interruptId; ++ exceptionEncountered_ = true; ++ exception_ = InstructionException::Interrupt; ++ interruptId = -1; ++ } ++ ++ /** Get Id of this interrupr */ ++ int16_t getInterruptId() const { return interruptId_; } ++ + /** Retrieve the source registers this instruction reads. */ + const span getOperandRegisters() const override; + +@@ -139,8 +152,8 @@ class Instruction : public simeng::Instruction { + /** Retrieve branch type. */ + BranchType getBranchType() const override; + +- /** Retrieve a branch target from the instruction's metadata if known. */ +- uint64_t getKnownTarget() const override; ++ /** Retrieve an offset of branch target from the instruction's metadata if known. */ ++ uint64_t getKnownOffset() const override; + + /** Is this a store address operation (a subcategory of store operations which + * deal with the generation of store addresses to store data at)? */ +@@ -186,6 +199,8 @@ class Instruction : public simeng::Instruction { + /** ONLY valid after decode. 
Return regByteWidth */ + uint8_t getArchRegWidth() const; + ++ const Architecture& getArchitecture() const; ++ + private: + /** The maximum number of source registers any supported RISC-V instruction + * can have. */ +@@ -292,7 +307,9 @@ class Instruction : public simeng::Instruction { + std::vector memoryData; + + /** Return integer register value, to support both 32-bit and 64-bit mode */ +- int64_t getSignedInt(RegisterValue& value) const; ++ int64_t getSignedInt(RegisterValue& value) const; ++ ++ int16_t interruptId_; + }; + + } // namespace riscv +diff --git a/src/include/simeng/arch/riscv/SystemRegister.hh b/src/include/simeng/arch/riscv/SystemRegister.hh +new file mode 100644 +index 00000000..0556156e +--- /dev/null ++++ b/src/include/simeng/arch/riscv/SystemRegister.hh +@@ -0,0 +1,229 @@ ++#pragma once ++ ++#include ++#include ++#include ++#include ++ ++#include "simeng/arch/Architecture.hh" ++ ++#include "simeng/arch/riscv/Instruction.hh" ++#include "simeng/kernel/Linux.hh" ++ ++namespace simeng { ++namespace arch { ++namespace riscv { ++ ++// Should probably move to Capstone ++ ++enum riscv_sysreg { ++ SYSREG_MSTATUS = 0x300, ++ SYSREG_MIE = 0x304, ++ SYSREG_MTVEC = 0x305, ++ SYSREG_MSTATUSH = 0x310, ++ SYSREG_MSCRATCH = 0x340, ++ SYSREG_MEPC = 0x341, ++ SYSREG_MCAUSE = 0x342, ++ SYSREG_MHARTID = 0xF14, ++ SYSREG_MXCPTSC = 0xFC2, ++ SYSREG_CYCLE = 0xC00, ++ SYSREG_TIME = 0xC01, ++ SYSREG_INSTRRET = 0xC02 ++}; ++ ++enum riscv_causecode_enum { ++ CAUSE_IADDRESS_MISALIGN = 0, ++ CAUSE_IACCESS_FAULT = 1, ++ CAUSE_ILLEGAL_INSTRUCTION = 2, ++ CAUSE_BREAKPOINT = 3, ++ CAUSE_LDADDRESS_MISALIGN = 4, ++ CAUSE_LDACCESS_FAULT = 5, ++ CAUSE_STADDRESS_MISALIGN = 6, ++ CAUSE_STACCESS_FAULT = 7, ++ CAUSE_ECALL_FROM_M = 11 ++}; ++ ++enum class InterruptId { ++ HALT = 1, ++ TIMER = 7 ++}; ++ ++enum riscv_sysreg_masks { ++ MSTATUS_MIE_MASK = 0x8, ++ MSTATUS_MPIE_MASK = 0x80 ++}; ++ ++typedef uint16_t riscv_causecode; ++ ++class MemoryMappedSystemRegister { ++ public: ++ MemoryMappedSystemRegister(const RegisterValue& val) : state(val) {} ++ bool size() { return state.size(); } ++ virtual void put(const RegisterValue& val) { state = val; } ++ virtual const RegisterValue& get() { return state; } ++ private: ++ RegisterValue state; ++}; ++ ++class MemoryMappedSystemRegisterBlock { ++ public: ++ MemoryMappedSystemRegisterBlock(size_t sz) : size_(sz) {} ++ size_t size() { return size_; } ++ virtual bool put(uint16_t, const RegisterValue&); ++ virtual bool get(uint16_t, RegisterValue&); ++ virtual void tick() {} ++ protected: ++ /** Ordered map of memory mapped system regsiters **/ ++ std::map memoryMappedSystemRegisters; ++ size_t size_; ++}; ++ ++class SystemRegisterMemoryInterface : public MemoryInterface { ++ public: ++ SystemRegisterMemoryInterface( ++ std::shared_ptr& dataMemory, ++ std::map& memoryMappedSystemRegisterBlocks ++ ) : ++ dataMemory_(dataMemory), ++ memoryMappedSystemRegisterBlocks_(memoryMappedSystemRegisterBlocks) ++ {} ++ ++ /** Request a read from the supplied target location. */ ++ virtual void requestRead(const MemoryAccessTarget& target, ++ uint64_t requestId = 0) ++ { ++ RegisterValue data(0,target.size); ++ if (getMemoryMappedSystemRegister(target.address, data)) ++ completedReads_.push_back({target, data, requestId}); ++ else ++ dataMemory_.get()->requestRead(target,requestId); ++ } ++ ++ /** Request a write of `data` to the target location. 
*/ ++ virtual void requestWrite(const MemoryAccessTarget& target, ++ const RegisterValue& data) ++ { ++ if (!putMemoryMappedSystemRegister(target.address, data)) ++ dataMemory_.get()->requestWrite(target,data); ++ } ++ ++ /** Retrieve all completed read requests. */ ++ virtual const span getCompletedReads() const ++ { ++ if (completedReads_.empty()) ++ return dataMemory_.get()->getCompletedReads(); ++ else ++ return {const_cast(completedReads_.data()), completedReads_.size()}; ++ } ++ ++ /** Clear the completed reads. */ ++ virtual void clearCompletedReads() ++ { ++ if (completedReads_.empty()) ++ dataMemory_.get()->clearCompletedReads(); ++ else ++ completedReads_.clear(); ++ } ++ ++ /** Returns true if there are any oustanding memory requests in-flight. */ ++ virtual bool hasPendingRequests() const ++ { ++ return dataMemory_.get()->hasPendingRequests(); ++ } ++ ++ /** Tick the memory interface to allow it to process internal tasks. ++ * ++ * TODO: Move ticking out of the memory interface and into a central "memory ++ * system" covering a set of related interfaces. ++ */ ++ virtual void tick() ++ { ++ dataMemory_.get()->tick(); ++ } ++ ++ private : ++ /** Put/Get Memory Mapped Registers */ ++ bool putMemoryMappedSystemRegister(uint64_t address, const RegisterValue& value); ++ bool getMemoryMappedSystemRegister(uint64_t address, RegisterValue& value); ++ ++ std::shared_ptr dataMemory_; ++ ++ /** Address map of all system register blocks */ ++ std::map& memoryMappedSystemRegisterBlocks_; ++ ++ /** A vector containing all completed read requests. */ ++ std::vector completedReads_; ++}; ++ ++class Architecture; ++ ++class HostTargetInterface : public MemoryMappedSystemRegisterBlock { ++ public: ++ enum { ++ PAYLOAD_OFFSET = 0, ++ DEVICEID_OFFSET = 4 ++ }; ++ ++ HostTargetInterface(Architecture& architecture) ++ : ++ MemoryMappedSystemRegisterBlock(8), ++ architecture_(architecture), ++ isHalted_(false) ++ { ++ memoryMappedSystemRegisters[PAYLOAD_OFFSET] = new MemoryMappedSystemRegister(static_cast(0)); ++ memoryMappedSystemRegisters[DEVICEID_OFFSET] = new MemoryMappedSystemRegister(static_cast(0)); ++ } ++ ++ bool put(uint16_t offset, const RegisterValue&value); ++ ++ int16_t updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations) { ++ if (isHalted_) ++ return static_cast(InterruptId::HALT); ++ return -1; ++ } ++ ++ private : ++ Architecture& architecture_; ++ bool isHalted_; ++}; ++ ++class Clint : public MemoryMappedSystemRegisterBlock { ++ public: ++ enum { ++ CLINT_BASE = 0x02000000, ++ CLINT_SIZE = 0x0000c000, ++ MTIMECMP_OFFSET = 0x4000, ++ MTIME_OFFSET = 0xbff8 ++ }; ++ ++ Clint(Architecture& architecture) ++ : ++ MemoryMappedSystemRegisterBlock(CLINT_SIZE), ++ architecture_(architecture), ++ mtime_(static_cast(0)), ++ mtimecmp_(static_cast(0)), ++ mtime_freq(100), ++ mtime_count(0), ++ last_tick(0) ++ { ++ memoryMappedSystemRegisters[MTIME_OFFSET] = &mtime_; ++ memoryMappedSystemRegisters[MTIMECMP_OFFSET] = &mtimecmp_; ++ } ++ ++ int16_t updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations); ++ ++ private : ++ Architecture& architecture_; ++ ++ MemoryMappedSystemRegister mtime_; ++ MemoryMappedSystemRegister mtimecmp_; ++ ++ uint32_t mtime_freq; ++ uint32_t mtime_count; ++ uint64_t last_tick; ++}; ++ ++ ++} // namespace riscv ++} // namespace arch ++} // namespace simeng +diff --git a/src/include/simeng/kernel/Linux.hh b/src/include/simeng/kernel/Linux.hh +index 0908d590..635bd427 100644 +--- a/src/include/simeng/kernel/Linux.hh ++++ 
b/src/include/simeng/kernel/Linux.hh +@@ -93,6 +93,8 @@ struct LinuxProcessState { + std::vector fileDescriptorTable; + /** Set of deallocated virtual file descriptors available for reuse. */ + std::set freeFileDescriptors; ++ /** Pointer to LinuxProcess from which ProcessState derived*/ ++ const LinuxProcess* process; + }; + + /** Fixed-width definition of 'rusage' (from ). */ +@@ -236,6 +238,9 @@ class Linux { + /** The maximum size of a filesystem path. */ + static const size_t LINUX_PATH_MAX = 4096; + ++ /** Lookup symbol value from table in elf file. */ ++ bool lookupSymbolValue(const std::string symbol, uint64_t& value); ++ + private: + /** Resturn correct Dirfd depending on given pathname abd dirfd given to + * syscall. */ +diff --git a/src/include/simeng/kernel/LinuxProcess.hh b/src/include/simeng/kernel/LinuxProcess.hh +index 9796b529..d6b2c4a9 100644 +--- a/src/include/simeng/kernel/LinuxProcess.hh ++++ b/src/include/simeng/kernel/LinuxProcess.hh +@@ -77,6 +77,9 @@ class LinuxProcess { + /** Check whether the process image was created successfully. */ + bool isValid() const; + ++ /** Lookup symbol value from table in elf file. */ ++ bool lookupSymbolValue(const std::string symbol, uint64_t& value) const; ++ + private: + /** The size of the stack, in bytes. */ + const uint64_t STACK_SIZE; +@@ -113,6 +116,8 @@ class LinuxProcess { + + /** Shared pointer to processImage. */ + std::shared_ptr processImage_; ++ ++ std::unordered_map symbols_; + }; + + } // namespace kernel +diff --git a/src/include/simeng/models/emulation/Core.hh b/src/include/simeng/models/emulation/Core.hh +index c4a4acc4..1db10d23 100644 +--- a/src/include/simeng/models/emulation/Core.hh ++++ b/src/include/simeng/models/emulation/Core.hh +@@ -108,6 +108,9 @@ class Core : public simeng::Core { + + /** The number of branches executed. */ + uint64_t branchesExecuted_ = 0; ++ ++ /** Set to interruptId when interrupt occurs, otherwise -1 */ ++ int16_t interruptId_; + }; + + } // namespace emulation +diff --git a/src/include/simeng/models/mcu/Core.hh b/src/include/simeng/models/mcu/Core.hh +new file mode 100644 +index 00000000..de6a53d3 +--- /dev/null ++++ b/src/include/simeng/models/mcu/Core.hh +@@ -0,0 +1,181 @@ ++#pragma once ++ ++#include ++ ++#include "simeng/ArchitecturalRegisterFileSet.hh" ++#include "simeng/Core.hh" ++#include "simeng/FlatMemoryInterface.hh" ++#include "simeng/pipeline_hi/DecodeUnit.hh" ++#include "simeng/pipeline_hi/ExecuteUnit.hh" ++#include "simeng/pipeline_hi/FetchUnit.hh" ++#include "simeng/pipeline_hi/WritebackUnit.hh" ++#include "simeng/pipeline_hi/StaticPredictor.hh" ++#include "simeng/pipeline_hi/LoadStoreQueue.hh" ++#include "simeng/pipeline_hi/RegDepMap.hh" ++ ++#include "simeng/arch/riscv/Architecture.hh" ++ ++namespace simeng { ++namespace models { ++namespace mcu { ++ ++/** An entry in the reservation station. */ ++struct dependencyEntry1 { ++ /** The instruction to execute. */ ++ std::shared_ptr uop; ++ ++ /** The operand waiting on a value. */ ++ uint16_t operandIndex; ++}; ++ ++/** A simple scalar in-order pipelined core model. */ ++class Core : public simeng::Core { ++ public: ++ /** Construct a core model, providing an ISA and branch predictor to use, ++ * along with a pointer and size of instruction memory, and a pointer to ++ * process memory. 
*/ ++ Core(MemoryInterface& instructionMemory, MemoryInterface& dataMemory, ++ uint64_t processMemorySize, uint64_t entryPoint, ++ const arch::Architecture& isa, BranchPredictor& branchPredictor, YAML::Node config); ++ ++ /** Tick the core. Ticks each of the pipeline stages sequentially, then ticks ++ * the buffers between them. Checks for and executes pipeline flushes at the ++ * end of each cycle. */ ++ void tick() override; ++ ++ /** Check whether the program has halted. */ ++ bool hasHalted() const override; ++ ++ /** Retrieve the architectural register file set. */ ++ const ArchitecturalRegisterFileSet& getArchitecturalRegisterFileSet() ++ const override; ++ ++ /** Retrieve the number of instructions retired. */ ++ uint64_t getInstructionsRetiredCount() const override; ++ ++ /** Retrieve the simulated nanoseconds elapsed since the core started. */ ++ uint64_t getSystemTimer() const override; ++ ++ /** Generate a map of statistics to report. */ ++ std::map getStats() const override; ++ ++ private: ++ /** Raise an exception to the core, providing the generating instruction. */ ++ void raiseException(const std::shared_ptr& instruction); ++ ++ /** Handle an exception raised during the cycle. */ ++ void handleException(); ++ ++ /** Load and supply memory data requested by an instruction. */ ++ void loadData(const std::shared_ptr& instruction); ++ /** Store data supplied by an instruction to memory. */ ++ void storeData(const std::shared_ptr& instruction); ++ ++ /** Forward operands to the most recently decoded instruction. */ ++ void forwardOperands(const span& destinations, ++ const span& values); ++ ++ bool canIssue(const std::shared_ptr& instruction); ++ void removeDep(const std::shared_ptr& instruction); ++ ++ /** Read pending registers for the most recently decoded instruction. */ ++ void readRegisters(); ++ ++ /** Process the active exception handler. */ ++ void processExceptionHandler(); ++ ++ /** Apply changes to the process state. */ ++ void applyStateChange(const arch::ProcessStateChange& change); ++ ++ /** Handle requesting/execution of a load instruction. */ ++ void handleLoad(const std::shared_ptr& instruction); ++ ++ void addInstrOrderQ(const std::shared_ptr& instruction); ++ bool removeInstrOrderQ(const std::shared_ptr& instruction); ++ ++ /** The process memory. */ ++ MemoryInterface& dataMemory_; ++ ++ /** A reference to the core's architecture. */ ++ const arch::Architecture& isa_; ++ ++ /** The core's register file set. */ ++ RegisterFileSet registerFileSet_; ++ ++ /** An architectural register file set, serving as a simple wrapper around the ++ * register file set. */ ++ ArchitecturalRegisterFileSet architecturalRegisterFileSet_; ++ ++ /** The process memory. */ ++ span processMemory; ++ ++ /** The buffer between fetch and decode. */ ++ pipeline_hi::PipelineBuffer fetchToDecodeBuffer_; ++ ++ /** The buffer between decode and execute. */ ++ pipeline_hi::PipelineBuffer> decodeToExecuteBuffer_; ++ ++ /** The buffer between execute and writeback. */ ++ std::vector>> ++ completionSlots_; ++ ++ /** The previously generated addresses. */ ++ std::queue previousAddresses_; ++ ++ /** The register dependency map. */ ++ pipeline_hi::RegDepMap regDepMap_; ++ ++ /** The fetch unit; fetches instructions from memory. */ ++ pipeline_hi::FetchUnit fetchUnit_; ++ ++ /** The decode unit; decodes instructions into uops and reads operands. */ ++ pipeline_hi::DecodeUnit decodeUnit_; ++ ++ /** The execute unit; executes uops and sends to writeback, also forwarding ++ * results. 
*/ ++ pipeline_hi::ExecuteUnit executeUnit_; ++ ++ /** The writeback unit; writes uop results to the register files. */ ++ pipeline_hi::WritebackUnit writebackUnit_; ++ ++ pipeline_hi::LoadStoreQueue loadStoreQueue_; ++ ++ /** The number of times the pipeline has been flushed. */ ++ uint64_t flushes_ = 0; ++ ++ /** The number of times this core has been ticked. */ ++ uint64_t ticks_ = 0; ++ ++ uint64_t lastCommitTick_ = 0; ++ ++ /** Whether an exception was generated during the cycle. */ ++ bool exceptionGenerated_ = false; ++ ++ /** A pointer to the instruction responsible for generating the exception. */ ++ std::shared_ptr exceptionGeneratingInstruction_; ++ ++ /** Whether the core has halted. */ ++ bool hasHalted_ = false; ++ ++ /** The active exception handler. */ ++ std::shared_ptr exceptionHandler_; ++ ++ std::deque> inorderIQ_; ++ ++ void checkHalting(); ++ bool enableHaltCheck = false; ++ uint64_t maxStallCycleTimeout; ++ uint64_t maxSimCycleTimeout; ++ uint64_t maxInstrTimeout; ++ ++ /** Set to interruptId when interrupt occurs, otherwise -1 */ ++ int16_t interruptId_; ++ ++ /** Return interrupt id of the pending interrupt*/ ++ int16_t isInterruptPending(); ++ ++}; ++ ++} // namespace mcu ++} // namespace models ++} // namespace simeng +diff --git a/src/include/simeng/pipeline_hi/DecodeUnit.hh b/src/include/simeng/pipeline_hi/DecodeUnit.hh +new file mode 100644 +index 00000000..728dff88 +--- /dev/null ++++ b/src/include/simeng/pipeline_hi/DecodeUnit.hh +@@ -0,0 +1,66 @@ ++#pragma once ++ ++#include ++#include ++ ++#include "simeng/arch/Architecture.hh" ++#include "simeng/pipeline_hi/PipelineBuffer.hh" ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++/** A decode unit for a pipelined processor. Splits pre-decoded macro-ops into ++ * uops. */ ++class DecodeUnit { ++ public: ++ /** Constructs a decode unit with references to input/output buffers and the ++ * current branch predictor. */ ++ DecodeUnit(PipelineBuffer& input, ++ PipelineBuffer>& output, ++ BranchPredictor& predictor, ++ std::function&)> canIssue); ++ ++ /** Ticks the decode unit. Breaks macro-ops into uops, and performs early ++ * branch misprediction checks. */ ++ void tick(); ++ ++ /** Check whether the core should be flushed this cycle. */ ++ bool shouldFlush() const; ++ ++ /** Retrieve the target instruction address associated with the most recently ++ * discovered misprediction. */ ++ uint64_t getFlushAddress() const; ++ ++ /** Retrieve the number of times that the decode unit requested a flush due to ++ * discovering a branch misprediction early. */ ++ uint64_t getEarlyFlushes() const; ++ ++ /** Clear the microOps_ queue. */ ++ void purgeFlushed(); ++ ++ private: ++ /** A buffer of macro-ops to split into uops. */ ++ PipelineBuffer& input_; ++ /** An internal buffer for storing one or more uops. */ ++ std::deque> microOps_; ++ /** A buffer for writing decoded uops into. */ ++ PipelineBuffer>& output_; ++ ++ /** A reference to the current branch predictor. */ ++ BranchPredictor& predictor_; ++ ++ /** Whether the core should be flushed after this cycle. */ ++ bool shouldFlush_; ++ ++ /** The target instruction address the PC should be updated to upon flush. */ ++ uint64_t pc_; ++ ++ /** The number of times that the decode unit requested a flush due to ++ * discovering a branch misprediction early. 
*/ ++ uint64_t earlyFlushes_ = 0; ++ ++ std::function&)> canIssue_; ++}; ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/include/simeng/pipeline_hi/DispatchIssueUnit.hh b/src/include/simeng/pipeline_hi/DispatchIssueUnit.hh +new file mode 100644 +index 00000000..132358fd +--- /dev/null ++++ b/src/include/simeng/pipeline_hi/DispatchIssueUnit.hh +@@ -0,0 +1,150 @@ ++#pragma once ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "simeng/Instruction.hh" ++#include "simeng/pipeline_hi/PipelineBuffer.hh" ++#include "simeng/pipeline_hi/PortAllocator.hh" ++#include "yaml-cpp/yaml.h" ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++/** A reservation station issue port */ ++struct ReservationStationPort { ++ /** Issue port this port maps to */ ++ uint16_t issuePort; ++ /** Queue of instructions that are ready to be ++ * issued */ ++ std::deque> ready; ++}; ++ ++/** A reservation station */ ++struct ReservationStation { ++ /** Size of reservation station */ ++ uint16_t capacity; ++ /** Number of instructions that can be dispatched to this unit per cycle. */ ++ uint16_t dispatchRate; ++ /** Current number of non-stalled instructions ++ * in reservation station */ ++ uint16_t currentSize; ++ /** Issue ports belonging to reservation station */ ++ std::vector ports; ++}; ++ ++/** An entry in the reservation station. */ ++struct dependencyEntry { ++ /** The instruction to execute. */ ++ std::shared_ptr uop; ++ /** The port to issue to. */ ++ uint16_t port; ++ /** The operand waiting on a value. */ ++ uint16_t operandIndex; ++}; ++ ++/** A dispatch/issue unit for an out-of-order pipelined processor. Reads ++ * instruction operand and performs scoreboarding. Issues instructions to the ++ * execution unit once ready. */ ++class DispatchIssueUnit { ++ public: ++ /** Construct a dispatch/issue unit with references to input/output buffers, ++ * the register file, the port allocator, and a description of the number of ++ * physical registers the scoreboard needs to reflect. */ ++ DispatchIssueUnit( ++ PipelineBuffer>& fromRename, ++ std::vector>>& issuePorts, ++ const RegisterFileSet& registerFileSet, PortAllocator& portAllocator, ++ const std::vector& physicalRegisterStructure, ++ YAML::Node config); ++ ++ /** Ticks the dispatch/issue unit. Reads available input operands for ++ * instructions and sets scoreboard flags for destination registers. */ ++ void tick(); ++ ++ /** Identify the oldest ready instruction in the reservation station and issue ++ * it. */ ++ void issue(); ++ ++ /** Forwards operands and performs register reads for the currently queued ++ * instruction. */ ++ void forwardOperands(const span& destinations, ++ const span& values); ++ ++ /** Set the scoreboard entry for the provided register as ready. */ ++ void setRegisterReady(Register reg); ++ ++ /** Clear the RS of all flushed instructions. */ ++ void purgeFlushed(); ++ ++ /** Retrieve the number of cycles this unit stalled due to insufficient RS ++ * space. */ ++ uint64_t getRSStalls() const; ++ ++ /** Retrieve the number of cycles no instructions were issued due to an empty ++ * RS. */ ++ uint64_t getFrontendStalls() const; ++ ++ /** Retrieve the number of cycles no instructions were issued due to ++ * dependencies or a lack of available ports. */ ++ uint64_t getBackendStalls() const; ++ ++ /** Retrieve the number of times an instruction was unable to issue due to a ++ * busy port. 
*/ ++ uint64_t getPortBusyStalls() const; ++ ++ /** Retrieve the current sizes and capacities of the reservation stations*/ ++ void getRSSizes(std::vector&) const; ++ ++ private: ++ /** A buffer of instructions to dispatch and read operands for. */ ++ PipelineBuffer>& input_; ++ ++ /** Ports to the execution units, for writing ready instructions to. */ ++ std::vector>>& issuePorts_; ++ ++ /** A reference to the physical register file set. */ ++ const RegisterFileSet& registerFileSet_; ++ ++ /** The register availability scoreboard. */ ++ std::vector> scoreboard_; ++ ++ /** Reservation stations */ ++ std::vector reservationStations_; ++ ++ /** A mapping from port to RS port */ ++ std::vector> portMapping_; ++ ++ /** A dependency matrix, containing all the instructions waiting on an ++ * operand. For a register `{type,tag}`, the vector of dependents may be found ++ * at `dependencyMatrix[type][tag]`. */ ++ std::vector>> dependencyMatrix_; ++ ++ /** A map to collect flushed instructions for each reservation station. */ ++ std::unordered_map>> ++ flushed_; ++ ++ /** A reference to the execution port allocator. */ ++ PortAllocator& portAllocator_; ++ ++ /** The number of cycles stalled due to a full reservation station. */ ++ uint64_t rsStalls_ = 0; ++ ++ /** The number of cycles no instructions were issued due to an empty RS. */ ++ uint64_t frontendStalls_ = 0; ++ ++ /** The number of cycles no instructions were issued due to dependencies or a ++ * lack of available ports. */ ++ uint64_t backendStalls_ = 0; ++ ++ /** The number of times an instruction was unable to issue due to a busy port. ++ */ ++ uint64_t portBusyStalls_ = 0; ++}; ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/include/simeng/pipeline_hi/ExecuteUnit.hh b/src/include/simeng/pipeline_hi/ExecuteUnit.hh +new file mode 100644 +index 00000000..da51db34 +--- /dev/null ++++ b/src/include/simeng/pipeline_hi/ExecuteUnit.hh +@@ -0,0 +1,147 @@ ++#pragma once ++ ++#include ++#include ++ ++#include "simeng/BranchPredictor.hh" ++#include "simeng/Instruction.hh" ++#include "simeng/pipeline_hi/PipelineBuffer.hh" ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++/** An execution unit pipeline entry, containing an instruction, and an ++ * indication of when it's reached the front of the execution pipeline. */ ++struct ExecutionUnitPipelineEntry { ++ /** The instruction queued for execution. */ ++ std::shared_ptr insn; ++ /** The tick number this instruction will reach the front of the queue at. */ ++ uint64_t readyAt; ++}; ++ ++/** An execute unit for a pipelined processor. Executes instructions and ++ * forwards results. */ ++class ExecuteUnit { ++ public: ++ /** Constructs an execute unit with references to an input and output buffer, ++ * the currently used branch predictor, and handlers for forwarding operands, ++ * loads/stores, and exceptions. */ ++ ExecuteUnit( ++ PipelineBuffer>& input, ++ PipelineBuffer>& output, ++ std::function, span)> forwardOperands, ++ std::function&)> handleLoad, ++ std::function&)> handleStore, ++ std::function&)> raiseException, ++ std::function&)> addInstrOrderQ, ++ std::function isInterruptPending, ++ BranchPredictor& predictor, bool pipelined = true, ++ const std::vector& blockingGroups = {}); ++ ++ /** Tick the execute unit. Places incoming instructions into the pipeline and ++ * executes an instruction that has reached the head of the pipeline, if ++ * present. */ ++ void tick(); ++ ++ /** Query whether a branch misprediction was discovered this cycle. 
*/ ++ bool shouldFlush() const; ++ ++ /** Retrieve the target instruction address associated with the most recently ++ * discovered misprediction. */ ++ uint64_t getFlushAddress() const; ++ ++ /** Retrieve the sequence ID associated with the most recently discovered ++ * misprediction. */ ++ uint64_t getFlushSeqId() const; ++ ++ /** Purge flushed instructions from the internal pipeline and clear any active ++ * stall, if applicable. */ ++ void purgeFlushed(); ++ ++ /** Retrieve the number of branch instructions that have been executed. */ ++ uint64_t getBranchExecutedCount() const; ++ ++ /** Retrieve the number of branch mispredictions. */ ++ uint64_t getBranchMispredictedCount() const; ++ ++ /** Retrieve the number of active execution cycles. */ ++ uint64_t getCycles() const; ++ ++ private: ++ /** Execute the supplied uop, write it into the output buffer, and forward ++ * results back to dispatch/issue. */ ++ void execute(std::shared_ptr& uop); ++ ++ /** A buffer of instructions to execute. */ ++ PipelineBuffer>& input_; ++ ++ /** A buffer for writing executed instructions into. */ ++ PipelineBuffer>& output_; ++ ++ /** A function handle called when forwarding operands. */ ++ std::function, span)> forwardOperands_; ++ ++ /** A function handle called after generating the addresses for a load. */ ++ std::function&)> handleLoad_; ++ /** A function handle called after acquiring the data for a store. */ ++ std::function&)> handleStore_; ++ ++ /** A function handle called upon exception generation. */ ++ std::function&)> raiseException_; ++ ++ /** A function to add the executed instruction into an ordering queue. */ ++ std::function&)> addInstrOrderQ_; ++ ++ /** Check if any interrupts are pending */ ++ std::function isInterruptPending_; ++ ++ /** A reference to the branch predictor, for updating with prediction results. ++ */ ++ BranchPredictor& predictor_; ++ ++ /** Whether this unit is pipelined, or if all instructions should stall until ++ * complete. */ ++ bool pipelined_; ++ ++ /** The execution unit's internal pipeline, holding instructions until their ++ * execution latency has expired and they are ready for their final results to ++ * be calculated and forwarded. */ ++ std::deque pipeline_; ++ ++ /** A group of operation types that are blocked whilst a similar operation ++ * is being executed. */ ++ std::vector blockingGroups_; ++ ++ /** A queue to hold blocked instructions of a similar group type to ++ * blockingGroup_. */ ++ std::deque> operationsStalled_; ++ ++ /** Whether the core should be flushed after this cycle. */ ++ bool shouldFlush_ = false; ++ ++ /** The target instruction address the PC should be reset to after this cycle. ++ */ ++ uint64_t pc_; ++ ++ /** The sequence ID of the youngest instruction that should remain after the ++ * current flush. */ ++ uint64_t flushAfter_; ++ ++ /** The number of times this unit has been ticked. */ ++ uint64_t tickCounter_ = 0; ++ ++ /** The cycle this unit will become unstalled. */ ++ uint64_t stallUntil_ = 0; ++ ++ /** The number of branch instructions that were executed. */ ++ uint64_t branchesExecuted_ = 0; ++ ++ /** The number of branch mispredictions that were observed. */ ++ uint64_t branchMispredicts_ = 0; ++ ++ /** The number of active execution cycles that were observed. 
*/ ++ uint64_t cycles_ = 0; ++}; ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/include/simeng/pipeline_hi/FetchUnit.hh b/src/include/simeng/pipeline_hi/FetchUnit.hh +new file mode 100644 +index 00000000..1c8f40c2 +--- /dev/null ++++ b/src/include/simeng/pipeline_hi/FetchUnit.hh +@@ -0,0 +1,127 @@ ++#pragma once ++ ++#include ++ ++#include "simeng/MemoryInterface.hh" ++#include "simeng/arch/Architecture.hh" ++#include "simeng/pipeline_hi/PipelineBuffer.hh" ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++/** The various states of the loop buffer. */ ++enum class LoopBufferState { ++ IDLE = 0, // No operations ++ WAITING, // Waiting to find boundary instruction in fetch stream ++ FILLING, // Filling loop buffer with loop body ++ SUPPLYING // Feeding loop buffer content to output buffer ++}; ++ ++// Struct to hold information about a fetched instruction ++struct loopBufferEntry { ++ // Encoding of the instruction ++ const uint64_t encoding; ++ ++ // Size of the instruction ++ const uint16_t instructionSize; ++ ++ // PC of the instruction ++ const uint64_t address; ++ ++ // Branch prediction made for instruction ++ const BranchPrediction prediction; ++}; ++ ++/** A fetch and pre-decode unit for a pipelined processor. Responsible for ++ * reading instruction memory and maintaining the program counter. */ ++class FetchUnit { ++ public: ++ /** Construct a fetch unit with a reference to an output buffer, the ISA, and ++ * the current branch predictor, and information on the instruction memory. */ ++ FetchUnit(PipelineBuffer& output, MemoryInterface& instructionMemory, ++ uint64_t programByteLength, uint64_t entryPoint, uint8_t blockSize, ++ const arch::Architecture& isa, BranchPredictor& branchPredictor); ++ ++ ~FetchUnit(); ++ ++ /** Tick the fetch unit. Retrieves and pre-decodes the instruction at the ++ * current program counter. */ ++ void tick(); ++ ++ /** Function handle to retrieve branch that represents loop boundary. */ ++ void registerLoopBoundary(uint64_t branchAddress); ++ ++ /** Check whether the program has ended. Returns `true` if the current PC is ++ * outside of instruction memory. */ ++ bool hasHalted() const; ++ ++ /** Update the program counter to the specified address. */ ++ void updatePC(uint64_t address); ++ ++ /** Request instructions at the current program counter for a future cycle. */ ++ void requestFromPC(); ++ ++ /** Retrieve the number of cycles fetch terminated early due to a predicted ++ * branch. */ ++ uint64_t getBranchStalls() const; ++ ++ /** Clear the loop buffer. */ ++ void flushLoopBuffer(); ++ ++ /** */ ++ void flushPredictor(uint64_t address); ++ ++ private: ++ /** An output buffer connecting this unit to the decode unit. */ ++ PipelineBuffer& output_; ++ ++ /** The current program counter. */ ++ uint64_t pc_ = 0; ++ ++ /** An interface to the instruction memory. */ ++ MemoryInterface& instructionMemory_; ++ ++ /** The length of the available instruction memory. */ ++ uint64_t programByteLength_; ++ ++ /** Reference to the currently used ISA. */ ++ const arch::Architecture& isa_; ++ ++ /** Reference to the current branch predictor. */ ++ BranchPredictor& branchPredictor_; ++ ++ /** A loop buffer to supply a detected loop instruction stream. */ ++ std::deque loopBuffer_; ++ ++ /** State of the loop buffer. */ ++ LoopBufferState loopBufferState_ = LoopBufferState::IDLE; ++ ++ /** The branch instruction that forms the loop. */ ++ uint64_t loopBoundaryAddress_ = 0; ++ ++ /** The current program halt state. 
Set to `true` when the PC leaves the ++ * instruction memory region, and set back to `false` if the PC is returned to ++ * the instruction region. */ ++ bool hasHalted_ = false; ++ ++ bool waitSCEval_ = false; ++ ++ /** The number of cycles fetch terminated early due to a predicted branch. */ ++ uint64_t branchStalls_ = 0; ++ ++ /** The size of a fetch block, in bytes. */ ++ uint8_t blockSize_; ++ ++ /** A mask of the bits of the program counter to use for obtaining the block ++ * address to fetch. */ ++ uint64_t blockMask_; ++ ++ /** The buffer used to hold fetched instruction data. */ ++ uint8_t* fetchBuffer_; ++ ++ /** The amount of data currently in the fetch buffer. */ ++ uint8_t bufferedBytes_ = 0; ++}; ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/include/simeng/pipeline_hi/LoadStoreQueue.hh b/src/include/simeng/pipeline_hi/LoadStoreQueue.hh +new file mode 100644 +index 00000000..211b1ef7 +--- /dev/null ++++ b/src/include/simeng/pipeline_hi/LoadStoreQueue.hh +@@ -0,0 +1,235 @@ ++#pragma once ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "simeng/Instruction.hh" ++#include "simeng/MemoryInterface.hh" ++#include "simeng/pipeline_hi/PipelineBuffer.hh" ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++/** The memory access types which are processed. */ ++enum accessType { LOAD = 0, STORE }; ++ ++/** A requestQueue_ entry. */ ++struct requestEntry { ++ /** The memory address(es) to be accessed. */ ++ std::queue reqAddresses; ++ /** The instruction sending the request(s). */ ++ std::shared_ptr insn; ++}; ++/** A requestQueue_ entry. */ ++struct requestEntry1 { ++ /** The memory address(es) to be accessed. */ ++ std::queue reqAddresses; ++ /** The memory address(es) to be accessed. */ ++ std::queue data; ++ /** The instruction sending the request(s). */ ++ std::shared_ptr insn; ++ accessType type; ++ uint64_t reqtick; ++ bool isMisAligned; ++}; ++/** A load store queue (known as "load/store buffers" or "memory order buffer"). ++ * Holds in-flight memory access requests to ensure load/store consistency. */ ++class LoadStoreQueue { ++ public: ++ /** Constructs a combined load/store queue model, simulating a shared queue ++ * for both load and store instructions, supplying completion slots for loads ++ * and an operand forwarding handler. */ ++ LoadStoreQueue( ++ unsigned int maxCombinedSpace, MemoryInterface& memory, ++ span>> completionSlots, ++ std::function, span)> forwardOperands, ++ bool exclusive = false, uint16_t loadBandwidth = UINT16_MAX, ++ uint16_t storeBandwidth = UINT16_MAX, ++ uint16_t permittedRequests = UINT16_MAX, ++ uint16_t permittedLoads = UINT16_MAX, ++ uint16_t permittedStores = UINT16_MAX); ++ ++ /** Constructs a split load/store queue model, simulating discrete queues for ++ * load and store instructions, supplying completion slots for loads and an ++ * operand forwarding handler. */ ++ LoadStoreQueue( ++ unsigned int maxLoadQueueSpace, unsigned int maxStoreQueueSpace, ++ MemoryInterface& memory, ++ span>> completionSlots, ++ std::function, span)> forwardOperands, ++ bool exclusive = false, uint16_t loadBandwidth = UINT16_MAX, ++ uint16_t storeBandwidth = UINT16_MAX, ++ uint16_t permittedRequests = UINT16_MAX, ++ uint16_t permittedLoads = UINT16_MAX, ++ uint16_t permittedStores = UINT16_MAX); ++ ++ /** Retrieve the available space for load uops. For combined queue this is the ++ * total remaining space. */ ++ unsigned int getLoadQueueSpace() const; ++ ++ /** Retrieve the available space for store uops. 
For a combined queue this is ++ * the total remaining space. */ ++ unsigned int getStoreQueueSpace() const; ++ ++ /** Retrieve the available space for any memory uops. For a split queue this ++ * is the sum of the space in both queues. */ ++ unsigned int getTotalSpace() const; ++ ++ /** Add a load uop to the queue. */ ++ void addLoad(const std::shared_ptr& insn); ++ ++ /** Add a store uop to the queue. */ ++ void addStore(const std::shared_ptr& insn); ++ ++ /** Add the load instruction's memory requests to the requestQueue_. */ ++ void startLoad(const std::shared_ptr& insn); ++ ++ /** Supply the data to be stored by a store operation. */ ++ void supplyStoreData(const std::shared_ptr& insn); ++ ++ /** Commit and write the oldest store instruction to memory, removing it from ++ * the store queue. Returns `true` if memory disambiguation has discovered a ++ * memory order violation during the commit. */ ++ bool commitStore(const std::shared_ptr& uop); ++ ++ /** Remove the oldest load instruction from the load queue. */ ++ void commitLoad(const std::shared_ptr& uop); ++ ++ /** Remove all flushed instructions from the queues. */ ++ void purgeFlushed(); ++ ++ /** Whether this is a combined load/store queue. */ ++ bool isCombined() const; ++ ++ /** Process received load data and send any completed loads for writeback. */ ++ void tick(); ++ ++ /** Retrieve the load instruction associated with the most recently discovered ++ * memory order violation. */ ++ std::shared_ptr getViolatingLoad() const; ++ ++ void processResponse(); ++ ++ bool activeMisAlignedOpr() const; ++ ++ bool isBusy() const; ++ ++ float getAvgLdLat() const { return (totalLdLatency)/numLoads; }; ++ ++ uint32_t getMaxLdLat() const { return maxLdLatency; }; ++ uint32_t getMinLdLat() const { return minLdLatency; }; ++ ++ private: ++ /** The load queue: holds in-flight load instructions. */ ++ std::deque> loadQueue_; ++ ++ /** The store queue: holds in-flight store instructions with its associated ++ * data. */ ++ std::deque, ++ span>> ++ storeQueue_; ++ ++ /** Slots to write completed load instructions into for writeback. */ ++ span>> completionSlots_; ++ ++ /** Map of loads that have requested their data, keyed by sequence ID. */ ++ std::unordered_map> requestedLoads_; ++ ++ /** Map of loads that have requested their data, keyed by sequence ID. */ ++ std::unordered_map latencyLoads_; ++ ++ /** A function handler to call to forward the results of a completed load. */ ++ std::function, span)> forwardOperands_; ++ ++ /** The maximum number of loads that can be in-flight. Undefined if this ++ * is a combined queue. */ ++ unsigned int maxLoadQueueSpace_; ++ ++ /** The maximum number of stores that can be in-flight. Undefined if this is a ++ * combined queue. */ ++ unsigned int maxStoreQueueSpace_; ++ ++ /** The maximum number of memory ops that can be in-flight. Undefined if this ++ * is a split queue. */ ++ unsigned int maxCombinedSpace_; ++ ++ /** Whether this queue is combined or split. */ ++ bool combined_; ++ ++ /** Retrieve the load queue space for a split queue. */ ++ unsigned int getLoadQueueSplitSpace() const; ++ ++ /** Retrieve the store queue space for a split queue. */ ++ unsigned int getStoreQueueSplitSpace() const; ++ ++ /** Retrieve the total memory uop space available for a combined queue. */ ++ unsigned int getCombinedSpace() const; ++ ++ /** A pointer to process memory. */ ++ MemoryInterface& memory_; ++ ++ /** The load instruction associated with the most recently discovered memory ++ * order violation. 
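++   * Illustrative sketch of the assumed recovery path (not taken from this
++   * patch): when commitStore() reports a violation, the core is expected to
++   * resume from the offending load, e.g.
++   *
++   *   if (lsq.commitStore(storeUop)) {
++   *     auto load = lsq.getViolatingLoad();
++   *     fetchUnit.updatePC(load->getInstructionAddress());
++   *   }
++   *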
*/ ++ std::shared_ptr violatingLoad_ = nullptr; ++ ++ /** The number of times this unit has been ticked. */ ++ uint64_t tickCounter_ = 0; ++ ++ /** A map to hold load instructions that are stalled due to a detected ++ * memory reordering confliction. First key is a store's sequence id and the ++ * second key the conflicting address. The value takes the form of a vector of ++ * pairs containing a pointer to the conflicted load and the size of the data ++ * needed at that address by the load. */ ++ std::unordered_map< ++ uint64_t, ++ std::unordered_map< ++ uint64_t, ++ std::vector, uint16_t>>>> ++ conflictionMap_; ++ ++ /** A map between LSQ cycles and load requests ready on that cycle. */ ++ std::map> requestLoadQueue_; ++ ++ /** A map between LSQ cycles and store requests ready on that cycle. */ ++ std::map> requestStoreQueue_; ++ ++ /** A queue of completed loads ready for writeback. */ ++ std::queue> completedLoads_; ++ ++ /** Whether the LSQ can only process loads xor stores within a cycle. */ ++ bool exclusive_; ++ ++ /** The amount of data readable from the L1D cache per cycle. */ ++ uint16_t loadBandwidth_; ++ ++ /** The amount of data writable to the L1D cache per cycle. */ ++ uint16_t storeBandwidth_; ++ ++ /** The combined limit of loads and store requests permitted per cycle. */ ++ uint16_t totalLimit_; ++ ++ /** The number of loads and stores permitted per cycle. */ ++ std::array reqLimits_; ++ ++ /** A map between LSQ cycles and load or store requests ready on that cycle. */ ++ std::deque requestQueue_; ++ ++ /* Identifier for request to memory*/ ++ uint8_t busReqId = 0; ++ ++ //bool activeMisAlignedStore = false; ++ ++ //Stats ++ uint64_t numLoads = 0; ++ double totalLdLatency = 0; ++ uint32_t maxLdLatency = 0; ++ uint32_t minLdLatency = 0xFFFF; ++ float averageAccessLdLatency = 0.0; ++}; ++ ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/include/simeng/pipeline_hi/PipelineBuffer.hh b/src/include/simeng/pipeline_hi/PipelineBuffer.hh +new file mode 100644 +index 00000000..dd2ed70c +--- /dev/null ++++ b/src/include/simeng/pipeline_hi/PipelineBuffer.hh +@@ -0,0 +1,107 @@ ++#pragma once ++ ++#include ++#include ++#include ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++/** A tickable pipelined buffer. Values are shifted from the tail slot to the ++ * head slot each time `tick()` is called. */ ++template ++class PipelineBuffer { ++ public: ++ /** Construct a pipeline buffer of width `width`, and fill all slots with ++ * `initialValue`. */ ++ PipelineBuffer(int width, const T& initialValue) ++ : width(width), buffer(width * defaultLength_, initialValue), ++ length_(defaultLength_), headIndex_(defaultLength_-1), ++ tailIndex_(0) {} ++ ++ PipelineBuffer(int width, const T& initialValue, int length) ++ : width(width), buffer(width * length, initialValue), length_(length), ++ headIndex_(length_-1), tailIndex_(0) { ++ assert(length_ != 0 && "Pipeline buffer length cannot be 0"); ++ } ++ ++ /** Tick the buffer and move head/tail pointers, or do nothing if it's ++ * stalled. */ ++ void tick() { ++ if (isStalled_) return; ++ ++ //length ==1 shortcut? condition check cost ++ ++ if (headIndex_) { // when headIndex != 0 ++ headIndex_--; ++ } else { ++ headIndex_ = length_ - 1; ++ } ++ if (tailIndex_) { // when tailIndex != 0 ++ tailIndex_--; ++ } else { ++ tailIndex_ = length_ - 1; ++ } ++ } ++ ++ /** Get a tail slots pointer. */ ++ T* getTailSlots() { ++ T* ptr = buffer.data(); ++ return &ptr[tailIndex_ * width]; ++ } ++ ++ /** Get a const tail slots pointer. 
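++   * Editorial example (a sketch based on the head/tail index arithmetic in
++   * tick(), not taken from this patch): a buffer of length N keeps the head
++   * N-1 rows behind the tail, so a value written at the tail is visible at the
++   * head after N-1 ticks; a length of 1 makes head and tail alias the same row
++   * (zero-cycle delay).
++   *
++   *   PipelineBuffer<int> buf(1, 0, 3);  // width 1, initial value 0, length 3
++   *   buf.getTailSlots()[0] = 42;
++   *   buf.tick();
++   *   buf.tick();                        // length - 1 = 2 ticks
++   *   int head = buf.getHeadSlots()[0];  // now reads 42
++   *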
*/ ++ const T* getTailSlots() const { ++ const T* ptr = buffer.data(); ++ return &ptr[tailIndex_ * width]; ++ } ++ ++ /** Get a head slots pointer. */ ++ T* getHeadSlots() { ++ T* ptr = buffer.data(); ++ return &ptr[headIndex_ * width]; ++ } ++ ++ /** Get a const head slots pointer. */ ++ const T* getHeadSlots() const { ++ const T* ptr = buffer.data(); ++ return &ptr[headIndex_ * width]; ++ } ++ ++ /** Check if the buffer is stalled. */ ++ bool isStalled() const { return isStalled_; } ++ ++ /** Set the buffer's stall flag to `stalled`. */ ++ void stall(bool stalled) { isStalled_ = stalled; } ++ ++ /** Fill the buffer with a specified value. */ ++ void fill(const T& value) { std::fill(buffer.begin(), buffer.end(), value); } ++ ++ /** Get the width of the buffer slots. */ ++ unsigned short getWidth() const { return width; } ++ ++ private: ++ /** The width of each row of slots. */ ++ unsigned short width; ++ ++ /** The buffer. */ ++ std::vector buffer; ++ ++ /** Whether the buffer is stalled or not. */ ++ bool isStalled_ = false; ++ ++ /** Buffer length */ ++ const unsigned int length_; ++ ++ /** */ ++ unsigned int headIndex_; ++ ++ /** */ ++ unsigned int tailIndex_; ++ ++ /** The number of stages in the pipeline. */ ++ static const unsigned int defaultLength_ = 2; ++}; ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/include/simeng/pipeline_hi/PipelineBuffer1.hh b/src/include/simeng/pipeline_hi/PipelineBuffer1.hh +new file mode 100644 +index 00000000..dfb465a3 +--- /dev/null ++++ b/src/include/simeng/pipeline_hi/PipelineBuffer1.hh +@@ -0,0 +1,133 @@ ++#pragma once ++ ++#include ++#include ++#include ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++// TODO: Extend to allow specifying the number of cycles it will take for ++// information to move from tail to head (currently fixed at 1 by ++// implementation) ++ ++/** A tickable pipelined buffer. Values are shifted from the tail slot to the ++ * head slot each time `tick()` is called. */ ++template ++class PipelineBuffer { ++ public: ++ /** Construct a pipeline buffer of width `width`, and fill all slots with ++ * `initialValue`. */ ++ PipelineBuffer(int width, const T& initialValue) ++ : width(width), buffer(width * defaultLength_, initialValue), ++ length_(defaultLength_) {} ++ ++ //TODO:currently length > 2 is not working, oscillate between 0 and 1 ++ PipelineBuffer(int width, const T& initialValue, int length) ++ : width(width), buffer(width * length, initialValue), length_(length), ++ useDefaultLength_(false) { ++ assert(length_ != 0 && "Pipeline buffer length cannot be 0"); ++ } ++ ++ /** Tick the buffer and move head/tail pointers, or do nothing if it's ++ * stalled. */ ++ void tick() { ++ if (useDefaultLength_) { ++ if (isStalled_) return; ++ ++ headIsStart = !headIsStart; ++ } else { ++ if (length_ == 1) { ++ return; ++ } else if (length_ > 2) { ++ //TODO ++ } ++ } ++ } ++ ++ /** Get a tail slots pointer. */ ++ T* getTailSlots() { ++ T* ptr = buffer.data(); ++ if (useDefaultLength_) { ++ return &ptr[headIsStart * width]; ++ } else { ++ if (length_ == 1) { ++ return &ptr[0]; ++ } ++ } ++ } ++ ++ /** Get a const tail slots pointer. */ ++ const T* getTailSlots() const { ++ const T* ptr = buffer.data(); ++ if (useDefaultLength_) { ++ return &ptr[headIsStart * width]; ++ } else { ++ if (length_ == 1) { ++ return &ptr[0]; ++ } ++ } ++ } ++ ++ ++ /** Get a head slots pointer. 
*/ ++ T* getHeadSlots() { ++ T* ptr = buffer.data(); ++ if (useDefaultLength_) { ++ return &ptr[!headIsStart * width]; ++ } else { ++ if (length_ == 1) { ++ return &ptr[0]; ++ } ++ } ++ } ++ ++ /** Get a const head slots pointer. */ ++ const T* getHeadSlots() const { ++ const T* ptr = buffer.data(); ++ if (useDefaultLength_) { ++ return &ptr[!headIsStart * width]; ++ } else { ++ if (length_ == 1) { ++ return &ptr[0]; ++ } ++ } ++ } ++ ++ /** Check if the buffer is stalled. */ ++ bool isStalled() const { return isStalled_; } ++ ++ /** Set the buffer's stall flag to `stalled`. */ ++ void stall(bool stalled) { isStalled_ = stalled; } ++ ++ /** Fill the buffer with a specified value. */ ++ void fill(const T& value) { std::fill(buffer.begin(), buffer.end(), value); } ++ ++ /** Get the width of the buffer slots. */ ++ unsigned short getWidth() const { return width; } ++ ++ private: ++ /** The width of each row of slots. */ ++ unsigned short width; ++ ++ /** The buffer. */ ++ std::vector buffer; ++ ++ /** The offset of the head pointer; either 0 or 1. */ ++ bool headIsStart = 0; ++ ++ /** Whether the buffer is stalled or not. */ ++ bool isStalled_ = false; ++ ++ /** Buffer length */ ++ const unsigned int length_; ++ ++ /** True if using default length (== 2) */ ++ bool useDefaultLength_ = true; ++ ++ /** The number of stages in the pipeline. */ ++ static const unsigned int defaultLength_ = 2; ++}; ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/include/simeng/pipeline_hi/PortAllocator.hh b/src/include/simeng/pipeline_hi/PortAllocator.hh +new file mode 100644 +index 00000000..bc985c0a +--- /dev/null ++++ b/src/include/simeng/pipeline_hi/PortAllocator.hh +@@ -0,0 +1,43 @@ ++#pragma once ++ ++#include ++#include ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++namespace PortType { ++/** Instructions have to match the exact group(s) in set. */ ++const uint8_t COMPULSORY = 0; ++/** Instructions can optional match group(s) in set. */ ++const uint8_t OPTIONAL = 1; ++} // namespace PortType ++ ++/** An abstract execution port allocator interface. */ ++class PortAllocator { ++ public: ++ virtual ~PortAllocator(){}; ++ ++ /** Allocate a port for the specified instruction group; returns the allocated ++ * port. */ ++ virtual uint16_t allocate(const std::vector& ports) = 0; ++ ++ /** Inform the allocator that an instruction was issued to the specified port. ++ */ ++ virtual void issued(uint16_t port) = 0; ++ ++ /** Inform the allocator that an instruction will not issue to its ++ * allocated port. */ ++ virtual void deallocate(uint16_t port) = 0; ++ ++ /** Set function from DispatchIssueUnit to retrieve reservation ++ * station sizes during execution. */ ++ virtual void setRSSizeGetter( ++ std::function&)> rsSizes) = 0; ++ ++ /** Tick the port allocator to allow it to process internal tasks. 
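++   * Illustrative call sequence (assumed from this interface, not taken from
++   * the patch; `supportedPorts` is a hypothetical list of the issue ports the
++   * instruction may use):
++   *
++   *   uint16_t port = allocator.allocate(supportedPorts);
++   *   // ... when the instruction issues:
++   *   allocator.issued(port);
++   *   // ... or, if it is flushed before issuing:
++   *   allocator.deallocate(port);
++   *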
*/
++  virtual void tick() = 0;
++};
++
++}  // namespace pipeline_hi
++}  // namespace simeng
+diff --git a/src/include/simeng/pipeline_hi/RegDepMap.hh b/src/include/simeng/pipeline_hi/RegDepMap.hh
+new file mode 100644
+index 00000000..7145fd19
+--- /dev/null
++++ b/src/include/simeng/pipeline_hi/RegDepMap.hh
+@@ -0,0 +1,57 @@
++#pragma once
++
++#include 
++#include 
++#include 
++#include 
++
++#include "simeng/Instruction.hh"
++
++namespace simeng {
++namespace pipeline_hi {
++
++typedef std::shared_ptr InstrPtr;
++class RegDepMap
++{
++ public:
++  RegDepMap(const std::vector registerFileStructures,
++            const RegisterFileSet& registerFileSet);
++  ~RegDepMap();
++
++  /** Clear the entire map. */
++  void clear();
++
++  /** Insert all of an instruction's destination registers into the map. */
++  void insert(InstrPtr instr);
++
++  /** Remove all of an instruction's destination registers from the map. */
++  void remove(InstrPtr instr);
++
++  /** Is the current instruction able to read from this
++   * destination register?
++   */
++  bool canRead(InstrPtr instr);
++
++  /** Is the current instruction able to write to this
++   * destination register?
++   */
++  bool canWrite(InstrPtr instr);
++
++  /** Check whether any in-flight instruction can forward the data this
++   * instruction is waiting on; if so, supply the data. */
++  bool canForward(InstrPtr instr);
++
++  void purgeFlushed();
++
++  void dump();
++
++ private:
++  const std::vector registerFileStructures_;
++  const RegisterFileSet& registerFileSet_;
++  typedef std::vector > DepMap;
++  std::vector regMap_;
++  uint32_t outstandingDep_ = 0;
++};
++
++}  // namespace pipeline_hi
++}  // namespace simeng
+diff --git a/src/include/simeng/pipeline_hi/RegisterAliasTable.hh b/src/include/simeng/pipeline_hi/RegisterAliasTable.hh
+new file mode 100644
+index 00000000..1b2327fc
+--- /dev/null
++++ b/src/include/simeng/pipeline_hi/RegisterAliasTable.hh
+@@ -0,0 +1,69 @@
++#pragma once
++
++#include 
++
++#include "simeng/RegisterFileSet.hh"
++
++namespace simeng {
++namespace pipeline_hi {
++
++/** A Register Alias Table (RAT) implementation. Contains information on
++ * the current register renaming state. */
++class RegisterAliasTable {
++ public:
++  /** Construct a RAT, supplying a description of the architectural register
++   * structure, and the corresponding numbers of physical registers that should
++   * be available. */
++  RegisterAliasTable(std::vector architecturalStructure,
++                     std::vector physicalStructure);
++
++  /** Retrieve the current physical register assigned to the provided
++   * architectural register. */
++  Register getMapping(Register architectural) const;
++
++  /** Determine whether it's possible to allocate `quantity` physical registers
++   * of type `type` this cycle. */
++  bool canAllocate(uint8_t type, unsigned int quantity) const;
++
++  /** Check whether registers of type `type` can be renamed by this RAT. */
++  bool canRename(uint8_t type) const;
++
++  /** Allocate a physical register for the provided architectural register. */
++  Register allocate(Register architectural);
++
++  /** Get the number of free registers available for allocation this cycle. */
++  unsigned int freeRegistersAvailable(uint8_t type) const;
++
++  /** Commit the provided physical register. This register now holds the
++   * committed state of the corresponding architectural register, and the
++   * previous physical register is freed. */
++  void commit(Register physical);
++
++  /** Rewind the allocation of a physical register.
The former physical register ++ * is reinstated to the mapping table, and the provided register is freed. */ ++ void rewind(Register physical); ++ ++ /** Free the provided physical register. */ ++ void free(Register physical); ++ ++ private: ++ /** The register mapping tables. Holds a map of architectural -> physical ++ * register mappings for each register type. */ ++ std::vector> mappingTable_; ++ ++ /** The register history tables. Each table holds an entry for each physical ++ * register, recording the physical register formerly assigned to its ++ * architectural register; one table is available per register type. */ ++ std::vector> historyTable_; ++ ++ /** The register destination tables. Holds a map of physical -> architectural ++ * register mappings for each register type. Used for rewind behaviour. */ ++ std::vector> destinationTable_; ++ ++ /** The free register queues. Holds a list of unallocated physical registers ++ * for each register type. */ ++ std::vector> freeQueues_; ++}; ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/include/simeng/pipeline_hi/ReorderBuffer.hh b/src/include/simeng/pipeline_hi/ReorderBuffer.hh +new file mode 100644 +index 00000000..1e5fd840 +--- /dev/null ++++ b/src/include/simeng/pipeline_hi/ReorderBuffer.hh +@@ -0,0 +1,136 @@ ++#pragma once ++ ++#include ++#include ++ ++#include "simeng/Instruction.hh" ++#include "simeng/pipeline_hi/LoadStoreQueue.hh" ++#include "simeng/pipeline_hi/RegisterAliasTable.hh" ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++/** A branch prediction outcome with an associated instruction address. */ ++struct latestBranch { ++ /** Branch instruction address. */ ++ uint64_t address; ++ ++ /** Outcome of the branch. */ ++ BranchPrediction outcome; ++ ++ /** The related instructionsCommitted_ value that this instruction was ++ * committed on. */ ++ uint64_t commitNumber; ++}; ++ ++/** A Reorder Buffer (ROB) implementation. Contains an in-order queue of ++ * in-flight instructions. */ ++class ReorderBuffer { ++ public: ++ /** Constructs a reorder buffer of maximum size `maxSize`, supplying a ++ * reference to the register alias table. */ ++ ReorderBuffer( ++ unsigned int maxSize, RegisterAliasTable& rat, LoadStoreQueue& lsq, ++ std::function&)> raiseException, ++ std::function sendLoopBoundary, ++ BranchPredictor& predictor, uint16_t loopBufSize, ++ uint16_t loopDetectionThreshold); ++ ++ /** Add the provided instruction to the ROB. */ ++ void reserve(const std::shared_ptr& insn); ++ ++ void commitMicroOps(uint64_t insnId); ++ ++ /** Commit and remove up to `maxCommitSize` instructions. */ ++ unsigned int commit(unsigned int maxCommitSize); ++ ++ /** Flush all instructions with a sequence ID greater than `afterSeqId`. */ ++ void flush(uint64_t afterSeqId); ++ ++ /** Retrieve the current size of the ROB. */ ++ unsigned int size() const; ++ ++ /** Retrieve the current amount of free space in the ROB. */ ++ unsigned int getFreeSpace() const; ++ ++ /** Query whether a memory order violation was discovered in the most recent ++ * cycle. */ ++ bool shouldFlush() const; ++ ++ /** Retrieve the instruction address associated with the most recently ++ * discovered memory order violation. */ ++ uint64_t getFlushAddress() const; ++ ++ /** Retrieve the sequence ID associated with the most recently discovered ++ * memory order violation. */ ++ uint64_t getFlushSeqId() const; ++ ++ /** Get the number of instructions the ROB has committed. 
*/ ++ uint64_t getInstructionsCommittedCount() const; ++ ++ /** Get the number of speculated loads which violated load-store ordering. */ ++ uint64_t getViolatingLoadsCount() const; ++ ++ private: ++ /** A reference to the register alias table. */ ++ RegisterAliasTable& rat_; ++ ++ /** A reference to the load/store queue. */ ++ LoadStoreQueue& lsq_; ++ ++ /** The maximum size of the ROB. */ ++ unsigned int maxSize_; ++ ++ /** A function to call upon exception generation. */ ++ std::function)> raiseException_; ++ ++ /** A function to send an instruction at a detected loop boundary. */ ++ std::function sendLoopBoundary_; ++ ++ /** Whether or not a loop has been detected. */ ++ bool loopDetected_ = false; ++ ++ /** A reference to the current branch predictor. */ ++ BranchPredictor& predictor_; ++ ++ /** The buffer containing in-flight instructions. */ ++ std::deque> buffer_; ++ ++ /** Whether the core should be flushed after the most recent commit. */ ++ bool shouldFlush_ = false; ++ ++ /** The target instruction address the PC should be reset to after the most ++ * recent commit. ++ */ ++ uint64_t pc_; ++ ++ /** The sequence ID of the youngest instruction that should remain after the ++ * current flush. */ ++ uint64_t flushAfter_; ++ ++ /** Latest retired branch outcome with a counter. */ ++ std::pair branchCounter_ = {{0, {false, 0}, 0}, 0}; ++ ++ /** Loop buffer size. */ ++ uint16_t loopBufSize_; ++ ++ /** Amount of times a branch must be seen without interruption for it to be ++ * considered a loop. */ ++ uint16_t loopDetectionThreshold_; ++ ++ /** The next available sequence ID. */ ++ uint64_t seqId_ = 0; ++ ++ /** The next available instruction ID. Used to identify in-order groups of ++ * micro-operations. */ ++ uint64_t insnId_ = 0; ++ ++ /** The number of instructions committed. */ ++ uint64_t instructionsCommitted_ = 0; ++ ++ /** The number of speculatived loads which violated load-store ordering. */ ++ uint64_t loadViolations_ = 0; ++}; ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/include/simeng/pipeline_hi/StaticPredictor.hh b/src/include/simeng/pipeline_hi/StaticPredictor.hh +new file mode 100644 +index 00000000..d8923dc2 +--- /dev/null ++++ b/src/include/simeng/pipeline_hi/StaticPredictor.hh +@@ -0,0 +1,53 @@ ++#pragma once ++ ++#include ++ ++#include "simeng/BranchPredictor.hh" ++#include "yaml-cpp/yaml.h" ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++/** A static branch predictor; configurable in YAML config ++ */ ++class StaticPredictor : public BranchPredictor { ++ public: ++ StaticPredictor(uint8_t sType); //TODO: temp constructor, get rid of yaml, delete it later ++ StaticPredictor(YAML::Node config); ++ ~StaticPredictor(); ++ ++ BranchPrediction predict(uint64_t address, BranchType type, ++ uint64_t knownTarget, uint8_t byteLength) override; ++ ++ /** Generate a branch prediction for the specified instruction address; will ++ * behave based on the configuration */ ++ BranchPrediction predict(uint64_t address, BranchType type, ++ uint64_t knownTarget) override; ++ ++ /** Provide branch results to update the prediction model for the specified ++ * instruction address. As this model is static, this does nothing. */ ++ void update(uint64_t address, bool taken, uint64_t targetAddress, ++ BranchType type) override; ++ ++ /** Provide flush logic for branch prediction scheme. 
The behaviour will ++ * be based on the configuration */ ++ void flush(uint64_t address) override; ++ ++ private: ++ /** Decide which static predictor will be in use */ ++ uint8_t staticType_; ++ ++ /** A return address stack. */ ++ std::deque ras_; ++ ++ /** RAS history with instruction address as the keys. A non-zero value ++ * represents the target prediction for a return instruction and a 0 entry for ++ * a branch-and-link instruction. */ ++ std::map rasHistory_; ++ ++ /** The size of the RAS. */ ++ uint64_t rasSize_ = 1000; ++}; ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/include/simeng/pipeline_hi/WritebackUnit.hh b/src/include/simeng/pipeline_hi/WritebackUnit.hh +new file mode 100644 +index 00000000..0816d3b5 +--- /dev/null ++++ b/src/include/simeng/pipeline_hi/WritebackUnit.hh +@@ -0,0 +1,62 @@ ++#pragma once ++ ++#include ++ ++#include "simeng/Instruction.hh" ++#include "simeng/pipeline_hi/PipelineBuffer.hh" ++#include ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++/** A writeback pipeline unit. Responsible for writing instruction results to ++ * the register files. */ ++class WritebackUnit { ++ public: ++ /** Constructs a writeback unit with references to an input buffer and ++ * register file to write to. */ ++ WritebackUnit(std::vector>>& ++ completionSlots, ++ RegisterFileSet& registerFileSet, ++ std::function flagMicroOpCommits, ++ std::function&)> removeDep, ++ std::function&)> removeInstrOrderQ); ++ ++ /** Tick the writeback unit to perform its operation for this cycle. */ ++ void tick(); ++ ++ /** Retrieve a count of the number of instructions retired. */ ++ uint64_t getInstructionsWrittenCount() const; ++ ++ /** Retrieve instruction(s) to be printed out to the trace */ ++ std::vector> getInstsForTrace(); ++ ++ /** Clear the container for tracing */ ++ void traceFinished(); //Might be safer to update trace within WritebackUnit ++ ++ private: ++ /** Buffers of completed instructions to process. */ ++ std::vector>>& completionSlots_; ++ ++ /** The register file set to write results into. */ ++ RegisterFileSet& registerFileSet_; ++ ++ /** A function handle called to determine if uops associated to an instruction ++ * ID can now be committed. */ ++ std::function flagMicroOpCommits_; ++ ++ /** A function to remove the commited instruction from dependency queue. */ ++ std::function&)> removeDep_; ++ ++ /** A function to remove the commited instruction from ordering queue. */ ++ std::function&)> removeInstrOrderQ_; ++ ++ /** The number of instructions processed and retired by this stage. 
*/ ++ uint64_t instructionsWritten_ = 0; ++ ++ /** Instruction(s) to be printed out to the trace */ ++ std::deque> committedInstsForTrace_; ++}; ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt +index 1fbf2865..97de63eb 100644 +--- a/src/lib/CMakeLists.txt ++++ b/src/lib/CMakeLists.txt +@@ -9,6 +9,7 @@ set(SIMENG_SOURCES + arch/aarch64/MicroDecoder.cc + arch/riscv/Architecture.cc + arch/riscv/ExceptionHandler.cc ++ arch/riscv/SystemRegister.cc + arch/riscv/Instruction.cc + arch/riscv/Instruction_address.cc + arch/riscv/Instruction_decode.cc +@@ -18,6 +19,7 @@ set(SIMENG_SOURCES + kernel/LinuxProcess.cc + models/emulation/Core.cc + models/inorder/Core.cc ++ models/mcu/Core.cc + models/outoforder/Core.cc + pipeline/A64FXPortAllocator.cc + pipeline/BalancedPortAllocator.cc +@@ -32,6 +34,16 @@ set(SIMENG_SOURCES + pipeline/RenameUnit.cc + pipeline/ReorderBuffer.cc + pipeline/WritebackUnit.cc ++ pipeline_hi/DecodeUnit.cc ++ pipeline_hi/DispatchIssueUnit.cc ++ pipeline_hi/ExecuteUnit.cc ++ pipeline_hi/FetchUnit.cc ++ pipeline_hi/LoadStoreQueue.cc ++ pipeline_hi/RegDepMap.cc ++ pipeline_hi/RegisterAliasTable.cc ++ pipeline_hi/ReorderBuffer.cc ++ pipeline_hi/StaticPredictor.cc ++ pipeline_hi/WritebackUnit.cc + AlwaysNotTakenPredictor.cc + ArchitecturalRegisterFileSet.cc + CMakeLists.txt +diff --git a/src/lib/CoreInstance.cc b/src/lib/CoreInstance.cc +index 8ba06c8e..e8f91d34 100644 +--- a/src/lib/CoreInstance.cc ++++ b/src/lib/CoreInstance.cc +@@ -90,6 +90,10 @@ void CoreInstance::setSimulationMode() { + "outoforder") { + mode_ = SimulationMode::OutOfOrder; + modeString_ = "Out-of-Order"; ++ } else if (config_["Core"]["Simulation-Mode"].as() == ++ "mcu") { ++ mode_ = SimulationMode::MCU; ++ modeString_ = "MCU"; + } + + return; +@@ -236,7 +240,7 @@ void CoreInstance::createCore() { + if (config_["Core"]["ISA"].as() == "rv64" || + config_["Core"]["ISA"].as() == "rv32") { + arch_ = +- std::make_unique(kernel_, config_); ++ std::make_unique(kernel_, config_,dataMemory_); + } else if (config_["Core"]["ISA"].as() == "AArch64") { + arch_ = + std::make_unique(kernel_, config_); +@@ -244,6 +248,9 @@ void CoreInstance::createCore() { + + // Construct branch predictor object + predictor_ = std::make_unique(config_); ++ if (mode_ == SimulationMode::MCU) { ++ predictor_ = std::make_unique(2); //config_ ++ } + + // Extract port arrangement from config file + auto config_ports = config_["Ports"]; +@@ -268,6 +275,10 @@ void CoreInstance::createCore() { + core_ = std::make_shared( + *instructionMemory_, *dataMemory_, processMemorySize_, entryPoint, + *arch_, *predictor_); ++ } else if (mode_ == SimulationMode::MCU) { ++ core_ = std::make_shared( ++ *instructionMemory_, *dataMemory_, processMemorySize_, entryPoint, ++ *arch_, *predictor_, config_); + } else if (mode_ == SimulationMode::OutOfOrder) { + core_ = std::make_shared( + *instructionMemory_, *dataMemory_, processMemorySize_, entryPoint, +diff --git a/src/lib/Elf.cc b/src/lib/Elf.cc +index 62815984..901f370e 100644 +--- a/src/lib/Elf.cc ++++ b/src/lib/Elf.cc +@@ -2,6 +2,7 @@ + + #include + #include ++#include + + namespace simeng { + +@@ -13,7 +14,8 @@ namespace simeng { + * https://man7.org/linux/man-pages/man5/elf.5.html + */ + +-Elf::Elf(std::string path, char** imagePointer) { ++Elf::Elf(std::string path, char** imagePointer, std::unordered_map& symbols) ++{ + std::ifstream file(path, std::ios::binary); + + if (!file.is_open()) { +@@ -174,120 +176,69 @@ Elf::Elf(std::string path, char** 
imagePointer) { + } + } + } else { +- /** +- * Starting from the 24th byte of the ELF header a 32-bit value +- * represents the virtual address to which the system first transfers +- * control, thus starting the process. +- * In `elf32_hdr` this value maps to the member `Elf32_Addr e_entry`. +- */ ++ file.seekg(0); + +- // Seek to the entry point of the file. +- // The information in between is discarded +- file.seekg(0x18); +- file.read(reinterpret_cast(&entryPoint32_), sizeof(entryPoint32_)); ++ Elf32_Ehdr eheader; ++ file.read(reinterpret_cast(&eheader), sizeof(eheader)); + +- /** +- * Starting from the 32nd byte of the ELF Header a 64-bit value +- * represents the offset of the ELF Program header or +- * Program header table in the ELF file. +- * In `elf32_hdr` this value maps to the member `Elf32_Addr e_phoff`. +- */ +- +- // Seek to the byte representing the start of the header offset table. +- uint32_t headerOffset; +- file.read(reinterpret_cast(&headerOffset), sizeof(headerOffset)); +- +- /** +- * Starting 42th byte of the ELF Header a 16-bit value indicates +- * the size of each entry in the ELF Program header. In the `elf32_hdr` +- * struct this value maps to the member `Elf32_Half e_phentsize`. All +- * header entries have the same size. +- * Starting from the 44th byte a 16-bit value represents the number +- * of header entries in the ELF Program header. In the `elf32_hdr` +- * struct this value maps to `Elf32_Half e_phnum`. +- */ +- +- // Seek to the byte representing header entry size. +- file.seekg(0x2a); +- uint16_t headerEntrySize; +- file.read(reinterpret_cast(&headerEntrySize), sizeof(headerEntrySize)); +- uint16_t headerEntries; +- file.read(reinterpret_cast(&headerEntries), sizeof(headerEntries)); +- +- // Resize the header to equal the number of header entries. +- headers32_.resize(headerEntries); ++ entryPoint32_ = eheader.e_entry; ++ + processImageSize_ = 0; + +- // Loop over all headers and extract them. +- for (size_t i = 0; i < headerEntries; i++) { +- // Since all headers entries have the same size. +- // We can extract the nth header using the header offset +- // and header entry size. +- file.seekg(headerOffset + (i * headerEntrySize)); +- auto& header = headers32_[i]; ++ // Loop over pheaders and extract them. ++ file.seekg(eheader.e_phoff); ++ std::vector pheaders(eheader.e_phnum); ++ for (auto &ph : pheaders) { ++ file.read(reinterpret_cast(&ph), sizeof(ph)); ++ if ((ph.p_type == PT_LOAD) && (ph.p_vaddr+ph.p_memsz > processImageSize_)) ++ processImageSize_ = ph.p_vaddr+ph.p_memsz; ++ } + +- /** +- * Like the ELF Header, the ELF Program header is also defined +- * using a struct: +- * typedef struct { +- * uint32_t p_type; +- * Elf32_Off p_offset; +- * Elf32_Addr p_vaddr; +- * Elf32_Addr p_paddr; +- * uint32_t p_filesz; +- * uint32_t p_memsz; +- * uint32_t p_flags; +- * uint32_t p_align; +- * } Elf32_Phdr; +- * +- * The ELF Program header table is an array of structures, +- * each describing a segment or other information the system +- * needs to prepare the program for execution. A segment +- * contains one or more sections (ELF Program Section). +- * +- * The `p_vaddr` field holds the virtual address at which the first +- * byte of the segment resides in memory and the `p_memsz` field +- * holds the number of bytes in the memory image of the segment. +- * It may be zero. The `p_offset` member holds the offset from the +- * beginning of the file at which the first byte of the segment resides. 
+- */ ++ *imagePointer = (char*)malloc(processImageSize_ * sizeof(char)); + +- // Each address-related field is 4 bytes in a 32-bit ELF file +- const int fieldBytes = 4; +- file.read(reinterpret_cast(&(header.type)), sizeof(header.type)); +- file.read(reinterpret_cast(&(header.offset)), fieldBytes); +- file.read(reinterpret_cast(&(header.virtualAddress)), fieldBytes); +- file.read(reinterpret_cast(&(header.physicalAddress)), fieldBytes); +- file.read(reinterpret_cast(&(header.fileSize)), fieldBytes); +- file.read(reinterpret_cast(&(header.memorySize)), fieldBytes); ++ for (const auto& ph : pheaders) { ++ if (ph.p_type == PT_LOAD) { ++ file.seekg(ph.p_offset); ++ // Read `fileSize` bytes from `file` into the appropriate place in process memory ++ file.read(*imagePointer+ph.p_vaddr, ph.p_filesz); + +- // To construct the process we look for the largest virtual address and +- // add it to the memory size of the header. This way we obtain a very +- // large array which can hold data at large virtual address. +- // However, this way we end up creating a sparse array, in which most +- // of the entries are unused. Also SimEng internally treats these +- // virtual address as physical addresses to index into this large array. +- if (header.virtualAddress + header.memorySize > processImageSize_) { +- processImageSize_ = header.virtualAddress + header.memorySize; ++ if (ph.p_memsz>ph.p_filesz) ++ // Need to padd the rest of the section memory with zeros ++ memset(*imagePointer+ph.p_vaddr+ph.p_filesz, 0, ph.p_memsz-ph.p_filesz); + } + } + +- *imagePointer = (char*)malloc(processImageSize_ * sizeof(char)); +- /** +- * The ELF Program header has a member called `p_type`, which represents +- * the kind of data or memory segments described by the program header. +- * The value PT_LOAD=1 represents a loadable segment. In other words, +- * it contains initialized data that contributes to the program's +- * memory image. +- */ +- +- // Process headers; only observe LOAD sections for this basic implementation +- for (const auto& header : headers32_) { +- if (header.type == 1) { // LOAD +- file.seekg(header.offset); +- // Read `fileSize` bytes from `file` into the appropriate place in process +- // memory +- file.read(*imagePointer + header.virtualAddress, header.fileSize); ++ // read section headers ++ Elf32_Shdr* sh_strtab = NULL; ++ Elf32_Shdr* sh_symtab = NULL; ++ file.seekg(eheader.e_shoff); ++ std::vector sheaders(eheader.e_shnum); ++ unsigned int sh_idx = 0; ++ for (auto &sh : sheaders) { ++ file.read(reinterpret_cast(&sh), sizeof(sh)); ++ ++ // find section header for strings to use for symbol table. 
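++    // (Editorial clarification, based on the ELF format rather than this
++    // patch's own text: SHT_SYMTAB marks the symbol table, and the matching
++    // name strings live in a SHT_STRTAB section other than the section-header
++    // string table at index e_shstrndx. Each symbol's st_name is an offset
++    // into that string table, so the loop below records
++    // symbols[name] = st_value; this is how the "tohost" symbol used for HTIF
++    // detection becomes visible to the Architecture.)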
++ if (sh.sh_type==SHT_SYMTAB) ++ sh_symtab = &sh; ++ else if (sh.sh_type==SHT_STRTAB && sh_idx!=eheader.e_shstrndx) ++ sh_strtab = &sh; ++ sh_idx++; ++ }; ++ ++ // Read strings table ++ file.seekg(sh_strtab->sh_offset); ++ std::vector strtab(sh_strtab->sh_size); ++ file.read(&strtab[0], sh_strtab->sh_size); ++ ++ // Read symbols tables ++ file.seekg(sh_symtab->sh_offset); ++ unsigned num_symbols = sh_symtab->sh_size/sh_symtab->sh_entsize; ++ Elf32_Sym sym; ++ while(num_symbols--) { ++ file.read(reinterpret_cast(&sym), sizeof(sym)); ++ if (strtab[sym.st_name]) { ++ std::string name(&strtab[sym.st_name]); ++ symbols[name] = sym.st_value; + } + } + } +diff --git a/src/lib/GenericPredictor.cc b/src/lib/GenericPredictor.cc +index 2539d7ae..4b93d832 100644 +--- a/src/lib/GenericPredictor.cc ++++ b/src/lib/GenericPredictor.cc +@@ -110,4 +110,11 @@ void GenericPredictor::flush(uint64_t address) { + } + } + ++ ++BranchPrediction GenericPredictor::predict(uint64_t address, BranchType type, ++ uint64_t knownTarget, ++ uint8_t byteLength) { ++ return predict(address, type, knownTarget); ++} ++ + } // namespace simeng +diff --git a/src/lib/Instruction.cc b/src/lib/Instruction.cc +index ac923c11..d1b7b112 100644 +--- a/src/lib/Instruction.cc ++++ b/src/lib/Instruction.cc +@@ -57,5 +57,8 @@ bool Instruction::isLastMicroOp() const { return isLastMicroOp_; } + void Instruction::setWaitingCommit() { waitingCommit_ = true; } + bool Instruction::isWaitingCommit() const { return waitingCommit_; } + int Instruction::getMicroOpIndex() const { return microOpIndex_; } ++bool Instruction::isDiv() const { return isDiv_; } ++bool Instruction::isMul() const { return isMul_; } ++bool Instruction::isSysCall() const { return isSysCall_; } + + } // namespace simeng +diff --git a/src/lib/ModelConfig.cc b/src/lib/ModelConfig.cc +index 88cc1f7d..34247634 100644 +--- a/src/lib/ModelConfig.cc ++++ b/src/lib/ModelConfig.cc +@@ -69,7 +69,7 @@ void ModelConfig::validate() { + configFile_[root][subFields[0]], subFields[0], + std::vector({"AArch64", "rv64", "rv32"}), ExpectedValue::String); + nodeChecker(configFile_[root][subFields[1]], subFields[1], +- {"emulation", "inorderpipelined", "outoforder"}, ++ {"emulation", "inorderpipelined", "mcu", "outoforder"}, + ExpectedValue::String); + nodeChecker(configFile_[root][subFields[2]], subFields[2], + std::make_pair(0.f, 10.f), ExpectedValue::Float); +diff --git a/src/lib/arch/aarch64/Architecture.cc b/src/lib/arch/aarch64/Architecture.cc +index 23ebf86a..5ad11c70 100644 +--- a/src/lib/arch/aarch64/Architecture.cc ++++ b/src/lib/arch/aarch64/Architecture.cc +@@ -281,11 +281,13 @@ ProcessStateChange Architecture::getInitialState() const { + + uint8_t Architecture::getMaxInstructionSize() const { return 4; } + ++uint8_t Architecture::getMinInstructionSize() const { return 4; } ++ + uint64_t Architecture::getVectorLength() const { return VL_; } + + uint64_t Architecture::getStreamingVectorLength() const { return SVL_; } + +-void Architecture::updateSystemTimerRegisters(RegisterFileSet* regFile, ++int16_t Architecture::updateSystemTimerRegisters(RegisterFileSet* regFile, + const uint64_t iterations) const { + // Update the Processor Cycle Counter to total cycles completed. 
+ regFile->set(PCCreg_, iterations); +@@ -293,6 +295,8 @@ void Architecture::updateSystemTimerRegisters(RegisterFileSet* regFile, + if (iterations % (uint64_t)vctModulo_ == 0) { + regFile->set(VCTreg_, regFile->get(VCTreg_).get() + 1); + } ++ // interrupts NYI ++ return -1; + } + + std::vector +diff --git a/src/lib/arch/aarch64/Instruction.cc b/src/lib/arch/aarch64/Instruction.cc +index 909f5263..602bdc7f 100644 +--- a/src/lib/arch/aarch64/Instruction.cc ++++ b/src/lib/arch/aarch64/Instruction.cc +@@ -43,7 +43,7 @@ Instruction::Instruction(const Instruction& insn) + branchAddress_ = insn.branchAddress_; + branchTaken_ = insn.branchTaken_; + branchType_ = insn.branchType_; +- knownTarget_ = insn.knownTarget_; ++ knownOffset_ = insn.knownOffset_; + sequenceId_ = insn.sequenceId_; + flushed_ = insn.flushed_; + latency_ = insn.latency_; +@@ -182,7 +182,7 @@ std::tuple Instruction::checkEarlyBranchMisprediction() const { + + BranchType Instruction::getBranchType() const { return branchType_; } + +-uint64_t Instruction::getKnownTarget() const { return knownTarget_; } ++uint64_t Instruction::getKnownOffset() const { return knownOffset_; } + + uint16_t Instruction::getGroup() const { + // Use identifiers to decide instruction group +diff --git a/src/lib/arch/aarch64/Instruction_decode.cc b/src/lib/arch/aarch64/Instruction_decode.cc +index e3f0529e..ca869629 100644 +--- a/src/lib/arch/aarch64/Instruction_decode.cc ++++ b/src/lib/arch/aarch64/Instruction_decode.cc +@@ -364,7 +364,7 @@ void Instruction::decode() { + switch (metadata.opcode) { + case Opcode::AArch64_B: // b label + branchType_ = BranchType::Unconditional; +- knownTarget_ = metadata.operands[0].imm; ++ knownOffset_ = metadata.operands[0].imm; + break; + case Opcode::AArch64_BR: { // br xn + branchType_ = BranchType::Unconditional; +@@ -372,7 +372,7 @@ void Instruction::decode() { + } + case Opcode::AArch64_BL: // bl #imm + branchType_ = BranchType::SubroutineCall; +- knownTarget_ = metadata.operands[0].imm; ++ knownOffset_ = metadata.operands[0].imm; + break; + case Opcode::AArch64_BLR: { // blr xn + branchType_ = BranchType::SubroutineCall; +@@ -383,7 +383,7 @@ void Instruction::decode() { + branchType_ = BranchType::LoopClosing; + else + branchType_ = BranchType::Conditional; +- knownTarget_ = metadata.operands[0].imm; ++ knownOffset_ = metadata.operands[0].imm; + break; + } + case Opcode::AArch64_CBNZW: // cbnz wn, #imm +@@ -397,7 +397,7 @@ void Instruction::decode() { + branchType_ = BranchType::LoopClosing; + else + branchType_ = BranchType::Conditional; +- knownTarget_ = metadata.operands[1].imm; ++ knownOffset_ = metadata.operands[1].imm; + break; + } + case Opcode::AArch64_TBNZW: // tbnz wn, #imm, label +@@ -411,7 +411,7 @@ void Instruction::decode() { + branchType_ = BranchType::LoopClosing; + else + branchType_ = BranchType::Conditional; +- knownTarget_ = metadata.operands[2].imm; ++ knownOffset_ = metadata.operands[2].imm; + break; + } + case Opcode::AArch64_RET: { // ret {xr} +diff --git a/src/lib/arch/riscv/Architecture.cc b/src/lib/arch/riscv/Architecture.cc +index d1a18777..84afcc09 100644 +--- a/src/lib/arch/riscv/Architecture.cc ++++ b/src/lib/arch/riscv/Architecture.cc +@@ -4,6 +4,7 @@ + #include + #include + #include ++#include + + #include "InstructionMetadata.hh" + +@@ -14,8 +15,10 @@ namespace riscv { + std::unordered_map Architecture::decodeCache; + std::forward_list Architecture::metadataCache; + +-Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) +- : linux_(kernel) { 
++Architecture::Architecture(kernel::Linux& kernel, YAML::Node config, std::shared_ptr& dataMemory) ++: ++ linux_(kernel) ++{ + is32Bit_ = ARCH_64BIT; + if (config["Core"]["ISA"].as() == "rv32") { + is32Bit_ = ARCH_32BIT; +@@ -46,14 +49,39 @@ Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) + + // Generate zero-indexed system register map + systemRegisterMap_[SYSREG_MSTATUS] = systemRegisterMap_.size(); ++ systemRegisterMap_[SYSREG_MIE] = systemRegisterMap_.size(); ++ systemRegisterMap_[SYSREG_MTVEC] = systemRegisterMap_.size(); + systemRegisterMap_[SYSREG_MSTATUSH] = systemRegisterMap_.size(); ++ systemRegisterMap_[SYSREG_MSCRATCH] = systemRegisterMap_.size(); + systemRegisterMap_[SYSREG_MEPC] = systemRegisterMap_.size(); + systemRegisterMap_[SYSREG_MCAUSE] = systemRegisterMap_.size(); + systemRegisterMap_[SYSREG_MHARTID] = systemRegisterMap_.size(); ++ systemRegisterMap_[SYSREG_MXCPTSC] = systemRegisterMap_.size(); + systemRegisterMap_[SYSREG_CYCLE] = systemRegisterMap_.size(); + systemRegisterMap_[SYSREG_TIME] = systemRegisterMap_.size(); + systemRegisterMap_[SYSREG_INSTRRET] = systemRegisterMap_.size(); + ++ // Memory Mapped System Register Blocks ++ ++ // if elf file includes the label tohost then assume that this binary supports HTIF protocol (used by spike) and include an HTI block ++ uint64_t htifAddress; ++ if (linux_.lookupSymbolValue("tohost",htifAddress)) ++ { ++ std::cout << "[SimEng] HTIF detected at: " << std::hex << htifAddress << std::endl; ++ htif = std::make_shared(*this); ++ memoryMappedSystemRegisterBlocks[htifAddress] = htif.get(); ++ } ++ ++ // Install CLINT into memort map, this is optional ++ clint = std::make_shared(*this); ++ memoryMappedSystemRegisterBlocks[Clint::CLINT_BASE] = clint.get(); ++ ++ if (!memoryMappedSystemRegisterBlocks.empty()) ++ { ++ systemRegisterMemoryInterface = std::make_shared(dataMemory, memoryMappedSystemRegisterBlocks); ++ dataMemory = systemRegisterMemoryInterface; ++ } ++ + // Instantiate an executionInfo entry for each group in the InstructionGroup + // namespace. + for (int i = 0; i < NUM_GROUPS; i++) { +@@ -145,7 +173,7 @@ Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) + } + } + } +- if (config["Core"]["Trace"].as()) { ++ if (config["Core"]["Trace"].IsDefined() && config["Core"]["Trace"].as()) { + traceFile_ = new std::ofstream(); + traceFile_->open("./trace.log"); + traceOn_ = true; +@@ -164,6 +192,7 @@ Architecture::~Architecture() { + uint8_t Architecture::predecode(const void* ptr, uint8_t bytesAvailable, + uint64_t instructionAddress, + MacroOp& output) const { ++ + // Check that instruction address is 4-byte aligned as required by RISC-V + // 2-byte when Compressed ISA is supported + if (instructionAddress & constants_.alignMask) { +@@ -221,9 +250,11 @@ uint8_t Architecture::predecode(const void* ptr, uint8_t bytesAvailable, + output.resize(1); + auto& uop = output[0]; + +- // Retrieve the cached instruction and write to output +- uop = std::make_shared(iter->second); ++ // Retrieve the cached instruction ++ auto newinsn = std::make_shared(iter->second); + ++ // write to output ++ uop = newinsn; + uop->setInstructionAddress(instructionAddress); + + return iter->second.getMetadata().lenBytes; +@@ -265,8 +296,19 @@ int32_t Architecture::getSystemRegisterTag(uint16_t reg) const { + // Check below is done for speculative instructions that may be passed into + // the function but will not be executed. 
If such invalid speculative + // instructions get through they can cause an out-of-range error. +- if (!systemRegisterMap_.count(reg)) return 0; +- return systemRegisterMap_.at(reg); ++ if (systemRegisterMap_.count(reg)) ++ return systemRegisterMap_.at(reg); ++ else ++ return -1; ++} ++ ++/** Returns a System Register index from a system register tag. ++ reverse lookup slow but only used in printing so will be fine */ ++uint16_t Architecture::getSystemRegisterIdFromTag(int32_t tag) const { ++ for (auto it = systemRegisterMap_.begin();it != systemRegisterMap_.end();it++) ++ if (it->second == tag) ++ return it->first; ++ assert(0 && "Tag not found in systemRegisterMap"); + } + + ProcessStateChange Architecture::getInitialState() const { +@@ -289,6 +331,8 @@ ProcessStateChange Architecture::getInitialState() const { + + uint8_t Architecture::getMaxInstructionSize() const { return 4; } + ++uint8_t Architecture::getMinInstructionSize() const { return 2; } ++ + std::vector + Architecture::getConfigPhysicalRegisterStructure(YAML::Node config) const { + return {{constants_.regWidth, config["Register-Set"]["GeneralPurpose-Count"].as()}, +@@ -306,9 +350,21 @@ uint16_t Architecture::getNumSystemRegisters() const { + return static_cast(systemRegisterMap_.size()); + } + +-// Left blank as no implementation necessary +-void Architecture::updateSystemTimerRegisters(RegisterFileSet* regFile, ++int16_t Architecture::updateSystemTimerRegisters(RegisterFileSet* regFile, + const uint64_t iterations) const { ++ int16_t interruptId = -1; ++ ++ if (htif) ++ { ++ interruptId = htif->updateSystemTimerRegisters(regFile, iterations); ++ if (interruptId>=0) ++ return interruptId; ++ } ++ ++ if (clint) ++ interruptId = clint->updateSystemTimerRegisters(regFile, iterations); ++ ++ return interruptId; + } + + void Architecture::updateInstrTrace(const std::shared_ptr& instruction, +@@ -346,7 +402,7 @@ void Architecture::updateInstrTrace(const std::shared_ptr& + } else if(reg.type == RegisterType::FLOAT) { + s << "f" << std::dec << std::setfill('0') << std::setw(2) << reg.tag << "=0x"; + } else if(reg.type == RegisterType::SYSTEM) { +- s << "csr_0x" << std::hex << std::setfill('0') << std::setw(3) << metadata.csr << "=0x"; ++ s << "csr_0x" << std::hex << std::setfill('0') << std::setw(3) << getSystemRegisterIdFromTag(reg.tag) << "=0x"; + } + s << std::hex << std::setfill('0') << std::setw(8) << regFile->get(reg).get(); + if(i < (num_dest-1)) { +@@ -364,7 +420,7 @@ void Architecture::updateInstrTrace(const std::shared_ptr& + } else if(reg.type == RegisterType::FLOAT) { + s << "f" << std::dec << std::setfill('0') << std::setw(2) << reg.tag << "=0x"; + } else if(reg.type == RegisterType::SYSTEM) { +- s << "csr_0x" << std::hex << std::setfill('0') << std::setw(3) << metadata.csr << "=0x"; ++ s << "csr_0x" << std::hex << std::setfill('0') << std::setw(3) << getSystemRegisterIdFromTag(reg.tag) << "=0x"; + } + s << std::hex << std::setfill('0') << std::setw(8) << regFile->get(reg).get(); + if(i < (num_src-1)) { +diff --git a/src/lib/arch/riscv/ExceptionHandler.cc b/src/lib/arch/riscv/ExceptionHandler.cc +index c8844804..9ba22008 100644 +--- a/src/lib/arch/riscv/ExceptionHandler.cc ++++ b/src/lib/arch/riscv/ExceptionHandler.cc +@@ -1,5 +1,5 @@ ++#include "simeng/arch/riscv/Architecture.hh" + #include "simeng/arch/riscv/ExceptionHandler.hh" +- + #include + #include + +@@ -646,6 +646,18 @@ bool ExceptionHandler::init() { + } + + return concludeSyscall(stateChange); ++ ++ } else if (exception == 
InstructionException::SecureMonitorCall) { ++ printException(instruction_); ++ takeException(CAUSE_BREAKPOINT); ++ return true; ++ } else if (exception == InstructionException::Interrupt) { ++ printException(instruction_); ++ if (instruction_.getInterruptId() == static_cast(InterruptId::HALT)) ++ return fatal(); ++ uint64_t mcause_val = static_cast(instruction_.getInterruptId()) | (1<<(8*instruction_.getArchRegWidth()-1)); ++ takeException(mcause_val); ++ return true; + } + + printException(instruction_); +@@ -745,6 +757,45 @@ void ExceptionHandler::readLinkAt(span path) { + concludeSyscall(stateChange); + } + ++void ExceptionHandler::takeException(uint64_t causecode) ++{ ++ const auto& registerFileSet = core.getArchitecturalRegisterFileSet(); ++ auto& architecture = instruction_.getArchitecture(); ++ uint16_t mtvec_tag = static_cast(architecture.getSystemRegisterTag(SYSREG_MTVEC)); ++ uint16_t mstatus_tag = static_cast(architecture.getSystemRegisterTag(SYSREG_MSTATUS)); ++ uint16_t mepc_tag = static_cast(architecture.getSystemRegisterTag(SYSREG_MEPC)); ++ uint16_t mcause_tag = static_cast(architecture.getSystemRegisterTag(SYSREG_MCAUSE)); ++ uint64_t mcause_val = static_cast(causecode); ++ ++ auto mstatus_bits = registerFileSet.get( { RegisterType::SYSTEM, mstatus_tag } ).get(); ++ ++ // mpie=mie, mie=0 ++ mstatus_bits &= ~MSTATUS_MPIE_MASK; ++ if (mstatus_bits & MSTATUS_MIE_MASK) ++ mstatus_bits |= MSTATUS_MPIE_MASK; ++ mstatus_bits &= ~MSTATUS_MIE_MASK; ++ ++ RegisterValue mstatus (mstatus_bits, architecture.getConstants().regWidth); ++ RegisterValue mepc (instruction_.getInstructionAddress(), architecture.getConstants().regWidth); ++ RegisterValue mcause (mcause_val, architecture.getConstants().regWidth); ++ ++ uint64_t mtvec = registerFileSet.get( { RegisterType::SYSTEM, mtvec_tag } ).get(); ++ ++ ProcessStateChange changes = { ++ ChangeType::REPLACEMENT, ++ { ++ { RegisterType::SYSTEM, mstatus_tag }, ++ { RegisterType::SYSTEM, mepc_tag }, ++ { RegisterType::SYSTEM, mcause_tag } ++ }, ++ {mstatus, mepc, mcause} ++ }; ++ ++ result_ = {false, mtvec, changes}; ++ //result_ = {false, instruction_.getInstructionAddress(), changes}; ++} ++ ++ + bool ExceptionHandler::readBufferThen(uint64_t ptr, uint64_t length, + std::function then, + bool firstCall) { +@@ -827,6 +878,9 @@ void ExceptionHandler::printException(const Instruction& insn) const { + case InstructionException::NoAvailablePort: + std::cout << "unsupported execution port"; + break; ++ case InstructionException::Interrupt: ++ std::cout << "interrupt (id: " << insn.getInterruptId() << ")"; ++ break; + case InstructionException::UnmappedSysReg: + std::cout << "unmapped system register"; + break; +diff --git a/src/lib/arch/riscv/Instruction.cc b/src/lib/arch/riscv/Instruction.cc +index 6cfc173b..e292b889 100644 +--- a/src/lib/arch/riscv/Instruction.cc ++++ b/src/lib/arch/riscv/Instruction.cc +@@ -131,7 +131,7 @@ std::tuple Instruction::checkEarlyBranchMisprediction() const { + + BranchType Instruction::getBranchType() const { return branchType_; } + +-uint64_t Instruction::getKnownTarget() const { return knownTarget_; } ++uint64_t Instruction::getKnownOffset() const { return knownOffset_; } + + uint16_t Instruction::getGroup() const { + uint16_t base = InstructionGroups::INT; +@@ -171,6 +171,10 @@ void Instruction::setArchRegWidth(uint8_t len) { archRegWidth_ = len; } + + uint8_t Instruction::getArchRegWidth() const { return archRegWidth_; } + ++const Architecture& Instruction::getArchitecture() const { ++ return architecture_; ++} ++ 
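For reference, the machine-mode trap entry performed by takeException() in ExceptionHandler.cc above reduces to three CSR updates plus a redirect to the trap vector. The sketch below is a standalone illustration, not SimEng code: TrapState and enterMachineTrap() are hypothetical names, and the mstatus bit positions (MIE = bit 3, MPIE = bit 7) follow the RISC-V privileged specification. For interrupts, the caller is assumed to have already folded the interrupt flag into the cause value, as the handler above does with `1 << (8*archRegWidth - 1)`.

```cpp
#include <cstdint>

// Assumed mstatus bit positions (RISC-V privileged spec).
constexpr uint64_t MSTATUS_MIE_MASK  = 1u << 3;
constexpr uint64_t MSTATUS_MPIE_MASK = 1u << 7;

// Hypothetical container for the CSRs touched on trap entry.
struct TrapState {
  uint64_t mstatus, mepc, mcause, mtvec;
};

// Returns the address execution resumes from (the trap vector, direct mode).
uint64_t enterMachineTrap(TrapState& csr, uint64_t faultingPc, uint64_t cause) {
  // mpie <= mie, then mie <= 0 (interrupts stay disabled inside the handler)
  csr.mstatus &= ~MSTATUS_MPIE_MASK;
  if (csr.mstatus & MSTATUS_MIE_MASK) csr.mstatus |= MSTATUS_MPIE_MASK;
  csr.mstatus &= ~MSTATUS_MIE_MASK;

  csr.mepc   = faultingPc;  // MRET will return here and restore mie from mpie
  csr.mcause = cause;       // interrupt bit, if any, is already folded in
  return csr.mtvec;
}
```

The MRET execute case added in Instruction_execute.cc performs the inverse step: it copies MPIE back into MIE and redirects to the mepc value it reads as a source operand.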
+ } // namespace riscv + } // namespace arch + } // namespace simeng +diff --git a/src/lib/arch/riscv/InstructionMetadata.cc b/src/lib/arch/riscv/InstructionMetadata.cc +index f2b5a9b7..d293bc7f 100644 +--- a/src/lib/arch/riscv/InstructionMetadata.cc ++++ b/src/lib/arch/riscv/InstructionMetadata.cc +@@ -264,7 +264,10 @@ void InstructionMetadata::alterPseudoInstructions(const cs_insn& insn) { + csr = ((uint32_t)encoding[3] << 4) | ((uint32_t)encoding[2] >> 4); + //If there are less than 2 operands provided add necessary x0 operand + if(operandCount == 1) { +- if(strcmp(mnemonic, "csrr") == 0) { //csrrs rd,csr,x0 ++ if((strcmp(mnemonic, "rdinstret") == 0) || ++ (strcmp(mnemonic, "rdcycle") == 0) || ++ (strcmp(mnemonic, "rdtime") == 0) || ++ (strcmp(mnemonic, "csrr") == 0)) { //csrrs rd,csr,x0 + operands[1].type = RISCV_OP_REG; + operands[1].reg = 1; + } else { //csrrxx x0,csr,rs/imm +diff --git a/src/lib/arch/riscv/Instruction_decode.cc b/src/lib/arch/riscv/Instruction_decode.cc +index 8bdd5041..9efa7f5c 100644 +--- a/src/lib/arch/riscv/Instruction_decode.cc ++++ b/src/lib/arch/riscv/Instruction_decode.cc +@@ -143,6 +143,24 @@ void Instruction::decode() { + case Opcode::RISCV_SD: + isStore_ = true; + break; ++ //identify MULs/DIVs ++ case Opcode::RISCV_MUL: ++ case Opcode::RISCV_MULH: ++ case Opcode::RISCV_MULHU: ++ case Opcode::RISCV_MULHSU: ++ case Opcode::RISCV_MULW: ++ isMultiply_ = true; ++ isMul_ = true; //this one is for simeng/Instruction.hh ++ break; ++ case Opcode::RISCV_DIV: ++ case Opcode::RISCV_DIVU: ++ case Opcode::RISCV_DIVUW: ++ case Opcode::RISCV_DIVW: ++ isDivide_ = true; ++ isDiv_ = true; //this one is for simeng/Instruction.hh ++ break; ++ case Opcode::RISCV_ECALL: ++ isSysCall_ = true; + } + + if (Opcode::RISCV_AMOADD_D <= metadata.opcode && +@@ -257,6 +275,16 @@ void Instruction::decode() { + isCompare_ = true; + } + ++ if (Opcode::RISCV_MRET == metadata.opcode) { ++ uint16_t mepc_tag = static_cast(architecture_.getSystemRegisterTag(SYSREG_MEPC)); ++ uint16_t mstatus_tag = static_cast(architecture_.getSystemRegisterTag(SYSREG_MSTATUS)); ++ sourceRegisters[sourceRegisterCount++] = { RegisterType::SYSTEM, mepc_tag }; ++ sourceRegisters[sourceRegisterCount++] = { RegisterType::SYSTEM, mstatus_tag }; ++ destinationRegisters[destinationRegisterCount++] = { RegisterType::SYSTEM, mstatus_tag }; ++ operandsPending += 2; ++ isBranch_ = true; ++ } ++ + // Set branch type + switch (metadata.opcode) { + case Opcode::RISCV_BEQ: +@@ -266,12 +294,24 @@ void Instruction::decode() { + case Opcode::RISCV_BGE: + case Opcode::RISCV_BGEU: + branchType_ = BranchType::Conditional; +- knownTarget_ = instructionAddress_ + metadata.operands[2].imm; ++ knownOffset_ = metadata.operands[2].imm; + break; + case Opcode::RISCV_JAL: ++ branchType_ = BranchType::SubroutineCall; ++ knownOffset_ = metadata.operands[1].imm; ++ break; + case Opcode::RISCV_JALR: +- branchType_ = BranchType::Unconditional; +- knownTarget_ = instructionAddress_ + metadata.operands[1].imm; ++ { ++ //jalr x0, 0(x1) == ret ++ if (metadata.operands[0].reg == RISCV_REG_X0 && metadata.operands[1].reg == RISCV_REG_X1 && metadata.operands[2].imm == 0) { ++ branchType_ = BranchType::Return; ++ } else { ++ branchType_ = BranchType::SubroutineCall; ++ } ++ break; ++ } ++ case Opcode::RISCV_MRET: ++ branchType_ = BranchType::Unknown; //TODO: think which type it fits / create new type + break; + } + } +@@ -292,10 +332,14 @@ bool Instruction::decode16() { + "Invalid operand for JR,JALR:- CR instructions"); + 
sourceRegisters[sourceRegisterCount++] = csRegToRegister(metadata.operands[0].reg); + operandsPending++; ++ branchType_ = BranchType::SubroutineCall; + if (metadata.opcode == Opcode::RISCV_C_JALR) { + destinationRegisters[destinationRegisterCount++] = Instruction::RA_REGISTER; ++ } else { //case C_JR ++ if (metadata.operands[0].reg == RISCV_REG_X1 ) { ++ branchType_ = BranchType::Return; ++ } + } +- branchType_ = BranchType::Unconditional; + break; + case Opcode::RISCV_C_MV: + instFormat_ = CIF_CR; +@@ -309,7 +353,7 @@ bool Instruction::decode16() { + sourceRegisters[sourceRegisterCount++] = csRegToRegister(metadata.operands[1].reg); + operandsPending++; + break; +- case Opcode::RISCV_C_EBREAK://TODO ++ case Opcode::RISCV_C_EBREAK: + instFormat_ = CIF_CR; + break; + case Opcode::RISCV_C_ADD: +@@ -410,7 +454,7 @@ bool Instruction::decode16() { + operandsPending++; + c_imm = metadata.operands[1].imm; + branchType_ = BranchType::Conditional; +- knownTarget_ = instructionAddress_ + metadata.operands[1].imm; ++ knownOffset_ = metadata.operands[1].imm; + break; + case Opcode::RISCV_C_FLD: + case Opcode::RISCV_C_FLW: +@@ -503,9 +547,11 @@ bool Instruction::decode16() { + c_imm = metadata.operands[0].imm; + if (metadata.opcode == Opcode::RISCV_C_JAL) { + destinationRegisters[destinationRegisterCount++] = Instruction::RA_REGISTER; ++ branchType_ = BranchType::SubroutineCall; ++ } else { // case C_J ++ branchType_ = BranchType::Unconditional; + } +- branchType_ = BranchType::Unconditional; +- knownTarget_ = instructionAddress_ + metadata.operands[0].imm; ++ knownOffset_ = metadata.operands[0].imm; + break; + case Opcode::RISCV_C_UNIMP: + break; +@@ -523,7 +569,7 @@ bool Instruction::decodeCsr() { + } + + isCsr_ = true; +- uint32_t sysRegTag = architecture_.getSystemRegisterTag(metadata.csr); ++ int32_t sysRegTag = architecture_.getSystemRegisterTag(metadata.csr); + if (sysRegTag == -1) { + exceptionEncountered_ = true; + exception_ = InstructionException::UnmappedSysReg; +@@ -539,16 +585,16 @@ bool Instruction::decodeCsr() { + destinationRegisters[destinationRegisterCount++] = { + RegisterType::SYSTEM, static_cast(sysRegTag)}; + +- // First operand from metadata is rd, second operand from metadata is rs1 +- if (csRegToRegister(metadata.operands[1].reg) != Instruction::ZERO_REGISTER) { ++ // First operand (0) from metadata is rd, second operand (1) from metadata is rs1 ++ if (csRegToRegister(metadata.operands[0].reg) != Instruction::ZERO_REGISTER) { + destinationRegisters[destinationRegisterCount++] = +- csRegToRegister(metadata.operands[1].reg); ++ csRegToRegister(metadata.operands[0].reg); + } + +- if(metadata.operands[0].type == RISCV_OP_IMM) { +- c_imm = metadata.operands[0].imm; +- } else if (metadata.operands[0].type == RISCV_OP_REG) { +- sourceRegisters[sourceRegisterCount] = csRegToRegister(metadata.operands[0].reg); ++ if(metadata.operands[1].type == RISCV_OP_IMM) { ++ c_imm = metadata.operands[1].imm; ++ } else if (metadata.operands[1].type == RISCV_OP_REG) { ++ sourceRegisters[sourceRegisterCount] = csRegToRegister(metadata.operands[1].reg); + if (sourceRegisters[sourceRegisterCount] == + Instruction::ZERO_REGISTER) { + // Catch zero register references and pre-complete those operands +diff --git a/src/lib/arch/riscv/Instruction_execute.cc b/src/lib/arch/riscv/Instruction_execute.cc +index b7a4a822..a37d3750 100644 +--- a/src/lib/arch/riscv/Instruction_execute.cc ++++ b/src/lib/arch/riscv/Instruction_execute.cc +@@ -4,6 +4,7 @@ + + #include "InstructionMetadata.hh" + #include 
"simeng/arch/riscv/Instruction.hh" ++#include "simeng/arch/riscv/SystemRegister.hh" + + namespace simeng { + namespace arch { +@@ -358,7 +359,9 @@ void Instruction::execute() { + } + case Opcode::RISCV_SLTIU: { // SLTIU rd,rs1,imm + const uint64_t rs1 = operands[0].get(); +- const uint64_t imm = static_cast(metadata.operands[2].imm); ++ uint64_t imm = metadata.operands[2].imm; ++ if (archRegWidth_==4) ++ imm = static_cast(imm); + if (rs1 < imm) { + results[0] = RegisterValue(static_cast(1), archRegWidth_); + } else { +@@ -460,13 +463,28 @@ void Instruction::execute() { + results[0] = RegisterValue(instructionAddress_ + 4, archRegWidth_); + break; + } +- // TODO EBREAK ++ case Opcode::RISCV_EBREAK: { // EBREAK + // used to return control to a debugging environment pg27 20191213 ++ exceptionEncountered_ = true; ++ exception_ = InstructionException::SecureMonitorCall; ++ break; ++ } + case Opcode::RISCV_ECALL: { // ECALL + exceptionEncountered_ = true; + exception_ = InstructionException::SupervisorCall; + break; + } ++ case Opcode::RISCV_MRET: { // MRET ++ branchAddress_ = (operands[0].get()) & ~1; // Set LSB of result to 0 ++ branchTaken_ = true; ++ ++ auto mstatus = operands[1].get(); ++ if (mstatus & MSTATUS_MPIE_MASK) ++ mstatus |= MSTATUS_MIE_MASK; ++ ++ results[0] = RegisterValue(mstatus, archRegWidth_); ++ break; ++ } + case Opcode::RISCV_FENCE: { // FENCE + // TODO currently modelled as a NOP as all codes are currently single + // threaded "Informally, no other RISC-V hart or external device can +@@ -709,28 +727,49 @@ void Instruction::execute() { + results[0] = RegisterValue(static_cast(rs1 * rs2), archRegWidth_); + break; + } +- // case Opcode::RISCV_MULH: {//MULH rd,rs1,rs2 +- // return executionNYI(); +- // +- // const int64_t rs1 = operands[0].get(); +- // const int64_t rs2 = operands[1].get(); +- // results[0] = RegisterValue(mulhiss(rs1, rs2); +- // break; +- // } ++ case Opcode::RISCV_MULH: {//MULH rd,rs1,rs2 ++ int64_t result; ++ if (archRegWidth_==4) ++ { ++ const int64_t rs1 = operands[0].get(); ++ const int64_t rs2 = operands[1].get(); ++ result = (rs1*rs2)>>32; ++ } else { ++ const int64_t rs1 = operands[0].get(); ++ const int64_t rs2 = operands[1].get(); ++ //result = mulhiss(rs1, rs2); ++ return executionNYI(); ++ } ++ results[0] = RegisterValue(result, archRegWidth_); ++ break; ++ } + case Opcode::RISCV_MULHU: { // MULHU rd,rs1,rs2 + const uint64_t rs1 = operands[0].get(); + const uint64_t rs2 = operands[1].get(); +- results[0] = RegisterValue(mulhiuu(rs1, rs2), archRegWidth_); ++ uint64_t result; ++ if (archRegWidth_==4) ++ result = (rs1*rs2)>>32; ++ else ++ result = mulhiuu(rs1, rs2); ++ results[0] = RegisterValue(result, archRegWidth_); ++ break; ++ } ++ case Opcode::RISCV_MULHSU: {//MULHSU rd,rs1,rs2 ++ int64_t result; ++ if (archRegWidth_==4) ++ { ++ const int64_t rs1 = operands[0].get(); ++ const uint64_t rs2 = operands[1].get(); ++ result = (rs1*rs2)>>32; ++ } else { ++ const int64_t rs1 = operands[0].get(); ++ const uint64_t rs2 = operands[1].get(); ++ //result = mulhisu(rs1, rs2); ++ return executionNYI(); ++ } ++ results[0] = RegisterValue(result, archRegWidth_); + break; + } +- // case Opcode::RISCV_MULHSU: {//MULHSU rd,rs1,rs2 +- // return executionNYI(); +- // +- // const int64_t rs1 = operands[0].get(); +- // const uint64_t rs2 = operands[1].get(); +- // results[0] = RegisterValue(mulhisu(rs1, rs2); +- // break; +- // } + case Opcode::RISCV_MULW: { // MULW rd,rs1,rs2 + const uint32_t rs1 = operands[0].get(); + const uint32_t rs2 = operands[1].get(); +@@ 
-852,12 +891,14 @@ void Instruction::execute() { + uint32_t new_csr_value = old_csr_value & ~(operands[1].get()); + results[0] = RegisterValue(new_csr_value, 4); + results[1] = RegisterValue(old_csr_value, 4); ++ break; + } + case Opcode::RISCV_CSRRCI: { + uint32_t old_csr_value = operands[0].get(); + uint32_t new_csr_value = old_csr_value & ~(c_imm); + results[0] = RegisterValue(new_csr_value, 4); + results[1] = RegisterValue(old_csr_value, 4); ++ break; + } + case Opcode::RISCV_CSRRS: { + uint32_t old_csr_value = operands[0].get(); +@@ -938,8 +979,12 @@ void Instruction::execute() { + } + break; + } +- case Opcode::RISCV_C_EBREAK: ++ case Opcode::RISCV_C_EBREAK: { ++ // used to return control to a debugging environment pg27 20191213 ++ exceptionEncountered_ = true; ++ exception_ = InstructionException::SecureMonitorCall; + break; ++ } + case Opcode::RISCV_C_FLD: + break; + case Opcode::RISCV_C_FLDSP: +diff --git a/src/lib/arch/riscv/SystemRegister.cc b/src/lib/arch/riscv/SystemRegister.cc +new file mode 100644 +index 00000000..05de188d +--- /dev/null ++++ b/src/lib/arch/riscv/SystemRegister.cc +@@ -0,0 +1,124 @@ ++#include "simeng/arch/riscv/Architecture.hh" ++ ++namespace simeng { ++namespace arch { ++namespace riscv { ++ ++bool MemoryMappedSystemRegisterBlock::put(uint16_t offset, const RegisterValue& value) ++{ ++ auto it = memoryMappedSystemRegisters.upper_bound(offset); ++ if (it != memoryMappedSystemRegisters.begin() ) ++ { ++ it--; ++ if (offset-it->first < it->second->size()) { ++ it->second->put(value); ++ return true; ++ } ++ return false; ++ } ++ return false; ++} ++ ++bool MemoryMappedSystemRegisterBlock::get(uint16_t offset, RegisterValue& value) ++{ ++ auto it = memoryMappedSystemRegisters.upper_bound(offset); ++ if (it != memoryMappedSystemRegisters.begin() ) ++ { ++ it--; ++ if (offset-it->first < it->second->size()) { ++ value = it->second->get(); ++ return true; ++ } ++ return false; ++ } ++ return false; ++} ++ ++/** Put/Get Memory Mapped Registers */ ++bool SystemRegisterMemoryInterface::putMemoryMappedSystemRegister(uint64_t address, const RegisterValue& value) ++{ ++ auto it = memoryMappedSystemRegisterBlocks_.upper_bound(address); ++ if (it != memoryMappedSystemRegisterBlocks_.begin() ) ++ { ++ it--; ++ if (address-it->first < it->second->size()) { ++ it->second->put(static_cast(address-it->first),value); ++ return true; ++ } ++ return false; ++ } ++ return false; ++} ++ ++bool SystemRegisterMemoryInterface::getMemoryMappedSystemRegister(uint64_t address, RegisterValue& value) ++{ ++ auto it = memoryMappedSystemRegisterBlocks_.upper_bound(address); ++ if (it != memoryMappedSystemRegisterBlocks_.begin() ) ++ { ++ it--; ++ if (address-it->first < it->second->size()) { ++ it->second->get(static_cast(address-it->first),value); ++ return true; ++ } ++ return false; ++ } ++ return false; ++} ++ ++bool HostTargetInterface::put(uint16_t offset, const RegisterValue&value) ++{ ++ switch(offset) { ++ case PAYLOAD_OFFSET : ++ { ++ char ch = value.getAsVector()[0]; ++ if (ch==3 || ch==1) ++ isHalted_ = true; ++ else ++ putchar(ch); ++ return true; ++ } ++ default : ++ return MemoryMappedSystemRegisterBlock::put(offset, value); ++ } ++} ++ ++int16_t Clint::updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations) ++{ ++ uint64_t ticks = iterations-last_tick; ++ uint64_t mtime_val = mtime_.get().get(); ++ bool ticked = false; ++ ++ last_tick = iterations; ++ ++ // if large time passed then multiple timer ticks might be needed ++ while (ticks>=mtime_count) ++ 
{ ++ ticks -= mtime_count; ++ mtime_count = mtime_freq; ++ mtime_val += 1; ++ ticked = true; ++ } ++ ++ // any remaining ticks taken of mtime countdown ++ if (ticks) ++ mtime_count -= ticks; ++ ++ mtime_.put(mtime_val); ++ ++ if (ticked) ++ { ++ // to improve execution speed only do interrupt checks when the timer ticks ++ // check if interrupts enabled ++ uint16_t mstatus_tag = static_cast(architecture_.getSystemRegisterTag(SYSREG_MSTATUS)); ++ auto mstatus_bits = regFile->get( { RegisterType::SYSTEM, mstatus_tag } ).get(); ++ if (mstatus_bits & MSTATUS_MIE_MASK) ++ if (mtime_val >= mtimecmp_.get().get()) ++ return static_cast(InterruptId::TIMER); ++ } ++ ++ return -1; ++} ++ ++} // namespace riscv ++} // namespace arch ++} // namespace simeng +diff --git a/src/lib/kernel/Linux.cc b/src/lib/kernel/Linux.cc +index 02de8950..bc060bba 100644 +--- a/src/lib/kernel/Linux.cc ++++ b/src/lib/kernel/Linux.cc +@@ -29,10 +29,12 @@ void Linux::createProcess(const LinuxProcess& process) { + .currentBrk = process.getHeapStart(), + .initialStackPointer = process.getStackPointer(), + .mmapRegion = process.getMmapStart(), +- .pageSize = process.getPageSize()}); ++ .pageSize = process.getPageSize(), ++ }); + processStates_.back().fileDescriptorTable.push_back(STDIN_FILENO); + processStates_.back().fileDescriptorTable.push_back(STDOUT_FILENO); + processStates_.back().fileDescriptorTable.push_back(STDERR_FILENO); ++ processStates_.back().process = &process; + + // Define vector of all currently supported special file paths & files. + supportedSpecialFiles_.insert( +@@ -649,5 +651,11 @@ int64_t Linux::writev(int64_t fd, const void* iovdata, int iovcnt) { + return ::writev(hfd, reinterpret_cast(iovdata), iovcnt); + } + ++/** Lookup symbol value from table in elf file. */ ++bool Linux::lookupSymbolValue(const std::string symbol, uint64_t& value) ++{ ++ processStates_[0].process->lookupSymbolValue(symbol,value); ++} ++ + } // namespace kernel + } // namespace simeng +diff --git a/src/lib/kernel/LinuxProcess.cc b/src/lib/kernel/LinuxProcess.cc +index 31e36d7f..3279652a 100644 +--- a/src/lib/kernel/LinuxProcess.cc ++++ b/src/lib/kernel/LinuxProcess.cc +@@ -24,7 +24,7 @@ LinuxProcess::LinuxProcess(const std::vector& commandLine, + // Parse ELF file + assert(commandLine.size() > 0); + char* unwrappedProcImgPtr; +- Elf elf(commandLine[0], &unwrappedProcImgPtr); ++ Elf elf(commandLine[0], &unwrappedProcImgPtr,symbols_); + if (!elf.isValid()) { + return; + } +@@ -178,5 +178,17 @@ void LinuxProcess::createStack(char** processImage) { + (*processImage) + stackPointer_); + } + ++bool LinuxProcess::lookupSymbolValue(const std::string symbol, uint64_t& value) const ++{ ++ auto lookup = symbols_.find(symbol); ++ if (lookup==symbols_.end()) ++ return false; ++ else ++ { ++ value = lookup->second; ++ return true; ++ } ++} ++ + } // namespace kernel + } // namespace simeng +diff --git a/src/lib/models/emulation/Core.cc b/src/lib/models/emulation/Core.cc +index 0eff31d5..d9268da2 100644 +--- a/src/lib/models/emulation/Core.cc ++++ b/src/lib/models/emulation/Core.cc +@@ -20,7 +20,8 @@ Core::Core(MemoryInterface& instructionMemory, MemoryInterface& dataMemory, + isa_(isa), + pc_(entryPoint), + registerFileSet_(isa.getRegisterFileStructures()), +- architecturalRegisterFileSet_(registerFileSet_) { ++ architecturalRegisterFileSet_(registerFileSet_), ++ interruptId_(-1) { + // Pre-load the first instruction + instructionMemory_.requestRead({pc_, FETCH_SIZE}); + +@@ -144,11 +145,16 @@ void Core::tick() { + } + + execute(uop); +- 
isa_.updateSystemTimerRegisters(®isterFileSet_, ticks_); ++ ++ interruptId_ = isa_.updateSystemTimerRegisters(®isterFileSet_, ticks_); + } + + void Core::execute(std::shared_ptr& uop) { +- uop->execute(); ++ ++ if (interruptId_>=0) ++ uop->raiseInterrupt(interruptId_); ++ else ++ uop->execute(); + + if (uop->exceptionEncountered()) { + instructionsExecuted_++; +diff --git a/src/lib/models/mcu/Core.cc b/src/lib/models/mcu/Core.cc +new file mode 100644 +index 00000000..a085d7a3 +--- /dev/null ++++ b/src/lib/models/mcu/Core.cc +@@ -0,0 +1,515 @@ ++#include "simeng/models/mcu/Core.hh" ++ ++#include ++#include ++#include ++#include ++ ++#include "simeng/arch/riscv/SystemRegister.hh" ++ ++namespace simeng { ++namespace models { ++namespace mcu { ++ ++// TODO: Replace with config options ++const unsigned int blockSize = 16; ++const unsigned int clockFrequency = 2.5 * 1e9; ++ ++Core::Core(MemoryInterface& instructionMemory, MemoryInterface& dataMemory, ++ uint64_t processMemorySize, uint64_t entryPoint, ++ const arch::Architecture& isa, BranchPredictor& branchPredictor, YAML::Node config) ++ : dataMemory_(dataMemory), ++ isa_(isa), ++ registerFileSet_(isa.getRegisterFileStructures()), ++ architecturalRegisterFileSet_(registerFileSet_), ++ fetchToDecodeBuffer_(1, {}), ++ decodeToExecuteBuffer_(1, nullptr, 1), ++ completionSlots_(2, {1, nullptr}), ++ regDepMap_(isa.getRegisterFileStructures(), registerFileSet_), ++ fetchUnit_(fetchToDecodeBuffer_, instructionMemory, processMemorySize, ++ entryPoint, blockSize, isa, branchPredictor), ++ decodeUnit_(fetchToDecodeBuffer_, decodeToExecuteBuffer_, ++ branchPredictor, ++ [this](auto instruction) { return canIssue(instruction); }), ++ writebackUnit_(completionSlots_, registerFileSet_, [](auto insnId) {}, ++ [this](auto instruction) {removeDep(instruction);}, ++ [this](auto instruction) { return removeInstrOrderQ(instruction); }), ++ loadStoreQueue_(4, dataMemory, { completionSlots_.data()+1, 1 }, [this](auto regs, auto values) { forwardOperands(regs, values); }, false, 4, 4, 2, 1, 1), ++ executeUnit_( ++ decodeToExecuteBuffer_, completionSlots_[0], ++ [this](auto regs, auto values) { forwardOperands(regs, values); }, ++ [this](auto instruction) { loadStoreQueue_.addLoad(instruction); }, ++ [this](auto instruction) { loadStoreQueue_.addStore(instruction); }, ++ [this](auto instruction) { raiseException(instruction); }, ++ [this](auto instruction) { addInstrOrderQ(instruction); }, ++ [this]() { return isInterruptPending(); }, ++ branchPredictor, false), ++ interruptId_(-1) { ++ // Query and apply initial state ++ auto state = isa.getInitialState(); ++ applyStateChange(state); ++ ++ maxStallCycleTimeout = -1; ++ maxSimCycleTimeout = -1; ++ maxInstrTimeout = -1; ++ if(config["Core"]["EnableHaltCheck"].IsDefined() && config["Core"]["EnableHaltCheck"].as()) { ++ enableHaltCheck = true; ++ if(config["Core"]["MaxStallCycleTimeout"].IsDefined()) { ++ maxStallCycleTimeout = config["Core"]["MaxStallCycleTimeout"].as(); ++ } ++ if(config["Core"]["MaxSimCycleTimeout"].IsDefined()) { ++ maxSimCycleTimeout = config["Core"]["MaxSimCycleTimeout"].as(); ++ } ++ if(config["Core"]["MaxInstrTimeout"].IsDefined()) { ++ maxInstrTimeout = config["Core"]["MaxInstrTimeout"].as(); ++ } ++ } ++}; ++ ++void Core::checkHalting() { ++ if(!enableHaltCheck) return; ++ ++ if (((ticks_ - lastCommitTick_) > maxStallCycleTimeout)) { ++ std::cout << std::dec << "[SimEng:Core] Max Pipeline stall cycle timeout reached at tick: " << (ticks_ - lastCommitTick_) << std::endl; ++ hasHalted_ = true; 
++ } ++ ++ if((ticks_ > maxSimCycleTimeout)) { ++ std::cout << std::dec << "[SimEng:Core] Max Simulation cycle timeout reached at tick: " << ticks_ << std::endl; ++ hasHalted_ = true; ++ } ++ ++ if((getInstructionsRetiredCount() > maxInstrTimeout)) { ++ std::cout << std::dec << "[SimEng:Core] Max Instruction count timeout reached at tick: " << ticks_ << std::endl; ++ hasHalted_ = true; ++ } ++} ++ ++void Core::tick() { ++ ticks_++; ++ ++ checkHalting(); ++ ++ if (hasHalted_) return; ++ ++ if (exceptionHandler_ != nullptr) { ++ processExceptionHandler(); ++ return; ++ } ++ ++ // Writeback must be ticked at start of cycle, to ensure decode reads the ++ // correct values ++ // writebackUnit_.tick(); ++ // for(std::shared_ptr inst: writebackUnit_.getInstsForTrace()) { ++ // uint16_t sysreg_instrret = isa_.getSystemRegisterTag(arch::riscv32::riscv_sysreg::SYSREG_INSTRRET); ++ // uint16_t sysreg_cycle = isa_.getSystemRegisterTag(arch::riscv32::riscv_sysreg::SYSREG_CYCLE); ++ // registerFileSet_.set(Register{0x2, sysreg_instrret}, RegisterValue(static_cast(writebackUnit_.getInstructionsWrittenCount()), 4)); ++ // registerFileSet_.set(Register{0x2, sysreg_cycle}, RegisterValue(static_cast(ticks_), 4)); ++ // isa_.updateInstrTrace(inst, ®isterFileSet_, ticks_); ++ // if(inst->isLoad()) { ++ // loadStoreQueue_.commitLoad(inst); ++ // } else if(inst->isStoreData()) { ++ // loadStoreQueue_.commitStore(inst); ++ // } ++ // lastCommitTick_ = ticks_; ++ // } ++ // writebackUnit_.traceFinished(); ++ ++ ++ loadStoreQueue_.processResponse(); ++ completionSlots_[1].tick(); ++ ++ // Tick units ++ fetchUnit_.tick(); ++ decodeUnit_.tick(); ++ executeUnit_.tick(); ++ ++ // Wipe any data read responses, as they will have been handled by this point ++ //dataMemory_.clearCompletedReads(); ++ ++ loadStoreQueue_.tick(); ++ // Writeback must be ticked at start of cycle, to ensure decode reads the ++ // correct values ++ writebackUnit_.tick(); ++ for(std::shared_ptr inst: writebackUnit_.getInstsForTrace()) { ++ uint16_t sysreg_instrret = isa_.getSystemRegisterTag(arch::riscv::riscv_sysreg::SYSREG_INSTRRET); ++ uint16_t sysreg_cycle = isa_.getSystemRegisterTag(arch::riscv::riscv_sysreg::SYSREG_CYCLE); ++ registerFileSet_.set(Register{0x2, sysreg_instrret}, RegisterValue(static_cast(writebackUnit_.getInstructionsWrittenCount()), 4)); ++ registerFileSet_.set(Register{0x2, sysreg_cycle}, RegisterValue(static_cast(ticks_), 4)); ++ isa_.updateInstrTrace(inst, ®isterFileSet_, ticks_); ++ if(inst->isLoad()) { ++ loadStoreQueue_.commitLoad(inst); ++ } else if(inst->isStoreData()) { ++ loadStoreQueue_.commitStore(inst); ++ } ++ lastCommitTick_ = ticks_; ++ } ++ // writebackUnit_.traceFinished(); ++ // Read pending registers for ready-to-execute uop; must happen after execute ++ // to allow operand forwarding to take place first ++ // readRegisters(); ++ ++ // Tick buffers ++ // Each unit must have wiped the entries at the head of the buffer after use, ++ // as these will now loop around and become the tail. 
++ fetchToDecodeBuffer_.tick(); ++ decodeToExecuteBuffer_.tick(); ++ completionSlots_[0].tick(); ++ // for (auto& buffer : completionSlots_) { ++ // buffer.tick(); ++ // } ++ ++ // if (exceptionGenerated_) { ++ // handleException(); ++ // //fetchUnit_.requestFromPC(); ++ // return; ++ // } ++ ++ // Check for flush ++ if (executeUnit_.shouldFlush()) { ++ // Flush was requested at execute stage ++ // Update PC and wipe younger buffers (Fetch/Decode, Decode/Execute) ++ auto targetAddress = executeUnit_.getFlushAddress(); ++ ++ fetchUnit_.flushLoopBuffer(); ++ fetchUnit_.updatePC(targetAddress); ++ fetchUnit_.flushPredictor(targetAddress); ++ // Ensure instructions in the buffer if any are set to be flushed before being removed, this helps with removing the respective dependencies if any ++ decodeUnit_.purgeFlushed(); ++ executeUnit_.purgeFlushed(); ++ fetchToDecodeBuffer_.fill({}); ++ decodeToExecuteBuffer_.fill(nullptr); ++ loadStoreQueue_.purgeFlushed(); ++ regDepMap_.purgeFlushed(); ++ ++ flushes_++; ++ } else if (decodeUnit_.shouldFlush()) { ++ assert(false && "Decode unit should not generate flush"); ++ // Flush was requested at decode stage ++ // Update PC and wipe Fetch/Decode buffer. ++ auto targetAddress = decodeUnit_.getFlushAddress(); ++ ++ fetchUnit_.flushLoopBuffer(); ++ fetchUnit_.updatePC(targetAddress); ++ fetchToDecodeBuffer_.fill({}); ++ ++ flushes_++; ++ } ++ ++ if (exceptionGenerated_) { ++ handleException(); ++ //fetchUnit_.requestFromPC(); ++ return; ++ } ++ ++ fetchUnit_.requestFromPC(); ++ interruptId_ = isa_.updateSystemTimerRegisters(®isterFileSet_, ticks_); ++} ++ ++bool Core::hasHalted() const { ++ if (hasHalted_) { ++ return true; ++ } ++ ++ // Core is considered to have halted when the fetch unit has halted, there ++ // are no uops at the head of any buffer, and no exception is currently being ++ // handled. ++ bool decodePending = fetchToDecodeBuffer_.getHeadSlots()[0].size() > 0; ++ bool executePending = decodeToExecuteBuffer_.getHeadSlots()[0] != nullptr; ++ bool writebackPending = completionSlots_[0].getHeadSlots()[0] != nullptr; ++ writebackPending |= completionSlots_[1].getHeadSlots()[0] != nullptr; ++ ++ return (fetchUnit_.hasHalted() && !decodePending && !writebackPending && ++ !executePending && exceptionHandler_ == nullptr); ++} ++ ++const ArchitecturalRegisterFileSet& Core::getArchitecturalRegisterFileSet() ++ const { ++ return architecturalRegisterFileSet_; ++} ++ ++uint64_t Core::getInstructionsRetiredCount() const { ++ return writebackUnit_.getInstructionsWrittenCount(); ++} ++ ++uint64_t Core::getSystemTimer() const { ++ // TODO: This will need to be changed if we start supporting DVFS. ++ return ticks_ / (clockFrequency / 1e9); ++} ++ ++std::map Core::getStats() const { ++ auto retired = writebackUnit_.getInstructionsWrittenCount(); ++ auto ipc = retired / static_cast(ticks_); ++ std::ostringstream ipcStr; ++ ipcStr << std::setprecision(2) << ipc; ++ ++ // Sum up the branch stats reported across the execution units. 
++ uint64_t totalBranchesExecuted = 0; ++ uint64_t totalBranchMispredicts = 0; ++ totalBranchesExecuted += executeUnit_.getBranchExecutedCount(); ++ totalBranchMispredicts += executeUnit_.getBranchMispredictedCount(); ++ auto branchMissRate = 100.0f * static_cast(totalBranchMispredicts) / ++ static_cast(totalBranchesExecuted); ++ std::ostringstream branchMissRateStr; ++ branchMissRateStr << std::setprecision(3) << branchMissRate << "%"; ++ ++ return {{"cycles", std::to_string(ticks_)}, ++ {"retired", std::to_string(retired)}, ++ {"ipc", ipcStr.str()}, ++ {"flushes", std::to_string(flushes_)}, ++ {"branch.executed", std::to_string(totalBranchesExecuted)}, ++ {"branch.mispredict", std::to_string(totalBranchMispredicts)}, ++ {"branch.missrate", branchMissRateStr.str()}, ++ {"lsu.ldminlatency", std::to_string(loadStoreQueue_.getMinLdLat())}, ++ {"lsu.ldmaxlatency", std::to_string(loadStoreQueue_.getMaxLdLat())}, ++ {"lsu.ldavglatency", std::to_string(loadStoreQueue_.getAvgLdLat())}}; ++} ++ ++void Core::raiseException(const std::shared_ptr& instruction) { ++ exceptionGenerated_ = true; ++ exceptionGeneratingInstruction_ = instruction; ++} ++ ++void Core::handleException() { ++ exceptionGenerated_ = false; ++ ++ exceptionHandler_ = ++ isa_.handleException(exceptionGeneratingInstruction_, *this, dataMemory_); ++ ++ processExceptionHandler(); ++// isa_.updateInstrTrace(exceptionGeneratingInstruction_, ®isterFileSet_, ticks_); ++// lastCommitTick_ = ticks_; ++// assert(removeInstrOrderQ(exceptionGeneratingInstruction_) && "Unexpected instruction at the top of inorder instr queue on exception"); ++ ++ //TODO: This is not a good point to flush the pipeline if the exception is not changing the PC. ++ ++ // Flush pipeline ++// decodeUnit_.purgeFlushed(); ++// executeUnit_.purgeFlushed(); ++// fetchToDecodeBuffer_.fill({}); ++// decodeToExecuteBuffer_.fill(nullptr); ++// loadStoreQueue_.purgeFlushed(); ++// completionSlots_[0].fill(nullptr); ++// completionSlots_[1].fill(nullptr); ++// regDepMap_.purgeFlushed(); ++} ++ ++void Core::processExceptionHandler() { ++ assert(exceptionHandler_ != nullptr && ++ "Attempted to process an exception handler that wasn't present"); ++ if (dataMemory_.hasPendingRequests()) { ++ // Must wait for all memory requests to complete before processing the ++ // exception ++ return; ++ } ++ ++ auto success = exceptionHandler_->tick(); ++ if (!success) { ++ // Exception handler requires further ticks to complete ++ return; ++ } ++ ++ const auto& result = exceptionHandler_->getResult(); ++ ++ if (result.fatal) { ++ hasHalted_ = true; ++ std::cout << "[SimEng:Core] Halting due to fatal exception" << std::endl; ++ } else { ++ //fetchUnit_.flushLoopBuffer(); ++ fetchUnit_.updatePC(result.instructionAddress); ++ applyStateChange(result.stateChange); ++ } ++ ++ exceptionHandler_ = nullptr; ++} ++ ++void Core::loadData(const std::shared_ptr& instruction) { ++ const auto& addresses = instruction->getGeneratedAddresses(); ++ for (const auto& target : addresses) { ++ dataMemory_.requestRead(target); ++ } ++ ++ // NOTE: This model only supports zero-cycle data memory models, and will not ++ // work unless data requests are handled synchronously. 
++ for (const auto& response : dataMemory_.getCompletedReads()) { ++ instruction->supplyData(response.target.address, response.data); ++ } ++ ++ assert(instruction->hasAllData() && ++ "Load instruction failed to obtain all data this cycle"); ++ ++ instruction->execute(); ++ ++ if (instruction->isStoreData()) { ++ storeData(instruction); ++ } ++} ++ ++void Core::storeData(const std::shared_ptr& instruction) { ++ if (instruction->isStoreAddress()) { ++ auto addresses = instruction->getGeneratedAddresses(); ++ for (auto const& target : addresses) { ++ previousAddresses_.push(target); ++ } ++ } ++ if (instruction->isStoreData()) { ++ const auto data = instruction->getData(); ++ for (size_t i = 0; i < data.size(); i++) { ++ dataMemory_.requestWrite(previousAddresses_.front(), data[i]); ++ previousAddresses_.pop(); ++ } ++ } ++} ++ ++void Core::forwardOperands(const span& registers, ++ const span& values) { ++ return; ++ // assert(registers.size() == values.size() && ++ // "Mismatched register and value vector sizes"); ++ ++ // const auto& uop = decodeToExecuteBuffer_.getTailSlots()[0]; ++ // if (uop == nullptr) { ++ // return; ++ // } ++ ++ // auto sourceRegisters = uop->getOperandRegisters(); ++ // for (size_t i = 0; i < registers.size(); i++) { ++ // // Check each forwarded register vs source operands and supply for each ++ // // match ++ // for (size_t operand = 0; operand < sourceRegisters.size(); operand++) { ++ // const auto& sourceReg = sourceRegisters[operand]; ++ // if (uop->canExecute()) { ++ // return; ++ // } ++ // if (sourceReg == registers[i] && !uop->isOperandReady(operand)) { ++ // // Supply the operand ++ // uop->supplyOperand(operand, values[i]); ++ // } ++ // } ++ // } ++} ++ ++bool Core::canIssue(const std::shared_ptr& uop) { ++ if (uop->isSysCall() && inorderIQ_.size() > 0) { ++ return false; ++ } ++ if((uop->isLoad() || uop->isStoreData()) && loadStoreQueue_.isBusy()) { ++ return false; ++ } ++ if (regDepMap_.canRead(uop) && regDepMap_.canWrite(uop)) { ++ regDepMap_.insert(uop); ++ return true; ++ } ++ return false; ++} ++ ++void Core::removeDep(const std::shared_ptr& uop) { ++ regDepMap_.remove(uop); ++} ++ ++void Core::readRegisters() { ++ if (decodeToExecuteBuffer_.isStalled()) { ++ return; ++ } ++ ++ const auto& uop = decodeToExecuteBuffer_.getTailSlots()[0]; ++ if (uop == nullptr) { ++ return; ++ } ++ ++ // Register read ++ // Identify missing registers and supply values ++ const auto& sourceRegisters = uop->getOperandRegisters(); ++ for (size_t i = 0; i < sourceRegisters.size(); i++) { ++ const auto& reg = sourceRegisters[i]; ++ if (!uop->isOperandReady(i)) { ++ uop->supplyOperand(i, registerFileSet_.get(reg)); ++ } ++ } ++} ++ ++void Core::applyStateChange(const arch::ProcessStateChange& change) { ++ // Update registers in accoradance with the ProcessStateChange type ++ switch (change.type) { ++ case arch::ChangeType::INCREMENT: { ++ for (size_t i = 0; i < change.modifiedRegisters.size(); i++) { ++ registerFileSet_.set( ++ change.modifiedRegisters[i], ++ registerFileSet_.get(change.modifiedRegisters[i]).get() + ++ change.modifiedRegisterValues[i].get()); ++ } ++ break; ++ } ++ case arch::ChangeType::DECREMENT: { ++ for (size_t i = 0; i < change.modifiedRegisters.size(); i++) { ++ registerFileSet_.set( ++ change.modifiedRegisters[i], ++ registerFileSet_.get(change.modifiedRegisters[i]).get() - ++ change.modifiedRegisterValues[i].get()); ++ } ++ break; ++ } ++ default: { // arch::ChangeType::REPLACEMENT ++ // If type is ChangeType::REPLACEMENT, set new values ++ for 
(size_t i = 0; i < change.modifiedRegisters.size(); i++) { ++ registerFileSet_.set(change.modifiedRegisters[i], ++ change.modifiedRegisterValues[i]); ++ } ++ break; ++ } ++ } ++ ++ // Update memory ++ // TODO: Analyse if ChangeType::INCREMENT or ChangeType::DECREMENT case is ++ // required for memory changes ++ for (size_t i = 0; i < change.memoryAddresses.size(); i++) { ++ dataMemory_.requestWrite(change.memoryAddresses[i], ++ change.memoryAddressValues[i]); ++ } ++} ++ ++void Core::handleLoad(const std::shared_ptr& instruction) { ++ loadData(instruction); ++ if (instruction->exceptionEncountered()) { ++ raiseException(instruction); ++ return; ++ } ++ ++ forwardOperands(instruction->getDestinationRegisters(), ++ instruction->getResults()); ++ // Manually add the instruction to the writeback input buffer ++ completionSlots_[1].getTailSlots()[0] = instruction; ++} ++ ++void Core::addInstrOrderQ(const std::shared_ptr& insn) { ++ //std::cout << std::dec << ticks_ << ": Adding instruction at address: 0x" << std::hex << insn->getInstructionAddress() << std::endl; ++ inorderIQ_.push_back(insn); ++} ++ ++bool Core::removeInstrOrderQ(const std::shared_ptr& insn) { ++ if (insn == inorderIQ_.front()) { ++ //std::cout << std::dec << ticks_ << ": Removing instruction at address: 0x" << std::hex << insn->getInstructionAddress() << std::endl; ++ // if(insn->exceptionEncountered()) { ++ // exceptionGenerated_ = true; ++ // exceptionGeneratingInstruction_ = insn; ++ // handleException(); ++ // } ++ inorderIQ_.pop_front(); ++ return true; ++ } else { ++ return false; ++ } ++} ++ ++int16_t Core::isInterruptPending() { ++ if (interruptId_>=0) { ++ std::cout << std::dec << "[SimEng:Core] Interrupt Pending id: " << interruptId_ << ", at tick: " << ticks_ << std::endl; ++ return interruptId_; ++ } else { ++ return -1; ++ } ++} ++ ++} // namespace mcu ++} // namespace models ++} // namespace simeng +diff --git a/src/lib/pipeline/FetchUnit.cc b/src/lib/pipeline/FetchUnit.cc +index ade3d307..28d2eaba 100644 +--- a/src/lib/pipeline/FetchUnit.cc ++++ b/src/lib/pipeline/FetchUnit.cc +@@ -129,7 +129,7 @@ void FetchUnit::tick() { + BranchPrediction prediction = {false, 0}; + if (macroOp[0]->isBranch()) { + prediction = branchPredictor_.predict(pc_, macroOp[0]->getBranchType(), +- macroOp[0]->getKnownTarget()); ++ macroOp[0]->getKnownOffset()); + macroOp[0]->setBranchPrediction(prediction); + } + +diff --git a/src/lib/pipeline_hi/DecodeUnit.cc b/src/lib/pipeline_hi/DecodeUnit.cc +new file mode 100644 +index 00000000..86a298a1 +--- /dev/null ++++ b/src/lib/pipeline_hi/DecodeUnit.cc +@@ -0,0 +1,117 @@ ++#include "simeng/pipeline_hi/DecodeUnit.hh" ++ ++#include ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++DecodeUnit::DecodeUnit(PipelineBuffer& input, ++ PipelineBuffer>& output, ++ BranchPredictor& predictor, ++ std::function&)> canIssue) ++ : input_(input), output_(output), predictor_(predictor), canIssue_(canIssue){}; ++ ++void DecodeUnit::tick() { ++ // Stall if output buffer is stalled ++ if (output_.isStalled()) { ++ input_.stall(true); ++ return; ++ } ++ ++ shouldFlush_ = false; ++ input_.stall(false); ++ ++ // Stall if internal uop is overpopulated, otherwise add uops from input to ++ // internal buffer ++ if (microOps_.size() >= output_.getWidth()) { ++ input_.stall(true); ++ } else { ++ // Populate uop buffer with newly fetched macro-ops ++ for (size_t slot = 0; slot < input_.getWidth(); slot++) { ++ auto& macroOp = input_.getHeadSlots()[slot]; ++ ++ if (macroOp.size() == 0) { ++ // Nothing to process for 
this macro-op ++ continue; ++ } ++ ++ for (uint8_t index = 0; index < macroOp.size(); index++) { ++ microOps_.push_back(std::move(macroOp[index])); ++ } ++ ++ input_.getHeadSlots()[slot].clear(); ++ } ++ } ++ ++ // Process uops in buffer ++ for (size_t slot = 0; slot < output_.getWidth(); slot++) { ++ // If there's no more uops to decode, exit loop early ++ if (!microOps_.size()) break; ++ ++ //Check for dependencies before forwarding to next stage ++ //Stop-gap implementation ++ if (!canIssue_(microOps_.front())) break; ++ ++ // Move uop to output buffer and remove from internal buffer ++ auto& uop = (output_.getTailSlots()[slot] = std::move(microOps_.front())); ++ microOps_.pop_front(); ++ ++ // Check preliminary branch prediction results now that the instruction is ++ // decoded. Identifies: ++ // - Non-branch instructions mistakenly predicted as branches ++ // - Incorrect targets for immediate branches ++ // auto [misprediction, correctAddress] = uop->checkEarlyBranchMisprediction(); ++ // if (misprediction) { ++ // earlyFlushes_++; ++ // shouldFlush_ = true; ++ // pc_ = correctAddress; ++ ++ // if (!uop->isBranch()) { ++ // // Non-branch incorrectly predicted as a branch; let the predictor know ++ // predictor_.update(uop->getInstructionAddress(), false, pc_, ++ // uop->getBranchType()); ++ // } ++ // // Remove macro-operations in microOps_ buffer after macro-operation ++ // // decoded in this cycle ++ // auto uopIt = microOps_.begin(); ++ // // Find first microOps_ entry not belonging to same address as flushing ++ // // instruction ++ // while (uopIt != microOps_.end()) { ++ // if ((*uopIt)->getInstructionAddress() != uop->getInstructionAddress()) { ++ // break; ++ // } else { ++ // uopIt++; ++ // } ++ // } ++ // // Remove all entries after first macro-operation in buffer ++ // while (uopIt != microOps_.end()) { ++ // uopIt = microOps_.erase(uopIt); ++ // } ++ ++ // // Skip processing remaining uops, as they need to be flushed ++ // break; ++ // } ++ } ++} ++ ++bool DecodeUnit::shouldFlush() const { return shouldFlush_; } ++uint64_t DecodeUnit::getFlushAddress() const { return pc_; } ++uint64_t DecodeUnit::getEarlyFlushes() const { return earlyFlushes_; }; ++ ++void DecodeUnit::purgeFlushed() { ++ if (output_.getTailSlots()[0] != nullptr) { ++ output_.getTailSlots()[0]->setFlushed(); ++ } ++ ++ if (input_.getHeadSlots()[0].size() != 0) { ++ input_.getHeadSlots()[0][0]->setFlushed(); ++ } ++ ++ if (microOps_.size()) ++ microOps_.front()->setFlushed(); ++ microOps_.clear(); ++ input_.stall(false); ++} ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/lib/pipeline_hi/DispatchIssueUnit.cc b/src/lib/pipeline_hi/DispatchIssueUnit.cc +new file mode 100644 +index 00000000..93ce9fa3 +--- /dev/null ++++ b/src/lib/pipeline_hi/DispatchIssueUnit.cc +@@ -0,0 +1,269 @@ ++#include "simeng/pipeline_hi/DispatchIssueUnit.hh" ++ ++#include ++#include ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++DispatchIssueUnit::DispatchIssueUnit( ++ PipelineBuffer>& fromRename, ++ std::vector>>& issuePorts, ++ const RegisterFileSet& registerFileSet, PortAllocator& portAllocator, ++ const std::vector& physicalRegisterStructure, YAML::Node config) ++ : input_(fromRename), ++ issuePorts_(issuePorts), ++ registerFileSet_(registerFileSet), ++ scoreboard_(physicalRegisterStructure.size()), ++ dependencyMatrix_(physicalRegisterStructure.size()), ++ portAllocator_(portAllocator) { ++ // Initialise scoreboard ++ for (size_t type = 0; type < physicalRegisterStructure.size(); type++) { ++ 
scoreboard_[type].assign(physicalRegisterStructure[type], true); ++ dependencyMatrix_[type].resize(physicalRegisterStructure[type]); ++ } ++ // Create set of reservation station structs with correct issue port ++ // mappings ++ for (size_t i = 0; i < config["Reservation-Stations"].size(); i++) { ++ // Iterate over each reservation station in config ++ auto reservation_station = config["Reservation-Stations"][i]; ++ // Create ReservationStation struct to be stored ++ ReservationStation rs = { ++ reservation_station["Size"].as(), ++ reservation_station["Dispatch-Rate"].as(), ++ 0, ++ {}}; ++ // Resize rs port attribute to match what's defined in config file ++ rs.ports.resize(reservation_station["Ports"].size()); ++ for (size_t j = 0; j < reservation_station["Ports"].size(); j++) { ++ // Iterate over issue ports in config ++ uint16_t issue_port = reservation_station["Ports"][j].as(); ++ rs.ports[j].issuePort = issue_port; ++ // Add port mapping entry, resizing vector if needed ++ if ((issue_port + 1) > portMapping_.size()) { ++ portMapping_.resize((issue_port + 1)); ++ } ++ portMapping_[issue_port] = {i, j}; ++ } ++ reservationStations_.push_back(rs); ++ } ++ for (uint16_t i = 0; i < reservationStations_.size(); i++) ++ flushed_.emplace(i, std::initializer_list>{}); ++} ++ ++void DispatchIssueUnit::tick() { ++ input_.stall(false); ++ ++ /** Stores the number of instructions dispatched for each ++ * reservation station. */ ++ std::vector dispatches( ++ static_cast(reservationStations_.size()), 0); ++ ++ for (size_t slot = 0; slot < input_.getWidth(); slot++) { ++ auto& uop = input_.getHeadSlots()[slot]; ++ if (uop == nullptr) { ++ continue; ++ } ++ ++ const std::vector& supportedPorts = uop->getSupportedPorts(); ++ if (uop->exceptionEncountered()) { ++ // Exception; mark as ready to commit, and remove from pipeline ++ uop->setCommitReady(); ++ input_.getHeadSlots()[slot] = nullptr; ++ continue; ++ } ++ // Allocate issue port to uop ++ uint16_t port = portAllocator_.allocate(supportedPorts); ++ uint16_t RS_Index = portMapping_[port].first; ++ uint16_t RS_Port = portMapping_[port].second; ++ assert(RS_Index < reservationStations_.size() && ++ "Allocated port inaccessible"); ++ ReservationStation& rs = reservationStations_[RS_Index]; ++ ++ // When appropriate, stall uop or input buffer if stall buffer full ++ if (rs.currentSize == rs.capacity || ++ dispatches[RS_Index] == rs.dispatchRate) { ++ // Deallocate port given ++ portAllocator_.deallocate(port); ++ input_.stall(true); ++ rsStalls_++; ++ return; ++ } ++ ++ // Assume the uop will be ready ++ bool ready = true; ++ ++ // Register read ++ // Identify remaining missing registers and supply values ++ auto& sourceRegisters = uop->getOperandRegisters(); ++ for (uint16_t i = 0; i < sourceRegisters.size(); i++) { ++ const auto& reg = sourceRegisters[i]; ++ ++ if (!uop->isOperandReady(i)) { ++ // The operand hasn't already been supplied ++ if (scoreboard_[reg.type][reg.tag]) { ++ // The scoreboard says it's ready; read and supply the register value ++ uop->supplyOperand(i, registerFileSet_.get(reg)); ++ } else { ++ // This register isn't ready yet. 
Register this uop to the dependency ++ // matrix for a more efficient lookup later ++ dependencyMatrix_[reg.type][reg.tag].push_back({uop, port, i}); ++ ready = false; ++ } ++ } ++ } ++ ++ // Set scoreboard for all destination registers as not ready ++ auto& destinationRegisters = uop->getDestinationRegisters(); ++ for (const auto& reg : destinationRegisters) { ++ scoreboard_[reg.type][reg.tag] = false; ++ } ++ ++ // Increment dispatches made and RS occupied entries size ++ dispatches[RS_Index]++; ++ rs.currentSize++; ++ ++ if (ready) { ++ rs.ports[RS_Port].ready.push_back(std::move(uop)); ++ } ++ ++ input_.getHeadSlots()[slot] = nullptr; ++ } ++} ++ ++void DispatchIssueUnit::issue() { ++ int issued = 0; ++ // Check the ready queues, and issue an instruction from each if the ++ // corresponding port isn't blocked ++ for (size_t i = 0; i < issuePorts_.size(); i++) { ++ ReservationStation& rs = reservationStations_[portMapping_[i].first]; ++ auto& queue = rs.ports[portMapping_[i].second].ready; ++ if (issuePorts_[i].isStalled()) { ++ if (queue.size() > 0) { ++ portBusyStalls_++; ++ } ++ continue; ++ } ++ ++ if (queue.size() > 0) { ++ auto& uop = queue.front(); ++ issuePorts_[i].getTailSlots()[0] = std::move(uop); ++ queue.pop_front(); ++ ++ // Inform the port allocator that an instruction issued ++ portAllocator_.issued(i); ++ issued++; ++ ++ assert(rs.currentSize > 0); ++ rs.currentSize--; ++ } ++ } ++ ++ if (issued == 0) { ++ for (const auto& rs : reservationStations_) { ++ if (rs.currentSize != 0) { ++ backendStalls_++; ++ return; ++ } ++ } ++ frontendStalls_++; ++ } ++} ++ ++void DispatchIssueUnit::forwardOperands(const span& registers, ++ const span& values) { ++ assert(registers.size() == values.size() && ++ "Mismatched register and value vector sizes"); ++ ++ for (size_t i = 0; i < registers.size(); i++) { ++ const auto& reg = registers[i]; ++ // Flag scoreboard as ready now result is available ++ scoreboard_[reg.type][reg.tag] = true; ++ ++ // Supply the value to all dependent uops ++ const auto& dependents = dependencyMatrix_[reg.type][reg.tag]; ++ for (auto& entry : dependents) { ++ entry.uop->supplyOperand(entry.operandIndex, values[i]); ++ if (entry.uop->canExecute()) { ++ // Add the now-ready instruction to the relevant ready queue ++ auto rsInfo = portMapping_[entry.port]; ++ reservationStations_[rsInfo.first].ports[rsInfo.second].ready.push_back( ++ std::move(entry.uop)); ++ } ++ } ++ ++ // Clear the dependency list ++ dependencyMatrix_[reg.type][reg.tag].clear(); ++ } ++} ++ ++void DispatchIssueUnit::setRegisterReady(Register reg) { ++ scoreboard_[reg.type][reg.tag] = true; ++} ++ ++void DispatchIssueUnit::purgeFlushed() { ++ for (size_t i = 0; i < reservationStations_.size(); i++) { ++ // Search the ready queues for flushed instructions and remove them ++ auto& rs = reservationStations_[i]; ++ for (auto& port : rs.ports) { ++ // Ready queue ++ auto readyIter = port.ready.begin(); ++ while (readyIter != port.ready.end()) { ++ auto& uop = *readyIter; ++ if (uop->isFlushed()) { ++ portAllocator_.deallocate(port.issuePort); ++ readyIter = port.ready.erase(readyIter); ++ assert(rs.currentSize > 0); ++ rs.currentSize--; ++ } else { ++ readyIter++; ++ } ++ } ++ } ++ } ++ ++ // Collect flushed instructions and remove them from the dependency matrix ++ for (auto& it : flushed_) it.second.clear(); ++ for (auto& registerType : dependencyMatrix_) { ++ for (auto& dependencyList : registerType) { ++ auto it = dependencyList.begin(); ++ while (it != dependencyList.end()) { ++ auto& entry = 
*it; ++ if (entry.uop->isFlushed()) { ++ auto rsIndex = portMapping_[entry.port].first; ++ if (!flushed_[rsIndex].count(entry.uop)) { ++ flushed_[rsIndex].insert(entry.uop); ++ portAllocator_.deallocate(entry.port); ++ } ++ it = dependencyList.erase(it); ++ } else { ++ it++; ++ } ++ } ++ } ++ } ++ ++ // Update reservation station size ++ for (uint8_t i = 0; i < reservationStations_.size(); i++) { ++ assert(reservationStations_[i].currentSize >= flushed_[i].size()); ++ reservationStations_[i].currentSize -= flushed_[i].size(); ++ } ++} ++ ++uint64_t DispatchIssueUnit::getRSStalls() const { return rsStalls_; } ++uint64_t DispatchIssueUnit::getFrontendStalls() const { ++ return frontendStalls_; ++} ++uint64_t DispatchIssueUnit::getBackendStalls() const { return backendStalls_; } ++uint64_t DispatchIssueUnit::getPortBusyStalls() const { ++ return portBusyStalls_; ++} ++ ++void DispatchIssueUnit::getRSSizes(std::vector& sizes) const { ++ for (auto& rs : reservationStations_) { ++ sizes.push_back(rs.capacity - rs.currentSize); ++ } ++} ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/lib/pipeline_hi/ExecuteUnit.cc b/src/lib/pipeline_hi/ExecuteUnit.cc +new file mode 100644 +index 00000000..e3b5089d +--- /dev/null ++++ b/src/lib/pipeline_hi/ExecuteUnit.cc +@@ -0,0 +1,255 @@ ++#include "simeng/pipeline_hi/ExecuteUnit.hh" ++ ++#include ++#include ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++ExecuteUnit::ExecuteUnit( ++ PipelineBuffer>& input, ++ PipelineBuffer>& output, ++ std::function, span)> forwardOperands, ++ std::function&)> handleLoad, ++ std::function&)> handleStore, ++ std::function&)> raiseException, ++ std::function&)> addInstrOrderQ, ++ std::function isInterruptPending, ++ BranchPredictor& predictor, bool pipelined, ++ const std::vector& blockingGroups) ++ : input_(input), ++ output_(output), ++ forwardOperands_(forwardOperands), ++ handleLoad_(handleLoad), ++ handleStore_(handleStore), ++ raiseException_(raiseException), ++ addInstrOrderQ_(addInstrOrderQ), ++ isInterruptPending_(isInterruptPending), ++ predictor_(predictor), ++ pipelined_(pipelined), ++ blockingGroups_(blockingGroups) {} ++ ++void ExecuteUnit::tick() { ++ tickCounter_++; ++ shouldFlush_ = false; ++ ++ if (stallUntil_ <= tickCounter_) { ++ input_.stall(false); ++ // Input isn't stalled; process instruction and add to pipeline ++ ++ auto& uop = input_.getHeadSlots()[0]; ++ if (uop != nullptr) { ++ if (!uop->isFlushed()) { ++ // Retrieve execution latency from the instruction ++ auto latency = uop->getLatency(); ++ cycles_++; ++ // Block uop execution if appropriate ++ if (std::find(blockingGroups_.begin(), blockingGroups_.end(), ++ uop->getGroup()) != blockingGroups_.end()) { ++ if (operationsStalled_.size() == 0) { ++ // Add uop to pipeline ++ pipeline_.push_back({nullptr, tickCounter_ + latency - 1}); ++ pipeline_.back().insn = std::move(uop); ++ operationsStalled_.push_back(pipeline_.back().insn); ++ } else { ++ // Stall execution start cycle ++ operationsStalled_.push_back(nullptr); ++ operationsStalled_.back() = std::move(uop); ++ } ++ } else if (latency == 1 && pipeline_.size() == 0) { ++ // Pipeline is empty and insn will execute this cycle; bypass ++ execute(uop); ++ } else { ++ // This instruction may take more than a single cycle; check for a ++ // stall. For unpipelined units, the unit will stall for the full ++ // instruction duration. ++ auto stallCycles = ++ pipelined_ ? 
uop->getStallCycles() : uop->getLatency(); ++ if (stallCycles > 1) { ++ stallUntil_ = tickCounter_ + stallCycles - 1; ++ input_.stall(true); ++ } ++ ++ // Add insn to pipeline ++ pipeline_.push_back({nullptr, tickCounter_ + latency - 1}); ++ pipeline_.back().insn = std::move(uop); ++ } ++ } ++ input_.getHeadSlots()[0] = nullptr; ++ } ++ } ++ ++ if (pipeline_.size() == 0) { ++ return; ++ } ++ ++ auto& head = pipeline_.front(); ++ if (head.readyAt <= tickCounter_) { ++ // Check if the completion of an operation would unblock ++ // another stalled operation. ++ if (std::find(blockingGroups_.begin(), blockingGroups_.end(), ++ head.insn->getGroup()) != blockingGroups_.end()) { ++ operationsStalled_.pop_front(); ++ if (operationsStalled_.size() > 0) { ++ // Add uop to pipeline ++ auto& uop = operationsStalled_.front(); ++ pipeline_.push_back({nullptr, tickCounter_ + uop->getLatency() - 1}); ++ pipeline_.back().insn = std::move(uop); ++ operationsStalled_.front() = pipeline_.back().insn; ++ } ++ } ++ execute(head.insn); ++ pipeline_.pop_front(); ++ } ++} ++ ++void ExecuteUnit::execute(std::shared_ptr& uop) { ++ assert(uop->canExecute() && ++ "Attempted to execute an instruction before it was ready"); ++ ++ int16_t pendingInterruptId = isInterruptPending_(); ++ if(pendingInterruptId>=0) { ++ //std::cout << std::hex << "Execution encountered pending interrupt, PC 0x" << uop->getInstructionAddress() << std::endl; ++ uop->raiseInterrupt(pendingInterruptId); ++ uop->setFlushed(); ++ raiseException_(uop); ++ shouldFlush_ = true; ++ return; ++ } ++ ++ addInstrOrderQ_(uop); ++ if (uop->exceptionEncountered()) { ++ // Exception encountered prior to execution ++ // TODO: Identify whether this can be removed; executing an ++ // exception-encountered uop would have to be guaranteed to be safe ++ raiseException_(uop); ++ return; ++ } ++ ++ if (uop->isLoad()) { ++ uop->generateAddresses(); ++ if (uop->exceptionEncountered()) { ++ // Exception; don't pass handle load function ++ raiseException_(uop); ++ return; ++ } ++ handleLoad_(uop); ++ return; ++ } else if (uop->isStoreAddress() || uop->isStoreData()) { ++ if (uop->isStoreAddress()) { ++ uop->generateAddresses(); ++ } ++ if (uop->isStoreData()) { ++ uop->execute(); ++ } ++ handleStore_(uop); ++ } else { ++ uop->execute(); ++ } ++ ++ if (uop->exceptionEncountered()) { ++ // Exception; don't forward results, don't pass uop forward ++ raiseException_(uop); ++ shouldFlush_ = true; ++ //TODO: Let the instruction go into writeback stage ++ // return; ++ } ++ ++ if (uop->isBranch()) { ++ pc_ = uop->getBranchAddress(); ++ ++ // Update branch predictor with branch results ++ predictor_.update(uop->getInstructionAddress(), uop->wasBranchTaken(), pc_, ++ uop->getBranchType()); ++ ++ // Update the branch instruction counter ++ branchesExecuted_++; ++ ++ if (uop->wasBranchMispredicted()) { ++ //std::cout << std::dec << tickCounter_ << std::hex << ": Misprediction iaddr: 0x" << uop->getInstructionAddress() << ", " << uop->getBranchPrediction().taken << std::endl; ++ // Misprediction; flush the pipeline ++ shouldFlush_ = true; ++ flushAfter_ = uop->getInstructionId(); ++ // Update the branch misprediction counter ++ branchMispredicts_++; ++ } ++ } ++ ++ // Operand forwarding; allows a dependent uop to execute next cycle ++ //if (!uop->isMul() && !uop->isDiv()) { ++ // forwardOperands_(uop->getDestinationRegisters(), uop->getResults()); ++ //} ++ ++ output_.getTailSlots()[0] = std::move(uop); ++} ++ ++bool ExecuteUnit::shouldFlush() const { return shouldFlush_; } 
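The execute unit above tracks in-flight work as (instruction, readyAt) pairs and retires the head entry once its ready tick has been reached. A minimal standalone sketch of that bookkeeping, using hypothetical names and ignoring the blocking-group and stall handling, looks as follows:

```cpp
#include <cstdint>
#include <deque>

struct InFlightOp {
  int id;            // stand-in for the instruction pointer
  uint64_t readyAt;  // tick at which the result becomes available
};

class LatencyPipe {
  std::deque<InFlightOp> pipe_;
  uint64_t tick_ = 0;

 public:
  // Accept an operation whose result is visible `latency` cycles from now.
  void issue(int id, uint64_t latency) { pipe_.push_back({id, tick_ + latency}); }

  // Advance one cycle; returns the id of the completed head, or -1.
  int tick() {
    ++tick_;
    if (!pipe_.empty() && pipe_.front().readyAt <= tick_) {
      int done = pipe_.front().id;
      pipe_.pop_front();
      return done;
    }
    return -1;
  }
};
```

Because completion order is insertion order, operations that would otherwise be overtaken (unpipelined units, multi-cycle stall groups) have to hold back the input instead, which is what the stallUntil_ check at the top of ExecuteUnit::tick() provides.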
++uint64_t ExecuteUnit::getFlushAddress() const { return pc_; } ++uint64_t ExecuteUnit::getFlushSeqId() const { return flushAfter_; } ++ ++void ExecuteUnit::purgeFlushed() { ++ auto& uop = input_.getHeadSlots()[0]; ++ if (uop != nullptr) { ++ if (!uop->isFlushed()) { ++ uop->setFlushed(); ++ } ++ } ++ ++ if (pipeline_.size() == 0) { ++ return; ++ } ++ ++ // If the newest instruction has been flushed, clear any stalls. ++ if (pipeline_.back().insn->isFlushed()) { ++ stallUntil_ = tickCounter_; ++ } ++ ++ // Iterate over the pipeline and remove flushed instructions ++ auto it = pipeline_.begin(); ++ while (it != pipeline_.end()) { ++ auto& entry = *it; ++ if (entry.insn->isFlushed()) { ++ it = pipeline_.erase(it); ++ } else { ++ it++; ++ } ++ } ++ ++ // If first blocking in-flight instruction is flushed, ensure another ++ // non-flushed stalled instruction takes it place in the pipeline if ++ // available. ++ bool replace = false; ++ if (operationsStalled_.size() > 0 && ++ operationsStalled_.front()->isFlushed()) { ++ replace = true; ++ } ++ auto itStall = operationsStalled_.begin(); ++ while (itStall != operationsStalled_.end()) { ++ auto& entry = *itStall; ++ if (entry->isFlushed()) { ++ itStall = operationsStalled_.erase(itStall); ++ } else { ++ itStall++; ++ } ++ } ++ ++ if (replace && operationsStalled_.size() > 0) { ++ // Add uop to pipeline ++ auto& uop = operationsStalled_.front(); ++ pipeline_.push_back({nullptr, tickCounter_ + uop->getLatency() - 1}); ++ pipeline_.back().insn = std::move(uop); ++ operationsStalled_.front() = pipeline_.back().insn; ++ } ++} ++ ++uint64_t ExecuteUnit::getBranchExecutedCount() const { ++ return branchesExecuted_; ++} ++uint64_t ExecuteUnit::getBranchMispredictedCount() const { ++ return branchMispredicts_; ++} ++ ++uint64_t ExecuteUnit::getCycles() const { return cycles_; } ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/lib/pipeline_hi/FetchUnit.cc b/src/lib/pipeline_hi/FetchUnit.cc +new file mode 100644 +index 00000000..4de190ef +--- /dev/null ++++ b/src/lib/pipeline_hi/FetchUnit.cc +@@ -0,0 +1,265 @@ ++#include "simeng/pipeline_hi/FetchUnit.hh" ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++FetchUnit::FetchUnit(PipelineBuffer& output, ++ MemoryInterface& instructionMemory, ++ uint64_t programByteLength, uint64_t entryPoint, ++ uint8_t blockSize, const arch::Architecture& isa, ++ BranchPredictor& branchPredictor) ++ : output_(output), ++ pc_(entryPoint), ++ instructionMemory_(instructionMemory), ++ programByteLength_(programByteLength), ++ isa_(isa), ++ branchPredictor_(branchPredictor), ++ blockSize_(blockSize), ++ blockMask_(~(blockSize_ - 1)) { ++ assert(blockSize_ >= isa_.getMaxInstructionSize() && ++ "fetch block size must be larger than the largest instruction"); ++ fetchBuffer_ = new uint8_t[2 * blockSize_]; ++ requestFromPC(); ++} ++ ++FetchUnit::~FetchUnit() { delete[] fetchBuffer_; } ++ ++void FetchUnit::tick() { ++ if (output_.isStalled()) { ++ return; ++ } ++ ++ if (hasHalted_ || waitSCEval_) { ++ return; ++ } ++ ++ // If loop buffer has been filled, fill buffer to decode ++ // if (loopBufferState_ == LoopBufferState::SUPPLYING) { ++ // auto outputSlots = output_.getTailSlots(); ++ // for (size_t slot = 0; slot < output_.getWidth(); slot++) { ++ // auto& macroOp = outputSlots[slot]; ++ // auto bytesRead = isa_.predecode(&(loopBuffer_.front().encoding), ++ // loopBuffer_.front().instructionSize, ++ // loopBuffer_.front().address, macroOp); ++ ++ // assert(bytesRead != 0 && "predecode failure for loop 
buffer entry"); ++ ++ // // Set prediction to recorded value during loop buffer filling ++ // if (macroOp[0]->isBranch()) { ++ // macroOp[0]->setBranchPrediction(loopBuffer_.front().prediction); ++ // } ++ ++ // // Cycle queue by moving front entry to back ++ // loopBuffer_.push_back(loopBuffer_.front()); ++ // loopBuffer_.pop_front(); ++ // } ++ // return; ++ // } ++ ++ // Pointer to the instruction data to decode from ++ const uint8_t* buffer; ++ uint8_t bufferOffset; ++ ++ // Check if more instruction data is required ++ if (bufferedBytes_ < isa_.getMaxInstructionSize()) { ++ // Calculate the address of the next fetch block ++ uint64_t blockAddress; ++ if (bufferedBytes_ > 0) { ++ // There is already some data in the buffer, so check for the next block ++ bufferOffset = 0; ++ blockAddress = pc_ + bufferedBytes_; ++ assert((blockAddress & ~blockMask_) == 0 && "misaligned fetch buffer"); ++ } else { ++ // Fetch buffer is empty, so start from the PC ++ blockAddress = pc_ & blockMask_; ++ bufferOffset = pc_ - blockAddress; ++ } ++ ++ // Find fetched memory that matches the desired block ++ const auto& fetched = instructionMemory_.getCompletedReads(); ++ ++ size_t fetchIndex; ++ for (fetchIndex = 0; fetchIndex < fetched.size(); fetchIndex++) { ++ if (fetched[fetchIndex].target.address == blockAddress) { ++ break; ++ } ++ } ++ if (fetchIndex == fetched.size()) { ++ // Need to wait for fetched instructions ++ return; ++ } ++ ++ // TODO: Handle memory faults ++ assert(fetched[fetchIndex].data && "Memory read failed"); ++ const uint8_t* fetchData = fetched[fetchIndex].data.getAsVector(); ++ ++ // Copy fetched data to fetch buffer after existing data ++ std::memcpy(fetchBuffer_ + bufferedBytes_, fetchData + bufferOffset, ++ blockSize_ - bufferOffset); ++ ++ bufferedBytes_ += blockSize_ - bufferOffset; ++ buffer = fetchBuffer_; ++ // Decoding should start from the beginning of the fetchBuffer_. ++ bufferOffset = 0; ++ } else { ++ // There is already enough data in the fetch buffer, so use that ++ buffer = fetchBuffer_; ++ bufferOffset = 0; ++ } ++ ++ // Check we have enough data to begin decoding ++ if (bufferedBytes_ == isa_.getMinInstructionSize()) { ++ //Check if those bytes points to a instruction with minimum size or more data is required. If more data is required return ++ // TODO: this is not generic solution, just trying to make it work ++ uint16_t rawBits; ++ memcpy(&rawBits, buffer + bufferOffset, 2); ++ if((rawBits & 0x3) == 0x3) { ++ //std::cout << std::hex << "Only 2 bytes left in fetch buffer and not compresses instr type, current PC: 0x" << pc_ << std::endl; ++ return; ++ } ++ } ++ ++ auto outputSlots = output_.getTailSlots(); ++ for (size_t slot = 0; slot < output_.getWidth(); slot++) { ++ auto& macroOp = outputSlots[slot]; ++ ++ auto bytesRead = ++ isa_.predecode(buffer + bufferOffset, bufferedBytes_, pc_, macroOp); ++ ++ // If predecode fails, bail and wait for more data ++ if (bytesRead == 0) { ++ assert(bufferedBytes_ < isa_.getMinInstructionSize() && ++ "unexpected predecode failure"); ++ break; ++ } ++ ++ // Create branch prediction after identifing instruction type ++ // (e.g. RET, BL, etc). 
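++    // The predictor is given the branch type, the statically known target
++    // offset and the instruction length, so a purely static scheme (e.g.
++    // backward-taken/forward-not-taken) can form a prediction without any
++    // dynamic history.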
++ BranchPrediction prediction = {false, 0}; ++ if (macroOp[0]->isBranch()) { ++ prediction = branchPredictor_.predict(pc_, macroOp[0]->getBranchType(), ++ macroOp[0]->getKnownOffset(), ++ (uint8_t)bytesRead); ++ macroOp[0]->setBranchPrediction(prediction); ++ } ++ ++ // if (loopBufferState_ == LoopBufferState::FILLING) { ++ // // Record instruction fetch information in loop body ++ // uint32_t encoding; ++ // memcpy(&encoding, buffer + bufferOffset, sizeof(uint32_t)); ++ // loopBuffer_.push_back( ++ // {encoding, bytesRead, pc_, macroOp[0]->getBranchPrediction()}); ++ ++ // if (pc_ == loopBoundaryAddress_) { ++ // // loopBoundaryAddress_ has been fetched whilst filling the loop buffer. ++ // // Stop filling as loop body has been recorded and begin to supply ++ // // decode unit with instructions from the loop buffer ++ // loopBufferState_ = LoopBufferState::SUPPLYING; ++ // bufferedBytes_ = 0; ++ // break; ++ // } ++ // } else if (loopBufferState_ == LoopBufferState::WAITING && ++ // pc_ == loopBoundaryAddress_) { ++ // // Once set loopBoundaryAddress_ is fetched, start to fill loop buffer ++ // loopBufferState_ = LoopBufferState::FILLING; ++ // } ++ ++ assert(bytesRead <= bufferedBytes_ && ++ "Predecode consumed more bytes than were available"); ++ // Increment the offset, decrement available bytes ++ bufferOffset += bytesRead; ++ bufferedBytes_ -= bytesRead; ++ ++ if (!prediction.taken) { ++ // Predicted as not taken; increment PC to next instruction ++ pc_ += bytesRead; ++ } else { ++ // Predicted as taken; set PC to predicted target address ++ pc_ = prediction.target; ++ } ++// std::cout << std::hex << "PC: 0x" << pc_ << ", PBL: 0x" << programByteLength_ << std::endl; ++ if (pc_ == 0 && (macroOp[0]->getBranchType() == BranchType::SubroutineCall)) { ++ waitSCEval_ = true; ++ break; ++ } ++ ++ if (pc_ >= programByteLength_) { ++ hasHalted_ = true; ++ break; ++ } ++ ++ if (prediction.taken) { ++ if (slot + 1 < output_.getWidth()) { ++ branchStalls_++; ++ } ++ // Can't continue fetch immediately after a branch ++ bufferedBytes_ = 0; ++ break; ++ } ++ ++ // Too few bytes remaining in buffer to continue ++ if (bufferedBytes_ == 0) { ++ break; ++ } ++ } ++ ++ if (bufferedBytes_ > 0) { ++ // Move start of fetched data to beginning of fetch buffer ++ std::memmove(fetchBuffer_, buffer + bufferOffset, bufferedBytes_); ++ } ++ ++ instructionMemory_.clearCompletedReads(); ++} ++ ++void FetchUnit::registerLoopBoundary(uint64_t branchAddress) { ++ // Set branch which forms the loop as the loopBoundaryAddress_ and place loop ++ // buffer in state to begin filling once the loopBoundaryAddress_ has been ++ // fetched ++ loopBufferState_ = LoopBufferState::WAITING; ++ loopBoundaryAddress_ = branchAddress; ++} ++ ++bool FetchUnit::hasHalted() const { return hasHalted_; } ++ ++void FetchUnit::updatePC(uint64_t address) { ++ pc_ = address; ++ bufferedBytes_ = 0; ++ hasHalted_ = (pc_ >= programByteLength_); ++ waitSCEval_ = false; ++} ++ ++void FetchUnit::requestFromPC() { ++ // Do nothing if buffer already contains enough data ++ if (bufferedBytes_ >= isa_.getMaxInstructionSize()) return; ++ ++ // Do nothing if unit has halted to avoid invalid speculative memory reads ++ // beyond the programByteLength_ ++ if (hasHalted_ || waitSCEval_) return; ++ ++ uint64_t blockAddress; ++ if (bufferedBytes_ > 0) { ++ // There's already some data in the buffer, so fetch the next block ++ blockAddress = pc_ + bufferedBytes_; ++ assert((blockAddress & ~blockMask_) == 0 && "misaligned fetch buffer"); ++ } else { ++ // 
Fetch buffer is empty, so fetch from the PC ++ blockAddress = pc_ & blockMask_; ++ } ++ ++ instructionMemory_.requestRead({blockAddress, blockSize_}); ++} ++ ++uint64_t FetchUnit::getBranchStalls() const { return branchStalls_; } ++ ++void FetchUnit::flushLoopBuffer() { ++ // loopBuffer_.clear(); ++ // loopBufferState_ = LoopBufferState::IDLE; ++ // loopBoundaryAddress_ = 0; ++} ++ ++void FetchUnit::flushPredictor(uint64_t address) { ++ branchPredictor_.flush(address); ++} ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/lib/pipeline_hi/LoadStoreQueue.cc b/src/lib/pipeline_hi/LoadStoreQueue.cc +new file mode 100644 +index 00000000..c0b752e8 +--- /dev/null ++++ b/src/lib/pipeline_hi/LoadStoreQueue.cc +@@ -0,0 +1,315 @@ ++#include "simeng/pipeline_hi/LoadStoreQueue.hh" ++ ++#include ++#include ++#include ++#include ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++/** Check whether requests `a` and `b` overlap. */ ++bool requestsOverlap(MemoryAccessTarget a, MemoryAccessTarget b) { ++ // Check whether one region ends before the other begins, implying no overlap, ++ // and negate ++ return !(a.address + a.size <= b.address || b.address + b.size <= a.address); ++} ++ ++LoadStoreQueue::LoadStoreQueue( ++ unsigned int maxCombinedSpace, MemoryInterface& memory, ++ span>> completionSlots, ++ std::function, span)> forwardOperands, ++ bool exclusive, uint16_t loadBandwidth, uint16_t storeBandwidth, ++ uint16_t permittedRequests, uint16_t permittedLoads, ++ uint16_t permittedStores) ++ : completionSlots_(completionSlots), ++ forwardOperands_(forwardOperands), ++ maxCombinedSpace_(maxCombinedSpace), ++ combined_(true), ++ memory_(memory), ++ exclusive_(exclusive), ++ loadBandwidth_(loadBandwidth), ++ storeBandwidth_(storeBandwidth), ++ totalLimit_(permittedRequests), ++ // Set per-cycle limits for each request type ++ reqLimits_{permittedLoads, permittedStores} {}; ++ ++LoadStoreQueue::LoadStoreQueue( ++ unsigned int maxLoadQueueSpace, unsigned int maxStoreQueueSpace, ++ MemoryInterface& memory, ++ span>> completionSlots, ++ std::function, span)> forwardOperands, ++ bool exclusive, uint16_t loadBandwidth, uint16_t storeBandwidth, ++ uint16_t permittedRequests, uint16_t permittedLoads, ++ uint16_t permittedStores) ++ : completionSlots_(completionSlots), ++ forwardOperands_(forwardOperands), ++ maxLoadQueueSpace_(maxLoadQueueSpace), ++ maxStoreQueueSpace_(maxStoreQueueSpace), ++ combined_(false), ++ memory_(memory), ++ exclusive_(exclusive), ++ loadBandwidth_(loadBandwidth), ++ storeBandwidth_(storeBandwidth), ++ totalLimit_(permittedRequests), ++ // Set per-cycle limits for each request type ++ reqLimits_{permittedLoads, permittedStores} {}; ++ ++unsigned int LoadStoreQueue::getLoadQueueSpace() const { ++ if (combined_) { ++ return getCombinedSpace(); ++ } else { ++ return getLoadQueueSplitSpace(); ++ } ++} ++unsigned int LoadStoreQueue::getStoreQueueSpace() const { ++ if (combined_) { ++ return getCombinedSpace(); ++ } else { ++ return getStoreQueueSplitSpace(); ++ } ++} ++unsigned int LoadStoreQueue::getTotalSpace() const { ++ if (combined_) { ++ return getCombinedSpace(); ++ } else { ++ return getLoadQueueSplitSpace() + getStoreQueueSplitSpace(); ++ } ++} ++ ++unsigned int LoadStoreQueue::getLoadQueueSplitSpace() const { ++ return maxLoadQueueSpace_ - loadQueue_.size(); ++} ++unsigned int LoadStoreQueue::getStoreQueueSplitSpace() const { ++ return maxStoreQueueSpace_ - storeQueue_.size(); ++} ++unsigned int LoadStoreQueue::getCombinedSpace() const { ++ return 
maxCombinedSpace_ - loadQueue_.size() - storeQueue_.size(); ++} ++ ++bool isMisAligned(uint64_t addr, uint8_t sz) { ++ if(((addr & 0x1) && sz==2) || ((addr & 0x3) && sz==4)) { ++ return true; ++ } ++ return false; ++} ++ ++void LoadStoreQueue::addLoad(const std::shared_ptr& insn) { ++ ++ const auto& addresses = insn->getGeneratedAddresses(); ++ ++ assert(addresses.size()==1 && "Expecting only 1 address in load request"); ++ // Do something to split into multiple requests if alignment is required for case like crossing 4 byte boundary. ++ ++ loadQueue_.push_back(insn); ++ uint64_t add_tick = 1; ++ bool isMisAlign = false; ++ if (isMisAligned(addresses[0].address, addresses[0].size)) { ++ add_tick+=1; ++ isMisAlign=true; ++ } ++ requestQueue_.push_back({{}, {}, insn, LOAD, (tickCounter_+add_tick) + insn->getLSQLatency(), isMisAlign}); ++ // Submit request write to memory interface early as the architectural state ++ // considers the store to be retired and thus its operation complete ++ ++ for (size_t i = 0; i < addresses.size(); i++) { ++ //memory_.requestWrite(addresses[i], data[i]); ++ // Still add addresses to requestQueue_ to ensure contention of resources is ++ // correctly simulated ++ requestQueue_.back().reqAddresses.push(addresses[i]); ++ } ++ ++ //loadQueue_.push_back(insn); ++ //startLoad(insn); ++} ++ ++void LoadStoreQueue::addStore(const std::shared_ptr& insn) { ++ ++ const auto& addresses = insn->getGeneratedAddresses(); ++ span data = insn->getData(); ++ ++ assert(addresses.size()==1 && "Expecting only 1 address in store request"); ++ // Do something to split into multiple requests if alignment is required for case like crossing 4 byte boundary. ++ ++ storeQueue_.push_back({insn, data}); ++ ++ uint64_t add_tick = 1; ++ bool isMisAlign = false; ++ if (isMisAligned(addresses[0].address, addresses[0].size)) { ++ add_tick+=1; ++ isMisAlign = true; ++ } ++ ++ requestQueue_.push_back({{}, {}, insn, STORE, (tickCounter_+add_tick) + insn->getLSQLatency(), isMisAlign}); ++ // Submit request write to memory interface early as the architectural state ++ // considers the store to be retired and thus its operation complete ++ ++ for (size_t i = 0; i < addresses.size(); i++) { ++ //memory_.requestWrite(addresses[i], data[i]); ++ // Still add addresses to requestQueue_ to ensure contention of resources is ++ // correctly simulated ++ requestQueue_.back().reqAddresses.push(addresses[i]); ++ requestQueue_.back().data.push(data[i]); ++ } ++ //storeQueue_.push_back({insn, {}}); ++ //supplyStoreData(insn); ++ //commitStore(insn); ++} ++ ++void LoadStoreQueue::startLoad(const std::shared_ptr& insn) { ++ return; ++} ++ ++void LoadStoreQueue::supplyStoreData(const std::shared_ptr& insn) { ++ return; ++} ++ ++bool LoadStoreQueue::commitStore(const std::shared_ptr& uop) { ++ ++ if (storeQueue_.front().first == uop) { ++ storeQueue_.pop_front(); ++ } else { ++ assert(false && "The commited store is not the one in the front of the storeQueue_"); ++ } ++ return true; ++} ++ ++void LoadStoreQueue::commitLoad(const std::shared_ptr& uop) { ++ ++ if (loadQueue_.front() == uop) { ++ loadQueue_.pop_front(); ++ } else { ++ assert(false && "The commited store is not the one in the front of the loadQueue_"); ++ } ++ return; ++} ++ ++void LoadStoreQueue::purgeFlushed() { ++ ++ return; ++ ++} ++ ++bool LoadStoreQueue::isBusy() const { ++ // TODO: This is just to allow only 1 outstanding request to be used for SST integeration. 
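++  // Report busy while a misaligned access is about to be issued, or while two
++  // or more memory operations are already in flight.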
++ //if (activeMisAlignedOpr() || loadQueue_.size()>=1 || storeQueue_.size()>=1) { ++ if (activeMisAlignedOpr() || (loadQueue_.size()+storeQueue_.size())>=2) { ++ return true; ++ } ++ return false; ++} ++ ++void LoadStoreQueue::tick() { ++ tickCounter_++; ++ ++ //Request at the front of the queue should be sent to memory first ++ //Ensure its scheduled after necessary tick ++ if (requestQueue_.size() > 0) { ++ requestEntry1& oldestreq = requestQueue_.front(); ++ if (tickCounter_ >= oldestreq.reqtick) { ++ if(oldestreq.type == LOAD) { ++ memory_.requestRead(oldestreq.reqAddresses.front(), (uint64_t) busReqId); ++ oldestreq.reqAddresses.pop(); ++ if (oldestreq.reqAddresses.size() == 0) { // All requests sent ++ requestQueue_.pop_front(); ++ } ++ requestedLoads_.emplace(busReqId, oldestreq.insn); ++ numLoads++; ++ latencyLoads_.emplace(busReqId, tickCounter_); ++ busReqId++; ++ } else if(oldestreq.type == STORE) { ++ memory_.requestWrite(oldestreq.reqAddresses.front(), oldestreq.data.front()); ++ oldestreq.reqAddresses.pop(); ++ oldestreq.data.pop(); ++ if (oldestreq.reqAddresses.size() == 0) { // All requests sent ++ requestQueue_.pop_front(); ++ //Verify same instruction. and remove from the storeQueue_ as well ++ //storeQueue_.pop_front();//No need ++ } ++ } else { ++ assert(false && "Unknown request type to be scheduled to memory"); ++ } ++ } ++ } ++ ++ //processResponse(); ++} ++ ++void LoadStoreQueue::processResponse() { ++ // Process completed read requests ++ for (const auto& response : memory_.getCompletedReads()) { ++ const auto& address = response.target.address; ++ const auto& data = response.data; ++ ++ // TODO: Detect and handle non-fatal faults (e.g. page fault) ++ ++ // Find instruction that requested the memory read ++ const auto& itr = requestedLoads_.find(response.requestId); ++ if (itr == requestedLoads_.end()) { ++ continue; ++ } else { ++ requestedLoads_.erase(response.requestId); ++ uint32_t ldLatency = ((tickCounter_ + 1) - latencyLoads_.at(response.requestId)); ++ if (ldLatency > maxLdLatency) { ++ maxLdLatency = ldLatency; ++ } ++ if (ldLatency < minLdLatency) { ++ minLdLatency = ldLatency; ++ } ++ totalLdLatency += ldLatency; ++ //std::cout << std::dec << "Total Ld latency: " << totalLdLatency << ", numLoads: " << numLoads << std::endl; ++ latencyLoads_.erase(response.requestId); ++ } ++ // Supply data to the instruction and execute if it is ready ++ const auto& load = itr->second; ++ load->supplyData(address, data); ++ if (load->hasAllData()) { ++ // This load has completed ++ load->execute(); ++ /*if (load->isStoreData()) { ++ supplyStoreData(load); ++ }*/ ++ completedLoads_.push(load); ++ } ++ } ++ memory_.clearCompletedReads(); ++ ++ // Pop from the front of the completed loads queue and send to writeback ++ size_t count = 0; ++ while (completedLoads_.size() > 0 && count < completionSlots_.size()) { ++ const auto& insn = completedLoads_.front(); ++ ++ // Don't process load instruction if it has been flushed ++ if (insn->isFlushed()) { ++ completedLoads_.pop(); ++ continue; ++ } ++ ++ // Forward the results ++ // forwardOperands_(insn->getDestinationRegisters(), insn->getResults()); ++ ++ completionSlots_[count].getTailSlots()[0] = std::move(insn); ++ ++ completedLoads_.pop(); ++ ++ count++; ++ } ++} ++ ++std::shared_ptr LoadStoreQueue::getViolatingLoad() const { ++ return violatingLoad_; ++} ++ ++//Clean up is required! 
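++// A misaligned access is scheduled one tick later than an aligned one (see
++// addLoad/addStore), so the LSQ is held busy for the cycle immediately before
++// such a request is sent to memory.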
++bool LoadStoreQueue::activeMisAlignedOpr() const { ++ //if the front of the request queue has a misaligned request that is not yet being sent to the bus then its better to halt LSU taking new requests. ++ // if(storeQueue_.size() > 0 && activeMisAlignedStore) { ++ // return true; ++ // } ++ return (requestQueue_.size() > 0 && requestQueue_.front().isMisAligned && ((requestQueue_.front().reqtick-tickCounter_)==1)); ++} ++ ++bool LoadStoreQueue::isCombined() const { return combined_; } ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/lib/pipeline_hi/RegDepMap.cc b/src/lib/pipeline_hi/RegDepMap.cc +new file mode 100644 +index 00000000..4ab004bf +--- /dev/null ++++ b/src/lib/pipeline_hi/RegDepMap.cc +@@ -0,0 +1,143 @@ ++#include "simeng/pipeline_hi/RegDepMap.hh" ++ ++#include ++ ++//#define RDMDEBUG ++#ifdef RDMDEBUG ++#define DEBUG(x) std::cout << "Core: " << std::hex << x << std::endl; ++#else ++#define DEBUG(x) do { } while (false); ++#endif ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++const Register l_ZERO_REGISTER = {0, 0}; ++ ++RegDepMap::RegDepMap(const std::vector registerFileStructures, ++ const RegisterFileSet& registerFileSet) : ++ registerFileStructures_(registerFileStructures), ++ registerFileSet_(registerFileSet) { ++ regMap_.resize(registerFileStructures_.size());//Just for Integer Register File for now ++ for (size_t type=0; typegetDestinationRegisters(); ++ for(const auto& reg: destinationRegisters) { ++ if(reg != l_ZERO_REGISTER) { //Not X0 ++ outstandingDep_++; ++ DEBUG("Adding Depencency: addr, 0x" << instr->getInstructionAddress() << std::dec << ", dest: " << reg << ", outstanding: " << outstandingDep_); ++ regMap_[reg.type][reg.tag].push_back(instr); ++ } ++ } ++} ++ ++void RegDepMap::remove(InstrPtr instr) ++{ ++ auto& destinationRegisters = instr->getDestinationRegisters(); ++ for(const auto& reg: destinationRegisters) { ++ auto it = regMap_[reg.type][reg.tag].begin(); ++ while (it != regMap_[reg.type][reg.tag].end()) { ++ if(*it == instr) { ++ outstandingDep_--; ++ DEBUG("Removing Depencency: addr, 0x" << instr->getInstructionAddress() << std::dec << ", dest: " << reg << ", outstanding: " << outstandingDep_); ++ it = regMap_[reg.type][reg.tag].erase(it); ++ break; ++ } else { ++ it++; ++ } ++ } ++ } ++} ++ ++bool RegDepMap::canRead(InstrPtr instr) ++{ ++ bool dependency = false; ++ auto& sourceRegisters = instr->getOperandRegisters(); ++ for (uint16_t i = 0; i < sourceRegisters.size(); i++) { ++ const auto& srcReg = sourceRegisters[i]; ++ ++ if (!instr->isOperandReady(i)) { ++ // The operand hasn't already been supplied ++ if (regMap_[srcReg.type][srcReg.tag].size() == 0) {//pick up value from register file ++ instr->supplyOperand(i, registerFileSet_.get(srcReg)); ++ } else if (regMap_[srcReg.type][srcReg.tag].back()->hasExecuted() && ++ !(regMap_[srcReg.type][srcReg.tag].back()->isMul() || regMap_[srcReg.type][srcReg.tag].back()->isDiv() || ++ (regMap_[srcReg.type][srcReg.tag].back()->isLoad() && !instr->isStoreData()))) {//pick up value from last executed instruction ++ const auto& destRegisters = regMap_[srcReg.type][srcReg.tag].back()->getDestinationRegisters(); ++ const auto& destValues = regMap_[srcReg.type][srcReg.tag].back()->getResults(); ++ for (size_t j = 0; j < destRegisters.size(); j++) { ++ const auto& destReg = destRegisters[j]; ++ if (destReg == srcReg) { ++ instr->supplyOperand(i, destValues[j]); ++ break; ++ } ++ } ++ } else { ++ dependency = true; ++ } ++ } ++ } ++ ++ return !dependency; ++} ++ ++bool 
RegDepMap::canWrite(InstrPtr instr) ++{ ++ bool dependency = false; ++ auto& destRegisters = instr->getDestinationRegisters(); ++ for(uint16_t i = 0; i < destRegisters.size(); i++) { ++ const auto& destReg = destRegisters[i]; ++ if (regMap_[destReg.type][destReg.tag].size() > 0 && ++ !regMap_[destReg.type][destReg.tag].back()->hasExecuted()) { ++ dependency = true; ++ break; ++ } ++ } ++ return !dependency || (instr->isLoad()); ++} ++ ++//Clean up the options logic to ensure all of them work well together ++bool RegDepMap::canForward(InstrPtr instr) ++{ ++ return true; ++} ++ ++void RegDepMap::purgeFlushed() { ++ for (auto& registerType : regMap_) { ++ for (auto& dependencyList : registerType) { ++ auto it = dependencyList.begin(); ++ while (it != dependencyList.end()) { ++ DEBUG("Purge entry present at addr: 0x" << (*it)->getInstructionAddress()); ++ if ((*it)->isFlushed()) { ++ outstandingDep_--; ++ it = dependencyList.erase(it); ++ } else { ++ it++; ++ } ++ } ++ } ++ } ++} ++ ++void RegDepMap::dump() ++{ ++} ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/lib/pipeline_hi/RegisterAliasTable.cc b/src/lib/pipeline_hi/RegisterAliasTable.cc +new file mode 100644 +index 00000000..0c813a6f +--- /dev/null ++++ b/src/lib/pipeline_hi/RegisterAliasTable.cc +@@ -0,0 +1,110 @@ ++#include "simeng/pipeline_hi/RegisterAliasTable.hh" ++ ++#include ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++RegisterAliasTable::RegisterAliasTable( ++ std::vector architecturalStructure, ++ std::vector physicalRegisterCounts) ++ : mappingTable_(architecturalStructure.size()), ++ historyTable_(architecturalStructure.size()), ++ destinationTable_(architecturalStructure.size()), ++ freeQueues_(architecturalStructure.size()) { ++ assert(architecturalStructure.size() == physicalRegisterCounts.size() && ++ "The number of physical register types does not match the number of " ++ "architectural register types"); ++ ++ for (size_t type = 0; type < architecturalStructure.size(); type++) { ++ auto archCount = architecturalStructure[type].quantity; ++ auto physCount = physicalRegisterCounts[type]; ++ assert(archCount <= physCount && ++ "Cannot have fewer physical registers than architectural registers"); ++ ++ // Set up the initial mapping table state for this register type ++ mappingTable_[type].resize(archCount); ++ ++ for (size_t tag = 0; tag < archCount; tag++) { ++ // Pre-assign a physical register to each architectural register ++ mappingTable_[type][tag] = tag; ++ } ++ ++ // Add remaining physical registers to free queue ++ for (size_t tag = archCount; tag < physCount; tag++) { ++ freeQueues_[type].push(tag); ++ } ++ ++ // Set up history/destination tables ++ historyTable_[type].resize(physCount); ++ destinationTable_[type].resize(physCount); ++ } ++}; ++ ++Register RegisterAliasTable::getMapping(Register architectural) const { ++ // Asserts to ensure mapping isn't attempted for an out-of-bound index (i.e. ++ // mapping of WZR / XZR) ++ assert(architectural.type < mappingTable_.size() && ++ "Invalid register type. Cannot find RAT mapping."); ++ assert(architectural.type >= 0 && ++ "Invalid register type. 
Cannot find RAT mapping."); ++ ++ auto tag = mappingTable_[architectural.type][architectural.tag]; ++ return {architectural.type, tag}; ++} ++ ++bool RegisterAliasTable::canAllocate(uint8_t type, ++ unsigned int quantity) const { ++ return (freeQueues_[type].size() >= quantity); ++} ++ ++bool RegisterAliasTable::canRename(uint8_t type) const { ++ // Renaming possible iff there are more physical than architectural registers ++ return destinationTable_[type].size() > mappingTable_[type].size(); ++} ++ ++unsigned int RegisterAliasTable::freeRegistersAvailable(uint8_t type) const { ++ return freeQueues_[type].size(); ++} ++ ++Register RegisterAliasTable::allocate(Register architectural) { ++ std::queue& freeQueue = freeQueues_[architectural.type]; ++ assert(freeQueue.size() > 0 && ++ "Attempted to allocate free register when none were available"); ++ ++ auto tag = freeQueue.front(); ++ freeQueue.pop(); ++ ++ // Keep the old physical register in the history table ++ historyTable_[architectural.type][tag] = ++ mappingTable_[architectural.type][architectural.tag]; ++ ++ // Update the mapping table with the new tag, and mark the architectural ++ // register it replaces in the destination table ++ mappingTable_[architectural.type][architectural.tag] = tag; ++ destinationTable_[architectural.type][tag] = architectural.tag; ++ ++ return {architectural.type, tag}; ++} ++ ++void RegisterAliasTable::commit(Register physical) { ++ // Find the register previously mapped to the same architectural register and ++ // free it ++ auto oldTag = historyTable_[physical.type][physical.tag]; ++ freeQueues_[physical.type].push(oldTag); ++} ++void RegisterAliasTable::rewind(Register physical) { ++ // Find which architectural tag this referred to ++ auto destinationTag = destinationTable_[physical.type][physical.tag]; ++ // Rewind the mapping table to the old physical tag ++ mappingTable_[physical.type][destinationTag] = ++ historyTable_[physical.type][physical.tag]; ++ // Add the rewound physical tag back to the free queue ++ freeQueues_[physical.type].push(physical.tag); ++} ++void RegisterAliasTable::free(Register physical) { ++ freeQueues_[physical.type].push(physical.tag); ++} ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/lib/pipeline_hi/ReorderBuffer.cc b/src/lib/pipeline_hi/ReorderBuffer.cc +new file mode 100644 +index 00000000..c653ffd5 +--- /dev/null ++++ b/src/lib/pipeline_hi/ReorderBuffer.cc +@@ -0,0 +1,206 @@ ++#include "simeng/pipeline_hi/ReorderBuffer.hh" ++ ++#include ++#include ++#include ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++ReorderBuffer::ReorderBuffer( ++ unsigned int maxSize, RegisterAliasTable& rat, LoadStoreQueue& lsq, ++ std::function&)> raiseException, ++ std::function sendLoopBoundary, ++ BranchPredictor& predictor, uint16_t loopBufSize, ++ uint16_t loopDetectionThreshold) ++ : rat_(rat), ++ lsq_(lsq), ++ maxSize_(maxSize), ++ raiseException_(raiseException), ++ sendLoopBoundary_(sendLoopBoundary), ++ predictor_(predictor), ++ loopBufSize_(loopBufSize), ++ loopDetectionThreshold_(loopDetectionThreshold) {} ++ ++void ReorderBuffer::reserve(const std::shared_ptr& insn) { ++ assert(buffer_.size() < maxSize_ && ++ "Attempted to reserve entry in reorder buffer when already full"); ++ insn->setSequenceId(seqId_); ++ seqId_++; ++ insn->setInstructionId(insnId_); ++ if (insn->isLastMicroOp()) insnId_++; ++ ++ buffer_.push_back(insn); ++} ++ ++void ReorderBuffer::commitMicroOps(uint64_t insnId) { ++ if (buffer_.size()) { ++ size_t index = 0; ++ int firstOp = 
-1; ++ bool validForCommit = false; ++ ++ // Find first instance of uop belonging to macro-op instruction ++ for (; index < buffer_.size(); index++) { ++ if (buffer_[index]->getInstructionId() == insnId) { ++ firstOp = index; ++ break; ++ } ++ } ++ ++ if (firstOp > -1) { ++ // If found, see if all uops are committable ++ for (; index < buffer_.size(); index++) { ++ if (buffer_[index]->getInstructionId() != insnId) break; ++ if (!buffer_[index]->isWaitingCommit()) { ++ return; ++ } else if (buffer_[index]->isLastMicroOp()) { ++ // all microOps must be in ROB for the commit to be valid ++ validForCommit = true; ++ } ++ } ++ if (!validForCommit) return; ++ ++ // No early return thus all uops are committable ++ for (; firstOp < buffer_.size(); firstOp++) { ++ if (buffer_[firstOp]->getInstructionId() != insnId) break; ++ buffer_[firstOp]->setCommitReady(); ++ } ++ } ++ } ++ return; ++} ++ ++unsigned int ReorderBuffer::commit(unsigned int maxCommitSize) { ++ shouldFlush_ = false; ++ size_t maxCommits = ++ std::min(static_cast(maxCommitSize), buffer_.size()); ++ ++ unsigned int n; ++ for (n = 0; n < maxCommits; n++) { ++ auto& uop = buffer_[0]; ++ if (!uop->canCommit()) { ++ break; ++ } ++ ++ if (uop->isLastMicroOp()) instructionsCommitted_++; ++ ++ if (uop->exceptionEncountered()) { ++ raiseException_(uop); ++ buffer_.pop_front(); ++ return n + 1; ++ } ++ ++ const auto& destinations = uop->getDestinationRegisters(); ++ for (int i = 0; i < destinations.size(); i++) { ++ rat_.commit(destinations[i]); ++ } ++ ++ // If it's a memory op, commit the entry at the head of the respective queue ++ if (uop->isLoad()) { ++ lsq_.commitLoad(uop); ++ } ++ if (uop->isStoreAddress()) { ++ bool violationFound = lsq_.commitStore(uop); ++ if (violationFound) { ++ loadViolations_++; ++ // Memory order violation found; aborting commits and flushing ++ auto load = lsq_.getViolatingLoad(); ++ shouldFlush_ = true; ++ flushAfter_ = load->getInstructionId() - 1; ++ pc_ = load->getInstructionAddress(); ++ ++ buffer_.pop_front(); ++ return n + 1; ++ } ++ } ++ ++ // Increment or swap out branch counter for loop detection ++ if (uop->isBranch() && !loopDetected_) { ++ bool increment = true; ++ if (branchCounter_.first.address != uop->getInstructionAddress()) { ++ // Mismatch on instruction address, reset ++ increment = false; ++ } else if (branchCounter_.first.outcome != uop->getBranchPrediction()) { ++ // Mismatch on branch outcome, reset ++ increment = false; ++ } else if ((instructionsCommitted_ - branchCounter_.first.commitNumber) > ++ loopBufSize_) { ++ // Loop too big to fit in loop buffer, reset ++ increment = false; ++ } ++ ++ if (increment) { ++ // Reset commitNumber value ++ branchCounter_.first.commitNumber = instructionsCommitted_; ++ // Increment counter ++ branchCounter_.second++; ++ ++ if (branchCounter_.second > loopDetectionThreshold_) { ++ // If the same branch with the same outcome is sequentially retired ++ // more times than the loopDetectionThreshold_ value, identify as a ++ // loop boundary ++ loopDetected_ = true; ++ sendLoopBoundary_(uop->getInstructionAddress()); ++ } ++ } else { ++ // Swap out latest branch ++ branchCounter_ = {{uop->getInstructionAddress(), ++ uop->getBranchPrediction(), instructionsCommitted_}, ++ 0}; ++ } ++ } ++ buffer_.pop_front(); ++ } ++ ++ return n; ++} ++ ++void ReorderBuffer::flush(uint64_t afterSeqId) { ++ // Iterate backwards from the tail of the queue to find and remove ops newer ++ // than `afterSeqId` ++ while (!buffer_.empty()) { ++ auto& uop = buffer_.back(); ++ if 
(uop->getInstructionId() <= afterSeqId) { ++ break; ++ } ++ ++ // To rewind destination registers in correct history order, rewinding of ++ // register renaming is done backwards ++ auto destinations = uop->getDestinationRegisters(); ++ for (int i = destinations.size() - 1; i >= 0; i--) { ++ const auto& reg = destinations[i]; ++ rat_.rewind(reg); ++ } ++ uop->setFlushed(); ++ // If the instruction is a branch, supply address to branch flushing logic ++ if (uop->isBranch()) { ++ predictor_.flush(uop->getInstructionAddress()); ++ } ++ buffer_.pop_back(); ++ } ++ ++ // Reset branch counter and loop detection ++ branchCounter_ = {{0, {false, 0}, 0}, 0}; ++ loopDetected_ = false; ++} ++ ++unsigned int ReorderBuffer::size() const { return buffer_.size(); } ++ ++unsigned int ReorderBuffer::getFreeSpace() const { ++ return maxSize_ - buffer_.size(); ++} ++ ++bool ReorderBuffer::shouldFlush() const { return shouldFlush_; } ++uint64_t ReorderBuffer::getFlushAddress() const { return pc_; } ++uint64_t ReorderBuffer::getFlushSeqId() const { return flushAfter_; } ++ ++uint64_t ReorderBuffer::getInstructionsCommittedCount() const { ++ return instructionsCommitted_; ++} ++ ++uint64_t ReorderBuffer::getViolatingLoadsCount() const { ++ return loadViolations_; ++} ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/lib/pipeline_hi/StaticPredictor.cc b/src/lib/pipeline_hi/StaticPredictor.cc +new file mode 100644 +index 00000000..66199899 +--- /dev/null ++++ b/src/lib/pipeline_hi/StaticPredictor.cc +@@ -0,0 +1,120 @@ ++#include "simeng/pipeline_hi/StaticPredictor.hh" ++ ++#include ++ ++namespace simeng { ++namespace pipeline_hi { ++//TODO: temp for get rid of yaml, delete it later ++StaticPredictor::StaticPredictor(uint8_t sType) ++ : staticType_(sType) {} ++ ++StaticPredictor::StaticPredictor(YAML::Node config) ++ : staticType_(config["Branch-Predictor"]["Static-Type"].as()), ++ rasSize_(config["Branch-Predictor"]["RAS-entries"].as()){} ++ ++StaticPredictor::~StaticPredictor() { ++ ras_.clear(); ++ rasHistory_.clear(); ++} ++ ++BranchPrediction StaticPredictor::predict(uint64_t address, BranchType type, ++ uint64_t knownOffset, ++ uint8_t byteLength) { ++ int64_t offset = knownOffset; ++ uint64_t predict_target = (knownOffset) ? 
knownOffset + address : 0; ++ BranchPrediction prediction = {false, 0}; ++ ++ assert(byteLength > 1 && "byteLength <= 1"); ++ ++ if (type == BranchType::Unconditional) { ++ prediction = { true, predict_target}; ++ } else if (type == BranchType::Return) { ++ if (ras_.size() > 0) { ++ predict_target = ras_.back(); ++ // Record top of RAS used for target prediction ++ rasHistory_[address] = ras_.back(); ++ ras_.pop_back(); ++ } ++ prediction = {true, predict_target}; ++ } else if (type == BranchType::SubroutineCall) { //JAL and JALR ++ if (ras_.size() >= rasSize_) { ++ ras_.pop_front(); ++ } ++ ras_.push_back(address + byteLength); ++ // Record that this address is a branch-and-link instruction ++ rasHistory_[address] = 0; ++ prediction = {true, predict_target}; ++ } else if (type == BranchType::Conditional) { ++ switch (staticType_) { ++ case 0: //always-taken ++ prediction = {true, predict_target}; ++ break; ++ ++ case 1: //always-not-taken; ++ prediction = {false, 0}; ++ break; ++ ++ case 2: //Backward Taken, Forward Not Taken ++ { ++ if (offset >= 0) { ++ //not taken ++ prediction = {false, address+byteLength}; ++ } else { ++ prediction = {true, predict_target}; ++ } ++ break; ++ } ++ ++ case 3: //Forward Taken, Backward Not Taken ++ { ++ if (offset <= 0) { ++ //not taken ++ prediction = {false, address+byteLength}; ++ } else { ++ prediction = {true, predict_target}; ++ } ++ break; ++ } ++ ++ default: ++ assert(staticType_ < 4 && "Non-supported type for static predictor"); ++ break; ++ } ++ } ++ ++ return prediction; ++} ++ ++void StaticPredictor::update(uint64_t address, bool taken, ++ uint64_t targetAddress, BranchType type) {} ++ ++void StaticPredictor::flush(uint64_t address) { ++ // If address interacted with RAS, rewind entry ++ auto it = rasHistory_.find(address); ++ if (it != rasHistory_.end()) { ++ uint64_t target = it->second; ++ if (target != 0) { ++ // If history entry belongs to a return instruction, push target back onto ++ // stack ++ if (ras_.size() >= rasSize_) { ++ ras_.pop_front(); ++ } ++ ras_.push_back(target); ++ } else { ++ // If history entry belongs to a branch-and-link instruction, pop target ++ // off of stack ++ if (ras_.size()) { ++ ras_.pop_back(); ++ } ++ } ++ rasHistory_.erase(it); ++ } ++} ++BranchPrediction StaticPredictor::predict(uint64_t address, BranchType type, ++ uint64_t knownTarget) { ++ printf("StaticPredictor::predict(), This is overloaded and deprecated! 
\n"); ++ return predict(address, type, knownTarget, 4); ++} ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/lib/pipeline_hi/WritebackUnit.cc b/src/lib/pipeline_hi/WritebackUnit.cc +new file mode 100644 +index 00000000..b0dfd971 +--- /dev/null ++++ b/src/lib/pipeline_hi/WritebackUnit.cc +@@ -0,0 +1,74 @@ ++#include "simeng/pipeline_hi/WritebackUnit.hh" ++ ++#include ++ ++namespace simeng { ++namespace pipeline_hi { ++ ++WritebackUnit::WritebackUnit( ++ std::vector>>& completionSlots, ++ RegisterFileSet& registerFileSet, ++ std::function flagMicroOpCommits, ++ std::function&)> removeDep, ++ std::function&)> removeInstrOrderQ) ++ : completionSlots_(completionSlots), ++ registerFileSet_(registerFileSet), ++ flagMicroOpCommits_(flagMicroOpCommits), ++ removeDep_(removeDep), ++ removeInstrOrderQ_(removeInstrOrderQ) {} ++ ++void WritebackUnit::tick() { ++ for (size_t slot = 0; slot < completionSlots_.size(); slot++) { ++ auto& uop = completionSlots_[slot].getHeadSlots()[0]; ++ ++ if (uop == nullptr) { ++ continue; ++ } ++ ++ auto& results = uop->getResults(); ++ auto& destinations = uop->getDestinationRegisters(); ++ for (size_t i = 0; i < results.size(); i++) { ++ // Write results to register file ++ registerFileSet_.set(destinations[i], results[i]); ++ } ++ if (uop->isMicroOp()) { ++ uop->setWaitingCommit(); ++ flagMicroOpCommits_(uop->getInstructionId()); ++ if (uop->isLastMicroOp()) { ++ instructionsWritten_++; ++ committedInstsForTrace_.push_back(uop); ++ } ++ } else { ++ uop->setCommitReady(); ++ removeDep_(uop); ++ instructionsWritten_++; ++ committedInstsForTrace_.push_back(uop); ++ } ++ ++ completionSlots_[slot].getHeadSlots()[0] = nullptr; ++ } ++} ++ ++uint64_t WritebackUnit::getInstructionsWrittenCount() const { ++ return instructionsWritten_; ++} ++ ++std::vector> WritebackUnit::getInstsForTrace() { ++ std::shared_ptr instr; ++ std::deque>::iterator it = committedInstsForTrace_.begin(); ++ while(it != committedInstsForTrace_.end()) { ++ instr = *it; ++ if (removeInstrOrderQ_(instr)) { ++ committedInstsForTrace_.erase(it); ++ return {instr}; ++ } ++ it++; ++ } ++ return {}; //committedInstsForTrace_; ++} ++void WritebackUnit::traceFinished() { ++ //committedInstsForTrace_.clear(); ++} ++ ++} // namespace pipeline_hi ++} // namespace simeng +diff --git a/src/tools/simeng/main.cc b/src/tools/simeng/main.cc +index fa9b58ba..f5cfa535 100644 +--- a/src/tools/simeng/main.cc ++++ b/src/tools/simeng/main.cc +@@ -10,7 +10,7 @@ + #include "simeng/version.hh" + + /** Tick the provided core model until it halts. 
*/ +-int simulate(simeng::Core& core, simeng::MemoryInterface& dataMemory, ++uint64_t simulate(simeng::Core& core, simeng::MemoryInterface& dataMemory, + simeng::MemoryInterface& instructionMemory) { + uint64_t iterations = 0; + +@@ -91,7 +91,7 @@ int main(int argc, char** argv) { + + // Run simulation + std::cout << "[SimEng] Starting...\n" << std::endl; +- int iterations = 0; ++ uint64_t iterations = 0; + auto startTime = std::chrono::high_resolution_clock::now(); + iterations = simulate(*core, *dataMemory, *instructionMemory); + +diff --git a/sst/SimEngCoreWrapper.cc b/sst/SimEngCoreWrapper.cc +index 45c1bdde..668439f5 100644 +--- a/sst/SimEngCoreWrapper.cc ++++ b/sst/SimEngCoreWrapper.cc +@@ -10,9 +10,85 @@ + + #include "Assemble.hh" + ++#include ++ + using namespace SST::SSTSimEng; + using namespace SST::Interfaces; + ++//For now just make sure that the code and data is loaded into memory ++// at the correct addresses instead of sending the entire process image ++void SimEngCoreWrapper::processMemoryImage() { ++ std::ifstream file(executablePath_, std::ios::binary); ++ if (!file.is_open()) { ++ return; ++ } ++ ++ char elfMagic[4] = {0x7f, 'E', 'L', 'F'}; ++ char fileMagic[4]; ++ file.read(fileMagic, 4); ++ if (std::memcmp(elfMagic, fileMagic, sizeof(elfMagic))) { ++ return; ++ } ++ ++ /** ++ * The fifth byte of the ELF Header identifies the architecture ++ * of the ELF binary i.e 32-bit or 64-bit. ++ */ ++ ++ // Check whether this is a 32-bit executable ++ char bitFormat; ++ file.read(&bitFormat, sizeof(bitFormat)); ++ if (bitFormat != ElfBitFormat::Format32) { ++ return; ++ } ++ struct Elf32Header { ++ uint32_t type; ++ uint32_t offset; ++ uint32_t virtualAddress; ++ uint32_t physicalAddress; ++ uint32_t fileSize; ++ uint32_t memorySize; ++ }; ++ uint32_t entryPoint32_; ++ std::vector headers32_; ++ ++ file.seekg(0x18); ++ file.read(reinterpret_cast(&entryPoint32_), sizeof(entryPoint32_)); ++ uint32_t headerOffset; ++ file.read(reinterpret_cast(&headerOffset), sizeof(headerOffset)); ++ file.seekg(0x2a); ++ uint16_t headerEntrySize; ++ file.read(reinterpret_cast(&headerEntrySize), sizeof(headerEntrySize)); ++ uint16_t headerEntries; ++ file.read(reinterpret_cast(&headerEntries), sizeof(headerEntries)); ++ headers32_.resize(headerEntries); ++ // Loop over all headers and extract them. 
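++  // Each entry is read as six consecutive 32-bit fields (type, offset, vaddr,
++  // paddr, filesz, memsz) located at headerOffset + (i * headerEntrySize).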
++ for (size_t i = 0; i < headerEntries; i++) { ++ file.seekg(headerOffset + (i * headerEntrySize)); ++ auto& header = headers32_[i]; ++ ++ const int fieldBytes = 4; ++ file.read(reinterpret_cast(&(header.type)), sizeof(header.type)); ++ file.read(reinterpret_cast(&(header.offset)), fieldBytes); ++ file.read(reinterpret_cast(&(header.virtualAddress)), fieldBytes); ++ file.read(reinterpret_cast(&(header.physicalAddress)), fieldBytes); ++ file.read(reinterpret_cast(&(header.fileSize)), fieldBytes); ++ file.read(reinterpret_cast(&(header.memorySize)), fieldBytes); ++ } ++ // Process headers; only observe LOAD sections for this basic implementation ++ for (const auto& header : headers32_) { ++ if (header.type == 1) { // LOAD ++ char* imagePointer; ++ imagePointer = (char*)calloc(header.memorySize, sizeof(char)); ++ file.seekg(header.offset); ++ file.read(imagePointer, header.fileSize); ++ dataMemory_->sendProcessImageToSST(imagePointer, header.memorySize, header.virtualAddress); ++ } ++ } ++ std::cout << "[SSTSimEng:SimEngCoreWrapper] Done exporting elf data into SST memory" << std::endl; ++ //assert(false && "Incomplete implementation"); ++} ++ + SimEngCoreWrapper::SimEngCoreWrapper(SST::ComponentId_t id, SST::Params& params) + : SST::Component(id) { + output_.init("[SSTSimEng:SimEngCoreWrapper] " + getName() + ":@p:@l ", 999, 0, +@@ -95,7 +171,7 @@ void SimEngCoreWrapper::finish() { + std::cout << "[SimEng] " << key << ": " << value << "\n"; + } + +- std::cout << "\n[SimEng] Finished " << iterations_ << " ticks in " << duration ++ std::cout << "\n[SimEng] Finished " << std::dec << iterations_ << " ticks in " << duration + << "ms (" << std::round(khz) << " kHz, " << std::setprecision(2) + << mips << " MIPS)" << std::endl; + } +@@ -284,13 +360,13 @@ void SimEngCoreWrapper::fabricateSimEngCore() { + : std::make_unique( + a64fxConfigPath_, executablePath_, executableArgs_); + } +- if (coreInstance_->getSimulationMode() != ++ /*if (coreInstance_->getSimulationMode() != + simeng::SimulationMode::OutOfOrder) { + output_.verbose(CALL_INFO, 1, 0, + "SimEng currently only supports Out-of-Order " + "archetypes with SST."); + std::exit(EXIT_FAILURE); +- } ++ }*/ + // Set the SST data memory SimEng should use + coreInstance_->setL1DataMemory(dataMemory_); + +@@ -303,7 +379,7 @@ void SimEngCoreWrapper::fabricateSimEngCore() { + + // This check ensures that SST has enough memory to store the entire + // processImage constructed by SimEng. +- if (maxAddrMemory_ < coreInstance_->getProcessImageSize()) { ++ /*if (maxAddrMemory_ < coreInstance_->getProcessImageSize()) { + output_.verbose( + CALL_INFO, 1, 0, + "Error: SST backend memory is less than processImage size. " +@@ -312,7 +388,7 @@ void SimEngCoreWrapper::fabricateSimEngCore() { + "\'addr_range_end\'. \n"); + primaryComponentOKToEndSim(); + std::exit(EXIT_FAILURE); +- } ++ }*/ + // If testing is enabled populate heap if heap values have been specified. 
+ #ifdef SIMENG_ENABLE_SST_TESTS + if (heapStr_ != "") { +@@ -320,8 +396,10 @@ void SimEngCoreWrapper::fabricateSimEngCore() { + } + #endif + // Send the process image data over to the SST memory +- dataMemory_->sendProcessImageToSST(coreInstance_->getProcessImage().get(), +- coreInstance_->getProcessImageSize()); ++ //dataMemory_->sendProcessImageToSST(coreInstance_->getProcessImage().get(), ++ // coreInstance_->getProcessImageSize()); ++ ++ processMemoryImage(); + + output_.verbose(CALL_INFO, 1, 0, "SimEng core setup successfully.\n"); + // Print out build metadata +@@ -356,4 +434,4 @@ std::vector SimEngCoreWrapper::splitHeapStr() { + } + out.push_back(static_cast(std::stoull(acc))); + return out; +-} +\ No newline at end of file ++} +diff --git a/sst/SimEngMemInterface.cc b/sst/SimEngMemInterface.cc +index 4e07801f..678d9853 100644 +--- a/sst/SimEngMemInterface.cc ++++ b/sst/SimEngMemInterface.cc +@@ -18,7 +18,7 @@ SimEngMemInterface::SimEngMemInterface(StandardMem* mem, uint64_t cl, + this->debug_ = debug; + }; + +-void SimEngMemInterface::sendProcessImageToSST(char* image, uint64_t size) { ++void SimEngMemInterface::sendProcessImageToSST(char* image, uint64_t size, uint64_t startAddr) { + std::vector data; + data.reserve(size); + +@@ -26,7 +26,8 @@ void SimEngMemInterface::sendProcessImageToSST(char* image, uint64_t size) { + data.push_back((uint8_t)image[i]); + } + +- StandardMem::Request* req = new StandardMem::Write(0, data.size(), data); ++ StandardMem::Request* req = new StandardMem::Write(startAddr, data.size(), data); ++ std::cout << std::hex << "[SSTSimEng:SimEngMemInterface] Sending image section to SST Memory at address 0x" << startAddr << ", size 0x" << data.size() << std::endl; + sstMem_->sendUntimedData(req); + return; + }; +@@ -176,7 +177,7 @@ void SimEngMemInterface::requestRead(const MemoryAccessTarget& target, + if (debug_) { + std::cout << "[SSTSimEng:SSTDebug] MemRead" + << "-read-request-" << requestId << "-cycle-" << tickCounter_ +- << "-split-" << requests.size() << std::endl; ++ << "-split-" << requests.size() << "-addr-0x" << std::hex << addrStart << std::endl; + } + for (StandardMem::Request* req : requests) { + sstMem_->send(req); +@@ -192,7 +193,11 @@ void SimEngMemInterface::requestWrite(const MemoryAccessTarget& target, + AggregateWriteRequest* aggrReq = new AggregateWriteRequest(target, data); + std::vector requests = + makeSSTRequests(aggrReq, addrStart, addrEnd, size); +- ++ if (debug_) { ++ std::cout << "[SSTSimEng:SSTDebug] MemWrite" ++ << "-write-request-xx" << "-cycle-" << tickCounter_ ++ << "-split-" << requests.size() << "-addr-0x" << std::hex << addrStart << std::endl; ++ } + for (StandardMem::Request* req : requests) { + sstMem_->send(req); + } +diff --git a/sst/config/mcu_int_example_config.py b/sst/config/mcu_int_example_config.py +new file mode 100644 +index 00000000..fdd3b968 +--- /dev/null ++++ b/sst/config/mcu_int_example_config.py +@@ -0,0 +1,74 @@ ++import sst ++import os ++ ++DEBUG_L1 = 1 ++DEBUG_MEM = 1 ++DEBUG_LEVEL = 1 ++ ++clw = "32" ++ ++# Assume this is run from SimEng root dir ++simeng_path = os.getcwd() ++binary_file = simeng_path + "/share/dhrystone_rv32imc/memory.elf" # Apply the appropriate binary ++config_file = simeng_path + "/configs/DEMO_RISCV32_mcu_sst.yaml" ++ ++# Define the simulation components ++cpu = sst.Component("core", "sstsimeng.simengcore") ++cpu.addParams({ ++ "simeng_config_path": config_file, ++ "executable_path": binary_file, ++ "executable_args": "", ++ "clock" : "1GHz", ++ "max_addr_memory": 
4*1024*1024*1024-1, ++ "cache_line_width": clw, ++ "source": "", ++ "assemble_with_source": False, ++ "heap": "", ++ "debug": False ++}) ++ ++iface = cpu.setSubComponent("memory", "memHierarchy.standardInterface") ++ ++l1cache = sst.Component("l1cache.mesi", "memHierarchy.Cache") ++l1cache.addParams({ ++ "access_latency_cycles" : "1", ++ "cache_frequency" : "1Ghz", ++ "replacement_policy" : "nmru", ++ "coherence_protocol" : "MESI", ++ "associativity" : "4", ++ "cache_line_size" : clw, ++ "debug" : DEBUG_L1, ++ "debug_level" : DEBUG_LEVEL, ++ "L1" : "1", ++ "cache_size" : "32KiB" ++}) ++ ++# Explicitly set the link subcomponents instead of having cache figure them out based on connected port names ++l1toC = l1cache.setSubComponent("cpulink", "memHierarchy.MemLink") ++l1toM = l1cache.setSubComponent("memlink", "memHierarchy.MemLink") ++ ++# Memory controller ++memctrl = sst.Component("memory", "memHierarchy.MemController") ++memctrl.addParams({ ++ "clock" : "1GHz", ++ "request_width" : clw, ++ "debug" : DEBUG_MEM, ++ "debug_level" : DEBUG_LEVEL, ++ "addr_range_end" : 4*1024*1024*1024-1, ++}) ++Mtol1 = memctrl.setSubComponent("cpulink", "memHierarchy.MemLink") ++ ++# Memory model ++memory = memctrl.setSubComponent("backend", "memHierarchy.simpleMem") ++memory.addParams({ ++ "access_time" : "10ns", ++ "mem_size" : "4GiB", ++ "request_width": clw ++}) ++ ++# Define the simulation links ++link_cpu_cache_link = sst.Link("link_cpu_cache_link") ++link_cpu_cache_link.connect( (iface, "port", "0ps"), (l1toC, "port", "0ps") ) ++link_mem_bus_link = sst.Link("link_mem_bus_link") ++link_mem_bus_link.connect( (l1toM, "port", "0ps"), (Mtol1, "port", "0ps") ) ++ +diff --git a/sst/include/SimEngCoreWrapper.hh b/sst/include/SimEngCoreWrapper.hh +index cb53c0f5..fc841949 100644 +--- a/sst/include/SimEngCoreWrapper.hh ++++ b/sst/include/SimEngCoreWrapper.hh +@@ -141,6 +141,8 @@ class SimEngCoreWrapper : public SST::Component { + /** Method used to assemble SimEng core. */ + void fabricateSimEngCore(); + ++ void processMemoryImage(); ++ + /** Method to split the passed executable argument's string into a vector of + * individual arguments. */ + std::vector splitArgs(std::string argString); +@@ -210,7 +212,7 @@ class SimEngCoreWrapper : public SST::Component { + std::shared_ptr dataMemory_; + + /** Number of clock iterations. */ +- int iterations_; ++ uint64_t iterations_; + + /** Start time of simulation. */ + std::chrono::high_resolution_clock::time_point startTime_; +diff --git a/sst/include/SimEngMemInterface.hh b/sst/include/SimEngMemInterface.hh +index 79789a9f..463d0dc9 100644 +--- a/sst/include/SimEngMemInterface.hh ++++ b/sst/include/SimEngMemInterface.hh +@@ -33,7 +33,7 @@ class SimEngMemInterface : public MemoryInterface { + bool debug); + /** Send SimEng's processImage to SST memory backend during `init` lifecycle + * phase of SST. 
*/ +- void sendProcessImageToSST(char* image, uint64_t size); ++ void sendProcessImageToSST(char* image, uint64_t size, uint64_t startAddr=0); + + /** + * Construct an AggregatedReadRequest and use it to generate diff --git a/src/include/simeng/BranchPredictor.hh b/src/include/simeng/BranchPredictor.hh index 88be07dd3f..8d76f08753 100644 --- a/src/include/simeng/BranchPredictor.hh +++ b/src/include/simeng/BranchPredictor.hh @@ -46,6 +46,11 @@ class BranchPredictor { public: virtual ~BranchPredictor(){}; + /** Overload predict() with more information in parameters */ + virtual BranchPrediction predict(uint64_t address, BranchType type, + uint64_t knownTarget, uint8_t instByteLength) + = 0; + /** Generate a branch prediction for the specified instruction address with a * branch type and possible known target. */ virtual BranchPrediction predict(uint64_t address, BranchType type, diff --git a/src/include/simeng/CoreInstance.hh b/src/include/simeng/CoreInstance.hh index c8e151e884..e4d5b23248 100644 --- a/src/include/simeng/CoreInstance.hh +++ b/src/include/simeng/CoreInstance.hh @@ -16,6 +16,7 @@ #include "simeng/kernel/Linux.hh" #include "simeng/models/emulation/Core.hh" #include "simeng/models/inorder/Core.hh" +#include "simeng/models/mcu/Core.hh" #include "simeng/models/outoforder/Core.hh" #include "simeng/pipeline/A64FXPortAllocator.hh" #include "simeng/pipeline/BalancedPortAllocator.hh" @@ -37,7 +38,7 @@ uint32_t hex_[] = { namespace simeng { /** The available modes of simulation. */ -enum class SimulationMode { Emulation, InOrderPipelined, OutOfOrder }; +enum class SimulationMode { Emulation, InOrderPipelined, MCU, OutOfOrder }; /** A class to create a SimEng core instance from a supplied config. */ class CoreInstance { diff --git a/src/include/simeng/Elf.hh b/src/include/simeng/Elf.hh index 14bcddcb28..485debea60 100644 --- a/src/include/simeng/Elf.hh +++ b/src/include/simeng/Elf.hh @@ -2,6 +2,7 @@ #include #include +#include #include "simeng/span.hh" @@ -30,23 +31,85 @@ struct Elf32Header { uint32_t memorySize; }; +typedef struct { + unsigned char e_ident[16]; + uint16_t e_type; + uint16_t e_machine; + uint32_t e_version; + uint32_t e_entry; + uint32_t e_phoff; + uint32_t e_shoff; + uint32_t e_flags; + uint16_t e_ehsize; + uint16_t e_phentsize; + uint16_t e_phnum; + uint16_t e_shentsize; + uint16_t e_shnum; + uint16_t e_shstrndx; +} Elf32_Ehdr; + +typedef struct { + uint32_t p_type; + uint32_t p_offset; + uint32_t p_vaddr; + uint32_t p_paddr; + uint32_t p_filesz; + uint32_t p_memsz; + uint32_t p_flags; + uint32_t p_align; +} Elf32_Phdr; + +typedef struct { + uint32_t sh_name; + uint32_t sh_type; + uint32_t sh_flags; + uint32_t sh_addr; + uint32_t sh_offset; + uint32_t sh_size; + uint32_t sh_link; + uint32_t sh_info; + uint32_t sh_addralign; + uint32_t sh_entsize; +} Elf32_Shdr; + +typedef struct { + uint32_t st_name; + uint32_t st_value; + uint32_t st_size; + unsigned char st_info; + unsigned char st_other; + uint16_t st_shndx; +} Elf32_Sym; + +enum ElfPhType { + PT_NULL, + PT_LOAD +}; + +enum ElfShType { + SHT_NULL, + SHT_PROGBITS, + SHT_SYMTAB, + SHT_STRTAB +}; + /** A processed Executable and Linkable Format (ELF) file. 
*/ class Elf { - public: - Elf(std::string path, char** imagePointer); - ~Elf(); - uint64_t getProcessImageSize() const; - bool isValid() const; - uint64_t getEntryPoint() const; - - private: - uint64_t entryPoint_; - std::vector headers_; - uint32_t entryPoint32_; - std::vector headers32_; - bool isValid_ = false; - uint64_t processImageSize_; - bool mode32bit_; + public: + Elf(std::string path, char** imagePointer, std::unordered_map& symbols); + ~Elf(); + uint64_t getProcessImageSize() const; + bool isValid() const; + uint64_t getEntryPoint() const; + + private: + uint64_t entryPoint_; + std::vector headers_; + uint32_t entryPoint32_; + std::vector headers32_; + bool isValid_ = false; + uint64_t processImageSize_; + bool mode32bit_; }; } // namespace simeng diff --git a/src/include/simeng/GenericPredictor.hh b/src/include/simeng/GenericPredictor.hh index 21df57a4a5..aff5ade8fe 100644 --- a/src/include/simeng/GenericPredictor.hh +++ b/src/include/simeng/GenericPredictor.hh @@ -26,6 +26,9 @@ class GenericPredictor : public BranchPredictor { GenericPredictor(YAML::Node config); ~GenericPredictor(); + BranchPrediction predict(uint64_t address, BranchType type, + uint64_t knownTarget, uint8_t byteLength) override; + /** Generate a branch prediction for the supplied instruction address, a * branch type, and a known target if not 0. Returns a branch direction and * branch target address. */ diff --git a/src/include/simeng/Instruction.hh b/src/include/simeng/Instruction.hh index 8b1cf2f9db..9ffc4a8d27 100644 --- a/src/include/simeng/Instruction.hh +++ b/src/include/simeng/Instruction.hh @@ -23,6 +23,9 @@ class Instruction { * instruction. */ bool exceptionEncountered() const; + /** Binds an interrupt to this instruction */ + virtual void raiseInterrupt(int16_t& interruptId) {} + /** Retrieve the source registers this instruction reads. */ virtual const span getOperandRegisters() const = 0; @@ -99,8 +102,8 @@ class Instruction { /** Retrieve branch type. */ virtual BranchType getBranchType() const = 0; - /** Retrieve a branch target from the instruction's metadata if known. */ - virtual uint64_t getKnownTarget() const = 0; + /** Retrieve an offset of branch target from the instruction's metadata if known. */ + virtual uint64_t getKnownOffset() const = 0; /** Is this a store address operation (a subcategory of store operations which * deal with the generation of store addresses to store data at)? */ @@ -178,6 +181,12 @@ class Instruction { /** Get arbitrary micro-operation index. */ int getMicroOpIndex() const; + bool isDiv() const; + + bool isMul() const; + + bool isSysCall() const; + protected: /** Whether an exception has been encountered. */ bool exceptionEncountered_ = false; @@ -208,8 +217,8 @@ class Instruction { /** What type of branch this instruction is. */ BranchType branchType_ = BranchType::Unknown; - /** If the branch target is known at the time of decode, store it. */ - uint64_t knownTarget_ = 0; + /** If the offset of branch target is known at the time of decode, store it. */ + uint64_t knownOffset_ = 0; // Flushing /** This instruction's sequence ID; a higher ID represents a chronologically @@ -252,6 +261,12 @@ class Instruction { /** An arbitrary index value for the micro-operation. Its use is based on the * implementation of specific micro-operations. 
*/ int microOpIndex_; + + bool isMul_ = false; + + bool isDiv_ = false; + + bool isSysCall_ = false; }; } // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/Architecture.hh b/src/include/simeng/arch/Architecture.hh index edd404c827..29874c6d69 100644 --- a/src/include/simeng/arch/Architecture.hh +++ b/src/include/simeng/arch/Architecture.hh @@ -101,6 +101,9 @@ class Architecture { /** Returns the maximum size of a valid instruction in bytes. */ virtual uint8_t getMaxInstructionSize() const = 0; + /** Returns the minimum size of a valid instruction in bytes. */ + virtual uint8_t getMinInstructionSize() const = 0; + /** Returns the physical register structure as defined within the config * file */ @@ -113,7 +116,7 @@ class Architecture { YAML::Node config) const = 0; /** Updates System registers of any system-based timers. */ - virtual void updateSystemTimerRegisters(RegisterFileSet* regFile, + virtual int16_t updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations) const = 0; /** Update trace file */ diff --git a/src/include/simeng/arch/aarch64/Architecture.hh b/src/include/simeng/arch/aarch64/Architecture.hh index ad14dc1c0e..3c1ce27f59 100644 --- a/src/include/simeng/arch/aarch64/Architecture.hh +++ b/src/include/simeng/arch/aarch64/Architecture.hh @@ -51,6 +51,9 @@ class Architecture : public arch::Architecture { /** Returns the maximum size of a valid instruction in bytes. */ uint8_t getMaxInstructionSize() const override; + /** Returns the minimum size of a valid instruction in bytes. */ + uint8_t getMinInstructionSize() const override; + /** Returns the current vector length set by the provided configuration. */ uint64_t getVectorLength() const; @@ -59,7 +62,7 @@ class Architecture : public arch::Architecture { uint64_t getStreamingVectorLength() const; /** Updates System registers of any system-based timers. */ - void updateSystemTimerRegisters(RegisterFileSet* regFile, + int16_t updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations) const override; /** Returns the physical register structure as defined within the config file diff --git a/src/include/simeng/arch/aarch64/Instruction.hh b/src/include/simeng/arch/aarch64/Instruction.hh index 43d1bd4961..bffa3c627e 100644 --- a/src/include/simeng/arch/aarch64/Instruction.hh +++ b/src/include/simeng/arch/aarch64/Instruction.hh @@ -301,7 +301,7 @@ class Instruction : public simeng::Instruction { BranchType getBranchType() const override; /** Retrieve a branch target from the instruction's metadata if known. */ - uint64_t getKnownTarget() const override; + uint64_t getKnownOffset() const override; /** Is this a store address operation (a subcategory of store operations which * deal with the generation of store addresses to store data at)? 
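updateSystemTimerRegisters() changing from void to int16_t implies the core's tick path now polls it for a pending interrupt id. A minimal sketch of that pattern, assuming the member names used by the cores later in this diff and -1 meaning "no interrupt":

    // Once per tick: let the architecture advance its timers and latch any
    // interrupt id it reports so it can be bound to an instruction later.
    int16_t id = isa_.updateSystemTimerRegisters(&registerFileSet_, ticks_);
    if (id >= 0) {
      interruptId_ = id;
    }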
*/ diff --git a/src/include/simeng/arch/riscv/Architecture.hh b/src/include/simeng/arch/riscv/Architecture.hh index de6c76c71e..3bdb6287e9 100644 --- a/src/include/simeng/arch/riscv/Architecture.hh +++ b/src/include/simeng/arch/riscv/Architecture.hh @@ -6,27 +6,18 @@ #include #include "simeng/arch/Architecture.hh" -#include "simeng/arch/riscv/ExceptionHandler.hh" + #include "simeng/arch/riscv/Instruction.hh" #include "simeng/kernel/Linux.hh" using csh = size_t; +#include "simeng/arch/riscv/SystemRegister.hh" +#include "simeng/arch/riscv/ExceptionHandler.hh" + namespace simeng { namespace arch { namespace riscv { - -enum riscv_sysreg { - SYSREG_MSTATUS = 0x300, - SYSREG_MSTATUSH = 0x310, - SYSREG_MEPC = 0x341, - SYSREG_MCAUSE = 0x342, - SYSREG_MHARTID = 0xF14, - SYSREG_CYCLE = 0xC00, - SYSREG_TIME = 0xC01, - SYSREG_INSTRRET = 0xC02 -}; - struct constantsPool { const uint8_t alignMask = 0x3; const uint8_t alignMaskCompressed = 0x1; @@ -45,7 +36,7 @@ struct archConstants { /* A basic RISC-V implementation of the `Architecture` interface. */ class Architecture : public arch::Architecture { public: - Architecture(kernel::Linux& kernel, YAML::Node config); + Architecture(kernel::Linux& kernel, YAML::Node config, std::shared_ptr& dataMemory); ~Architecture(); /** Pre-decode instruction memory into a macro-op of `Instruction` * instances. Returns the number of bytes consumed to produce it (always 4), @@ -60,6 +51,9 @@ class Architecture : public arch::Architecture { /** Returns a zero-indexed register tag for a system register encoding. */ int32_t getSystemRegisterTag(uint16_t reg) const override; + /** Returns a System Register index from a system register tag. */ + uint16_t getSystemRegisterIdFromTag(int32_t tag) const; + /** Returns the number of system registers that have a mapping. */ uint16_t getNumSystemRegisters() const override; @@ -77,8 +71,11 @@ class Architecture : public arch::Architecture { /** Returns the maximum size of a valid instruction in bytes. */ uint8_t getMaxInstructionSize() const override; - /** Updates System registers of any system-based timers. */ - void updateSystemTimerRegisters(RegisterFileSet* regFile, + /** Returns the minimum size of a valid instruction in bytes. */ + uint8_t getMinInstructionSize() const override; + + /** Updates System registers of any system-based timers. Return +ve id if interrupt occurs */ + int16_t updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations) const override; /** Returns the physical register structure as defined within the config file @@ -117,6 +114,18 @@ class Architecture : public arch::Architecture { /** A mapping from system register encoding to a zero-indexed tag. */ std::unordered_map systemRegisterMap_; + /** Ordered map of memory mapped system regsiters banks **/ + std::map memoryMappedSystemRegisterBlocks; + + /* Memory Interface through which memory mapped system registers are accessed */ + std::shared_ptr systemRegisterMemoryInterface; + + /* Optional Clint block which replicates that functionality in spike */ + std::shared_ptr clint; + + /* Optional Host Target Interface block which replicates that functionality in spike */ + std::shared_ptr htif; + /** A map to hold the relationship between aarch64 instruction groups and * user-defined execution information. 
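The new block map, wrapper memory interface, and clint/htif members suggest the Architecture constructor registers each memory-mapped block at its base address and then wraps the supplied data memory. A sketch of that wiring, assuming the map holds shared pointers and using the Clint and SystemRegisterMemoryInterface types declared in SystemRegister.hh further down this diff:

    // Construction-time wiring (details assumed, not taken from the patch).
    clint = std::make_shared<Clint>(*this);
    memoryMappedSystemRegisterBlocks[Clint::CLINT_BASE] = clint;
    // Loads/stores that fall inside a registered block are serviced by the
    // block; everything else is forwarded to the real data memory.
    systemRegisterMemoryInterface =
        std::make_shared<SystemRegisterMemoryInterface>(
            dataMemory, memoryMappedSystemRegisterBlocks);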
*/ std::unordered_map groupExecutionInfo_; diff --git a/src/include/simeng/arch/riscv/ExceptionHandler.hh b/src/include/simeng/arch/riscv/ExceptionHandler.hh index 02d29c93bb..36cfd5d187 100644 --- a/src/include/simeng/arch/riscv/ExceptionHandler.hh +++ b/src/include/simeng/arch/riscv/ExceptionHandler.hh @@ -57,6 +57,9 @@ class ExceptionHandler : public simeng::arch::ExceptionHandler { */ bool readBufferThen(uint64_t ptr, uint64_t length, std::function then, bool firstCall = true); + + /** generate system register changes associated with taking an exception **/ + void takeException(uint64_t causecode); /** A data buffer used for reading data from memory. */ std::vector dataBuffer; diff --git a/src/include/simeng/arch/riscv/Instruction.hh b/src/include/simeng/arch/riscv/Instruction.hh index 3f023d28b6..60966ce044 100644 --- a/src/include/simeng/arch/riscv/Instruction.hh +++ b/src/include/simeng/arch/riscv/Instruction.hh @@ -48,7 +48,8 @@ enum class InstructionException { HypervisorCall, SecureMonitorCall, UnmappedSysReg, - NoAvailablePort + NoAvailablePort, + Interrupt }; enum CInstructionFormat { @@ -87,6 +88,18 @@ class Instruction : public simeng::Instruction { * processing this instruction. */ virtual InstructionException getException() const; + /** Raise an interrupt. */ + void raiseInterrupt(int16_t& interruptId) + { + interruptId_ = interruptId; + exceptionEncountered_ = true; + exception_ = InstructionException::Interrupt; + interruptId = -1; + } + + /** Get Id of this interrupr */ + int16_t getInterruptId() const { return interruptId_; } + /** Retrieve the source registers this instruction reads. */ const span getOperandRegisters() const override; @@ -139,8 +152,8 @@ class Instruction : public simeng::Instruction { /** Retrieve branch type. */ BranchType getBranchType() const override; - /** Retrieve a branch target from the instruction's metadata if known. */ - uint64_t getKnownTarget() const override; + /** Retrieve an offset of branch target from the instruction's metadata if known. */ + uint64_t getKnownOffset() const override; /** Is this a store address operation (a subcategory of store operations which * deal with the generation of store addresses to store data at)? */ @@ -186,6 +199,8 @@ class Instruction : public simeng::Instruction { /** ONLY valid after decode. Return regByteWidth */ uint8_t getArchRegWidth() const; + const Architecture& getArchitecture() const; + private: /** The maximum number of source registers any supported RISC-V instruction * can have. 
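raiseInterrupt() marks the instruction as having encountered an Interrupt exception and consumes the pending id (resetting the reference to -1), so the core presumably binds a pending interrupt to the next instruction it handles. A hedged sketch of that call site; the macroOp and interruptId_ names are assumed from the cores elsewhere in this diff:

    // In the core's tick path, before processing the next macro-op:
    if (interruptId_ >= 0 && !macroOp.empty()) {
      // The RISC-V override records the id, flags an Interrupt exception and
      // clears interruptId_ back to -1; the normal exception route then runs.
      macroOp[0]->raiseInterrupt(interruptId_);
    }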
*/ @@ -292,7 +307,9 @@ class Instruction : public simeng::Instruction { std::vector memoryData; /** Return integer register value, to support both 32-bit and 64-bit mode */ - int64_t getSignedInt(RegisterValue& value) const; + int64_t getSignedInt(RegisterValue& value) const; + + int16_t interruptId_; }; } // namespace riscv diff --git a/src/include/simeng/arch/riscv/SystemRegister.hh b/src/include/simeng/arch/riscv/SystemRegister.hh new file mode 100644 index 0000000000..0556156ef6 --- /dev/null +++ b/src/include/simeng/arch/riscv/SystemRegister.hh @@ -0,0 +1,229 @@ +#pragma once + +#include +#include +#include +#include + +#include "simeng/arch/Architecture.hh" + +#include "simeng/arch/riscv/Instruction.hh" +#include "simeng/kernel/Linux.hh" + +namespace simeng { +namespace arch { +namespace riscv { + +// Should probably move to Capstone + +enum riscv_sysreg { + SYSREG_MSTATUS = 0x300, + SYSREG_MIE = 0x304, + SYSREG_MTVEC = 0x305, + SYSREG_MSTATUSH = 0x310, + SYSREG_MSCRATCH = 0x340, + SYSREG_MEPC = 0x341, + SYSREG_MCAUSE = 0x342, + SYSREG_MHARTID = 0xF14, + SYSREG_MXCPTSC = 0xFC2, + SYSREG_CYCLE = 0xC00, + SYSREG_TIME = 0xC01, + SYSREG_INSTRRET = 0xC02 +}; + +enum riscv_causecode_enum { + CAUSE_IADDRESS_MISALIGN = 0, + CAUSE_IACCESS_FAULT = 1, + CAUSE_ILLEGAL_INSTRUCTION = 2, + CAUSE_BREAKPOINT = 3, + CAUSE_LDADDRESS_MISALIGN = 4, + CAUSE_LDACCESS_FAULT = 5, + CAUSE_STADDRESS_MISALIGN = 6, + CAUSE_STACCESS_FAULT = 7, + CAUSE_ECALL_FROM_M = 11 +}; + +enum class InterruptId { + HALT = 1, + TIMER = 7 +}; + +enum riscv_sysreg_masks { + MSTATUS_MIE_MASK = 0x8, + MSTATUS_MPIE_MASK = 0x80 +}; + +typedef uint16_t riscv_causecode; + +class MemoryMappedSystemRegister { + public: + MemoryMappedSystemRegister(const RegisterValue& val) : state(val) {} + bool size() { return state.size(); } + virtual void put(const RegisterValue& val) { state = val; } + virtual const RegisterValue& get() { return state; } + private: + RegisterValue state; +}; + +class MemoryMappedSystemRegisterBlock { + public: + MemoryMappedSystemRegisterBlock(size_t sz) : size_(sz) {} + size_t size() { return size_; } + virtual bool put(uint16_t, const RegisterValue&); + virtual bool get(uint16_t, RegisterValue&); + virtual void tick() {} + protected: + /** Ordered map of memory mapped system regsiters **/ + std::map memoryMappedSystemRegisters; + size_t size_; +}; + +class SystemRegisterMemoryInterface : public MemoryInterface { + public: + SystemRegisterMemoryInterface( + std::shared_ptr& dataMemory, + std::map& memoryMappedSystemRegisterBlocks + ) : + dataMemory_(dataMemory), + memoryMappedSystemRegisterBlocks_(memoryMappedSystemRegisterBlocks) + {} + + /** Request a read from the supplied target location. */ + virtual void requestRead(const MemoryAccessTarget& target, + uint64_t requestId = 0) + { + RegisterValue data(0,target.size); + if (getMemoryMappedSystemRegister(target.address, data)) + completedReads_.push_back({target, data, requestId}); + else + dataMemory_.get()->requestRead(target,requestId); + } + + /** Request a write of `data` to the target location. */ + virtual void requestWrite(const MemoryAccessTarget& target, + const RegisterValue& data) + { + if (!putMemoryMappedSystemRegister(target.address, data)) + dataMemory_.get()->requestWrite(target,data); + } + + /** Retrieve all completed read requests. 
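The mstatus masks and the mepc/mcause/mtvec encodings above are exactly what ExceptionHandler::takeException() needs for an M-mode trap. A sketch of the architecturally defined entry sequence it presumably implements; getCSR/setCSR, faultingPC and causecode are stand-in names, not APIs from this patch:

    uint32_t mstatus = getCSR(SYSREG_MSTATUS);
    uint32_t oldMIE  = (mstatus & MSTATUS_MIE_MASK) ? 1u : 0u;
    mstatus &= ~(MSTATUS_MIE_MASK | MSTATUS_MPIE_MASK);
    mstatus |= (oldMIE << 7);           // MPIE <- previous MIE (bit 7)
    setCSR(SYSREG_MSTATUS, mstatus);    // MIE (bit 3) is now 0: traps masked
    setCSR(SYSREG_MEPC, faultingPC);    // where an eventual mret resumes
    setCSR(SYSREG_MCAUSE, causecode);   // e.g. CAUSE_ECALL_FROM_M
    uint32_t trapPC = getCSR(SYSREG_MTVEC) & ~0x3u;  // direct-mode vector base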
*/ + virtual const span getCompletedReads() const + { + if (completedReads_.empty()) + return dataMemory_.get()->getCompletedReads(); + else + return {const_cast(completedReads_.data()), completedReads_.size()}; + } + + /** Clear the completed reads. */ + virtual void clearCompletedReads() + { + if (completedReads_.empty()) + dataMemory_.get()->clearCompletedReads(); + else + completedReads_.clear(); + } + + /** Returns true if there are any oustanding memory requests in-flight. */ + virtual bool hasPendingRequests() const + { + return dataMemory_.get()->hasPendingRequests(); + } + + /** Tick the memory interface to allow it to process internal tasks. + * + * TODO: Move ticking out of the memory interface and into a central "memory + * system" covering a set of related interfaces. + */ + virtual void tick() + { + dataMemory_.get()->tick(); + } + + private : + /** Put/Get Memory Mapped Registers */ + bool putMemoryMappedSystemRegister(uint64_t address, const RegisterValue& value); + bool getMemoryMappedSystemRegister(uint64_t address, RegisterValue& value); + + std::shared_ptr dataMemory_; + + /** Address map of all system register blocks */ + std::map& memoryMappedSystemRegisterBlocks_; + + /** A vector containing all completed read requests. */ + std::vector completedReads_; +}; + +class Architecture; + +class HostTargetInterface : public MemoryMappedSystemRegisterBlock { + public: + enum { + PAYLOAD_OFFSET = 0, + DEVICEID_OFFSET = 4 + }; + + HostTargetInterface(Architecture& architecture) + : + MemoryMappedSystemRegisterBlock(8), + architecture_(architecture), + isHalted_(false) + { + memoryMappedSystemRegisters[PAYLOAD_OFFSET] = new MemoryMappedSystemRegister(static_cast(0)); + memoryMappedSystemRegisters[DEVICEID_OFFSET] = new MemoryMappedSystemRegister(static_cast(0)); + } + + bool put(uint16_t offset, const RegisterValue&value); + + int16_t updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations) { + if (isHalted_) + return static_cast(InterruptId::HALT); + return -1; + } + + private : + Architecture& architecture_; + bool isHalted_; +}; + +class Clint : public MemoryMappedSystemRegisterBlock { + public: + enum { + CLINT_BASE = 0x02000000, + CLINT_SIZE = 0x0000c000, + MTIMECMP_OFFSET = 0x4000, + MTIME_OFFSET = 0xbff8 + }; + + Clint(Architecture& architecture) + : + MemoryMappedSystemRegisterBlock(CLINT_SIZE), + architecture_(architecture), + mtime_(static_cast(0)), + mtimecmp_(static_cast(0)), + mtime_freq(100), + mtime_count(0), + last_tick(0) + { + memoryMappedSystemRegisters[MTIME_OFFSET] = &mtime_; + memoryMappedSystemRegisters[MTIMECMP_OFFSET] = &mtimecmp_; + } + + int16_t updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations); + + private : + Architecture& architecture_; + + MemoryMappedSystemRegister mtime_; + MemoryMappedSystemRegister mtimecmp_; + + uint32_t mtime_freq; + uint32_t mtime_count; + uint64_t last_tick; +}; + + +} // namespace riscv +} // namespace arch +} // namespace simeng diff --git a/src/include/simeng/kernel/Linux.hh b/src/include/simeng/kernel/Linux.hh index 0908d59006..635bd427d5 100644 --- a/src/include/simeng/kernel/Linux.hh +++ b/src/include/simeng/kernel/Linux.hh @@ -93,6 +93,8 @@ struct LinuxProcessState { std::vector fileDescriptorTable; /** Set of deallocated virtual file descriptors available for reuse. */ std::set freeFileDescriptors; + /** Pointer to LinuxProcess from which ProcessState derived*/ + const LinuxProcess* process; }; /** Fixed-width definition of 'rusage' (from ). 
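The Clint members mirror spike's machine timer, so its updateSystemTimerRegisters() most likely scales core iterations into mtime and compares against mtimecmp. A hedged sketch of that behaviour; the scaling policy and register widths are assumptions:

    // Advance mtime once every mtime_freq elapsed iterations, then raise the
    // machine timer interrupt once mtime has reached mtimecmp.
    mtime_count += static_cast<uint32_t>(iterations - last_tick);
    last_tick = iterations;
    if (mtime_count >= mtime_freq) {
      mtime_count -= mtime_freq;
      uint64_t t = mtime_.get().get<uint64_t>() + 1;
      mtime_.put(RegisterValue(t, 8));
    }
    if (mtime_.get().get<uint64_t>() >= mtimecmp_.get().get<uint64_t>())
      return static_cast<int16_t>(InterruptId::TIMER);
    return -1;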
*/ @@ -236,6 +238,9 @@ class Linux { /** The maximum size of a filesystem path. */ static const size_t LINUX_PATH_MAX = 4096; + /** Lookup symbol value from table in elf file. */ + bool lookupSymbolValue(const std::string symbol, uint64_t& value); + private: /** Resturn correct Dirfd depending on given pathname abd dirfd given to * syscall. */ diff --git a/src/include/simeng/kernel/LinuxProcess.hh b/src/include/simeng/kernel/LinuxProcess.hh index 9796b52937..d6b2c4a967 100644 --- a/src/include/simeng/kernel/LinuxProcess.hh +++ b/src/include/simeng/kernel/LinuxProcess.hh @@ -77,6 +77,9 @@ class LinuxProcess { /** Check whether the process image was created successfully. */ bool isValid() const; + /** Lookup symbol value from table in elf file. */ + bool lookupSymbolValue(const std::string symbol, uint64_t& value) const; + private: /** The size of the stack, in bytes. */ const uint64_t STACK_SIZE; @@ -113,6 +116,8 @@ class LinuxProcess { /** Shared pointer to processImage. */ std::shared_ptr processImage_; + + std::unordered_map symbols_; }; } // namespace kernel diff --git a/src/include/simeng/models/emulation/Core.hh b/src/include/simeng/models/emulation/Core.hh index c4a4acc453..1db10d2381 100644 --- a/src/include/simeng/models/emulation/Core.hh +++ b/src/include/simeng/models/emulation/Core.hh @@ -108,6 +108,9 @@ class Core : public simeng::Core { /** The number of branches executed. */ uint64_t branchesExecuted_ = 0; + + /** Set to interruptId when interrupt occurs, otherwise -1 */ + int16_t interruptId_; }; } // namespace emulation diff --git a/src/include/simeng/models/mcu/Core.hh b/src/include/simeng/models/mcu/Core.hh new file mode 100644 index 0000000000..de6a53d3ca --- /dev/null +++ b/src/include/simeng/models/mcu/Core.hh @@ -0,0 +1,181 @@ +#pragma once + +#include + +#include "simeng/ArchitecturalRegisterFileSet.hh" +#include "simeng/Core.hh" +#include "simeng/FlatMemoryInterface.hh" +#include "simeng/pipeline_hi/DecodeUnit.hh" +#include "simeng/pipeline_hi/ExecuteUnit.hh" +#include "simeng/pipeline_hi/FetchUnit.hh" +#include "simeng/pipeline_hi/WritebackUnit.hh" +#include "simeng/pipeline_hi/StaticPredictor.hh" +#include "simeng/pipeline_hi/LoadStoreQueue.hh" +#include "simeng/pipeline_hi/RegDepMap.hh" + +#include "simeng/arch/riscv/Architecture.hh" + +namespace simeng { +namespace models { +namespace mcu { + +/** An entry in the reservation station. */ +struct dependencyEntry1 { + /** The instruction to execute. */ + std::shared_ptr uop; + + /** The operand waiting on a value. */ + uint16_t operandIndex; +}; + +/** A simple scalar in-order pipelined core model. */ +class Core : public simeng::Core { + public: + /** Construct a core model, providing an ISA and branch predictor to use, + * along with a pointer and size of instruction memory, and a pointer to + * process memory. */ + Core(MemoryInterface& instructionMemory, MemoryInterface& dataMemory, + uint64_t processMemorySize, uint64_t entryPoint, + const arch::Architecture& isa, BranchPredictor& branchPredictor, YAML::Node config); + + /** Tick the core. Ticks each of the pipeline stages sequentially, then ticks + * the buffers between them. Checks for and executes pipeline flushes at the + * end of each cycle. */ + void tick() override; + + /** Check whether the program has halted. */ + bool hasHalted() const override; + + /** Retrieve the architectural register file set. */ + const ArchitecturalRegisterFileSet& getArchitecturalRegisterFileSet() + const override; + + /** Retrieve the number of instructions retired. 
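Since the Elf constructor now fills a symbol map and LinuxProcess keeps it in symbols_, the new lookupSymbolValue() is presumably a straight map probe. A sketch, assuming the map is keyed by symbol name with uint64_t values:

    bool LinuxProcess::lookupSymbolValue(const std::string symbol,
                                         uint64_t& value) const {
      auto it = symbols_.find(symbol);
      if (it == symbols_.end()) return false;  // symbol not present in the ELF
      value = it->second;
      return true;
    }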
*/ + uint64_t getInstructionsRetiredCount() const override; + + /** Retrieve the simulated nanoseconds elapsed since the core started. */ + uint64_t getSystemTimer() const override; + + /** Generate a map of statistics to report. */ + std::map getStats() const override; + + private: + /** Raise an exception to the core, providing the generating instruction. */ + void raiseException(const std::shared_ptr& instruction); + + /** Handle an exception raised during the cycle. */ + void handleException(); + + /** Load and supply memory data requested by an instruction. */ + void loadData(const std::shared_ptr& instruction); + /** Store data supplied by an instruction to memory. */ + void storeData(const std::shared_ptr& instruction); + + /** Forward operands to the most recently decoded instruction. */ + void forwardOperands(const span& destinations, + const span& values); + + bool canIssue(const std::shared_ptr& instruction); + void removeDep(const std::shared_ptr& instruction); + + /** Read pending registers for the most recently decoded instruction. */ + void readRegisters(); + + /** Process the active exception handler. */ + void processExceptionHandler(); + + /** Apply changes to the process state. */ + void applyStateChange(const arch::ProcessStateChange& change); + + /** Handle requesting/execution of a load instruction. */ + void handleLoad(const std::shared_ptr& instruction); + + void addInstrOrderQ(const std::shared_ptr& instruction); + bool removeInstrOrderQ(const std::shared_ptr& instruction); + + /** The process memory. */ + MemoryInterface& dataMemory_; + + /** A reference to the core's architecture. */ + const arch::Architecture& isa_; + + /** The core's register file set. */ + RegisterFileSet registerFileSet_; + + /** An architectural register file set, serving as a simple wrapper around the + * register file set. */ + ArchitecturalRegisterFileSet architecturalRegisterFileSet_; + + /** The process memory. */ + span processMemory; + + /** The buffer between fetch and decode. */ + pipeline_hi::PipelineBuffer fetchToDecodeBuffer_; + + /** The buffer between decode and execute. */ + pipeline_hi::PipelineBuffer> decodeToExecuteBuffer_; + + /** The buffer between execute and writeback. */ + std::vector>> + completionSlots_; + + /** The previously generated addresses. */ + std::queue previousAddresses_; + + /** The register dependency map. */ + pipeline_hi::RegDepMap regDepMap_; + + /** The fetch unit; fetches instructions from memory. */ + pipeline_hi::FetchUnit fetchUnit_; + + /** The decode unit; decodes instructions into uops and reads operands. */ + pipeline_hi::DecodeUnit decodeUnit_; + + /** The execute unit; executes uops and sends to writeback, also forwarding + * results. */ + pipeline_hi::ExecuteUnit executeUnit_; + + /** The writeback unit; writes uop results to the register files. */ + pipeline_hi::WritebackUnit writebackUnit_; + + pipeline_hi::LoadStoreQueue loadStoreQueue_; + + /** The number of times the pipeline has been flushed. */ + uint64_t flushes_ = 0; + + /** The number of times this core has been ticked. */ + uint64_t ticks_ = 0; + + uint64_t lastCommitTick_ = 0; + + /** Whether an exception was generated during the cycle. */ + bool exceptionGenerated_ = false; + + /** A pointer to the instruction responsible for generating the exception. */ + std::shared_ptr exceptionGeneratingInstruction_; + + /** Whether the core has halted. */ + bool hasHalted_ = false; + + /** The active exception handler. 
*/ + std::shared_ptr exceptionHandler_; + + std::deque> inorderIQ_; + + void checkHalting(); + bool enableHaltCheck = false; + uint64_t maxStallCycleTimeout; + uint64_t maxSimCycleTimeout; + uint64_t maxInstrTimeout; + + /** Set to interruptId when interrupt occurs, otherwise -1 */ + int16_t interruptId_; + + /** Return interrupt id of the pending interrupt*/ + int16_t isInterruptPending(); + +}; + +} // namespace mcu +} // namespace models +} // namespace simeng diff --git a/src/include/simeng/pipeline_hi/DecodeUnit.hh b/src/include/simeng/pipeline_hi/DecodeUnit.hh new file mode 100644 index 0000000000..728dff88f6 --- /dev/null +++ b/src/include/simeng/pipeline_hi/DecodeUnit.hh @@ -0,0 +1,66 @@ +#pragma once + +#include +#include + +#include "simeng/arch/Architecture.hh" +#include "simeng/pipeline_hi/PipelineBuffer.hh" + +namespace simeng { +namespace pipeline_hi { + +/** A decode unit for a pipelined processor. Splits pre-decoded macro-ops into + * uops. */ +class DecodeUnit { + public: + /** Constructs a decode unit with references to input/output buffers and the + * current branch predictor. */ + DecodeUnit(PipelineBuffer& input, + PipelineBuffer>& output, + BranchPredictor& predictor, + std::function&)> canIssue); + + /** Ticks the decode unit. Breaks macro-ops into uops, and performs early + * branch misprediction checks. */ + void tick(); + + /** Check whether the core should be flushed this cycle. */ + bool shouldFlush() const; + + /** Retrieve the target instruction address associated with the most recently + * discovered misprediction. */ + uint64_t getFlushAddress() const; + + /** Retrieve the number of times that the decode unit requested a flush due to + * discovering a branch misprediction early. */ + uint64_t getEarlyFlushes() const; + + /** Clear the microOps_ queue. */ + void purgeFlushed(); + + private: + /** A buffer of macro-ops to split into uops. */ + PipelineBuffer& input_; + /** An internal buffer for storing one or more uops. */ + std::deque> microOps_; + /** A buffer for writing decoded uops into. */ + PipelineBuffer>& output_; + + /** A reference to the current branch predictor. */ + BranchPredictor& predictor_; + + /** Whether the core should be flushed after this cycle. */ + bool shouldFlush_; + + /** The target instruction address the PC should be updated to upon flush. */ + uint64_t pc_; + + /** The number of times that the decode unit requested a flush due to + * discovering a branch misprediction early. 
*/ + uint64_t earlyFlushes_ = 0; + + std::function&)> canIssue_; +}; + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/include/simeng/pipeline_hi/DispatchIssueUnit.hh b/src/include/simeng/pipeline_hi/DispatchIssueUnit.hh new file mode 100644 index 0000000000..132358fd33 --- /dev/null +++ b/src/include/simeng/pipeline_hi/DispatchIssueUnit.hh @@ -0,0 +1,150 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "simeng/Instruction.hh" +#include "simeng/pipeline_hi/PipelineBuffer.hh" +#include "simeng/pipeline_hi/PortAllocator.hh" +#include "yaml-cpp/yaml.h" + +namespace simeng { +namespace pipeline_hi { + +/** A reservation station issue port */ +struct ReservationStationPort { + /** Issue port this port maps to */ + uint16_t issuePort; + /** Queue of instructions that are ready to be + * issued */ + std::deque> ready; +}; + +/** A reservation station */ +struct ReservationStation { + /** Size of reservation station */ + uint16_t capacity; + /** Number of instructions that can be dispatched to this unit per cycle. */ + uint16_t dispatchRate; + /** Current number of non-stalled instructions + * in reservation station */ + uint16_t currentSize; + /** Issue ports belonging to reservation station */ + std::vector ports; +}; + +/** An entry in the reservation station. */ +struct dependencyEntry { + /** The instruction to execute. */ + std::shared_ptr uop; + /** The port to issue to. */ + uint16_t port; + /** The operand waiting on a value. */ + uint16_t operandIndex; +}; + +/** A dispatch/issue unit for an out-of-order pipelined processor. Reads + * instruction operand and performs scoreboarding. Issues instructions to the + * execution unit once ready. */ +class DispatchIssueUnit { + public: + /** Construct a dispatch/issue unit with references to input/output buffers, + * the register file, the port allocator, and a description of the number of + * physical registers the scoreboard needs to reflect. */ + DispatchIssueUnit( + PipelineBuffer>& fromRename, + std::vector>>& issuePorts, + const RegisterFileSet& registerFileSet, PortAllocator& portAllocator, + const std::vector& physicalRegisterStructure, + YAML::Node config); + + /** Ticks the dispatch/issue unit. Reads available input operands for + * instructions and sets scoreboard flags for destination registers. */ + void tick(); + + /** Identify the oldest ready instruction in the reservation station and issue + * it. */ + void issue(); + + /** Forwards operands and performs register reads for the currently queued + * instruction. */ + void forwardOperands(const span& destinations, + const span& values); + + /** Set the scoreboard entry for the provided register as ready. */ + void setRegisterReady(Register reg); + + /** Clear the RS of all flushed instructions. */ + void purgeFlushed(); + + /** Retrieve the number of cycles this unit stalled due to insufficient RS + * space. */ + uint64_t getRSStalls() const; + + /** Retrieve the number of cycles no instructions were issued due to an empty + * RS. */ + uint64_t getFrontendStalls() const; + + /** Retrieve the number of cycles no instructions were issued due to + * dependencies or a lack of available ports. */ + uint64_t getBackendStalls() const; + + /** Retrieve the number of times an instruction was unable to issue due to a + * busy port. 
*/ + uint64_t getPortBusyStalls() const; + + /** Retrieve the current sizes and capacities of the reservation stations*/ + void getRSSizes(std::vector&) const; + + private: + /** A buffer of instructions to dispatch and read operands for. */ + PipelineBuffer>& input_; + + /** Ports to the execution units, for writing ready instructions to. */ + std::vector>>& issuePorts_; + + /** A reference to the physical register file set. */ + const RegisterFileSet& registerFileSet_; + + /** The register availability scoreboard. */ + std::vector> scoreboard_; + + /** Reservation stations */ + std::vector reservationStations_; + + /** A mapping from port to RS port */ + std::vector> portMapping_; + + /** A dependency matrix, containing all the instructions waiting on an + * operand. For a register `{type,tag}`, the vector of dependents may be found + * at `dependencyMatrix[type][tag]`. */ + std::vector>> dependencyMatrix_; + + /** A map to collect flushed instructions for each reservation station. */ + std::unordered_map>> + flushed_; + + /** A reference to the execution port allocator. */ + PortAllocator& portAllocator_; + + /** The number of cycles stalled due to a full reservation station. */ + uint64_t rsStalls_ = 0; + + /** The number of cycles no instructions were issued due to an empty RS. */ + uint64_t frontendStalls_ = 0; + + /** The number of cycles no instructions were issued due to dependencies or a + * lack of available ports. */ + uint64_t backendStalls_ = 0; + + /** The number of times an instruction was unable to issue due to a busy port. + */ + uint64_t portBusyStalls_ = 0; +}; + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/include/simeng/pipeline_hi/ExecuteUnit.hh b/src/include/simeng/pipeline_hi/ExecuteUnit.hh new file mode 100644 index 0000000000..da51db3480 --- /dev/null +++ b/src/include/simeng/pipeline_hi/ExecuteUnit.hh @@ -0,0 +1,147 @@ +#pragma once + +#include +#include + +#include "simeng/BranchPredictor.hh" +#include "simeng/Instruction.hh" +#include "simeng/pipeline_hi/PipelineBuffer.hh" + +namespace simeng { +namespace pipeline_hi { + +/** An execution unit pipeline entry, containing an instruction, and an + * indication of when it's reached the front of the execution pipeline. */ +struct ExecutionUnitPipelineEntry { + /** The instruction queued for execution. */ + std::shared_ptr insn; + /** The tick number this instruction will reach the front of the queue at. */ + uint64_t readyAt; +}; + +/** An execute unit for a pipelined processor. Executes instructions and + * forwards results. */ +class ExecuteUnit { + public: + /** Constructs an execute unit with references to an input and output buffer, + * the currently used branch predictor, and handlers for forwarding operands, + * loads/stores, and exceptions. */ + ExecuteUnit( + PipelineBuffer>& input, + PipelineBuffer>& output, + std::function, span)> forwardOperands, + std::function&)> handleLoad, + std::function&)> handleStore, + std::function&)> raiseException, + std::function&)> addInstrOrderQ, + std::function isInterruptPending, + BranchPredictor& predictor, bool pipelined = true, + const std::vector& blockingGroups = {}); + + /** Tick the execute unit. Places incoming instructions into the pipeline and + * executes an instruction that has reached the head of the pipeline, if + * present. */ + void tick(); + + /** Query whether a branch misprediction was discovered this cycle. 
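The scoreboard and dependency matrix above follow SimEng's usual dispatch pattern; the sketch below shows how a dispatched uop's source operands are typically resolved. The exact body of tick() is not in this header, and the port value is assumed to come from the port allocator:

    const auto& sourceRegisters = uop->getOperandRegisters();
    for (uint16_t i = 0; i < sourceRegisters.size(); i++) {
      const auto& reg = sourceRegisters[i];
      if (uop->isOperandReady(i)) continue;
      if (scoreboard_[reg.type][reg.tag]) {
        // Value already written back: read it straight away.
        uop->supplyOperand(i, registerFileSet_.get(reg));
      } else {
        // Producer still in flight: wait for setRegisterReady()/forwarding.
        dependencyMatrix_[reg.type][reg.tag].push_back({uop, port, i});
      }
    }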
*/ + bool shouldFlush() const; + + /** Retrieve the target instruction address associated with the most recently + * discovered misprediction. */ + uint64_t getFlushAddress() const; + + /** Retrieve the sequence ID associated with the most recently discovered + * misprediction. */ + uint64_t getFlushSeqId() const; + + /** Purge flushed instructions from the internal pipeline and clear any active + * stall, if applicable. */ + void purgeFlushed(); + + /** Retrieve the number of branch instructions that have been executed. */ + uint64_t getBranchExecutedCount() const; + + /** Retrieve the number of branch mispredictions. */ + uint64_t getBranchMispredictedCount() const; + + /** Retrieve the number of active execution cycles. */ + uint64_t getCycles() const; + + private: + /** Execute the supplied uop, write it into the output buffer, and forward + * results back to dispatch/issue. */ + void execute(std::shared_ptr& uop); + + /** A buffer of instructions to execute. */ + PipelineBuffer>& input_; + + /** A buffer for writing executed instructions into. */ + PipelineBuffer>& output_; + + /** A function handle called when forwarding operands. */ + std::function, span)> forwardOperands_; + + /** A function handle called after generating the addresses for a load. */ + std::function&)> handleLoad_; + /** A function handle called after acquiring the data for a store. */ + std::function&)> handleStore_; + + /** A function handle called upon exception generation. */ + std::function&)> raiseException_; + + /** A function to add the executed instruction into an ordering queue. */ + std::function&)> addInstrOrderQ_; + + /** Check if any interrupts are pending */ + std::function isInterruptPending_; + + /** A reference to the branch predictor, for updating with prediction results. + */ + BranchPredictor& predictor_; + + /** Whether this unit is pipelined, or if all instructions should stall until + * complete. */ + bool pipelined_; + + /** The execution unit's internal pipeline, holding instructions until their + * execution latency has expired and they are ready for their final results to + * be calculated and forwarded. */ + std::deque pipeline_; + + /** A group of operation types that are blocked whilst a similar operation + * is being executed. */ + std::vector blockingGroups_; + + /** A queue to hold blocked instructions of a similar group type to + * blockingGroup_. */ + std::deque> operationsStalled_; + + /** Whether the core should be flushed after this cycle. */ + bool shouldFlush_ = false; + + /** The target instruction address the PC should be reset to after this cycle. + */ + uint64_t pc_; + + /** The sequence ID of the youngest instruction that should remain after the + * current flush. */ + uint64_t flushAfter_; + + /** The number of times this unit has been ticked. */ + uint64_t tickCounter_ = 0; + + /** The cycle this unit will become unstalled. */ + uint64_t stallUntil_ = 0; + + /** The number of branch instructions that were executed. */ + uint64_t branchesExecuted_ = 0; + + /** The number of branch mispredictions that were observed. */ + uint64_t branchMispredicts_ = 0; + + /** The number of active execution cycles that were observed. 
*/ + uint64_t cycles_ = 0; +}; + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/include/simeng/pipeline_hi/FetchUnit.hh b/src/include/simeng/pipeline_hi/FetchUnit.hh new file mode 100644 index 0000000000..1c8f40c212 --- /dev/null +++ b/src/include/simeng/pipeline_hi/FetchUnit.hh @@ -0,0 +1,127 @@ +#pragma once + +#include + +#include "simeng/MemoryInterface.hh" +#include "simeng/arch/Architecture.hh" +#include "simeng/pipeline_hi/PipelineBuffer.hh" + +namespace simeng { +namespace pipeline_hi { + +/** The various states of the loop buffer. */ +enum class LoopBufferState { + IDLE = 0, // No operations + WAITING, // Waiting to find boundary instruction in fetch stream + FILLING, // Filling loop buffer with loop body + SUPPLYING // Feeding loop buffer content to output buffer +}; + +// Struct to hold information about a fetched instruction +struct loopBufferEntry { + // Encoding of the instruction + const uint64_t encoding; + + // Size of the instruction + const uint16_t instructionSize; + + // PC of the instruction + const uint64_t address; + + // Branch prediction made for instruction + const BranchPrediction prediction; +}; + +/** A fetch and pre-decode unit for a pipelined processor. Responsible for + * reading instruction memory and maintaining the program counter. */ +class FetchUnit { + public: + /** Construct a fetch unit with a reference to an output buffer, the ISA, and + * the current branch predictor, and information on the instruction memory. */ + FetchUnit(PipelineBuffer& output, MemoryInterface& instructionMemory, + uint64_t programByteLength, uint64_t entryPoint, uint8_t blockSize, + const arch::Architecture& isa, BranchPredictor& branchPredictor); + + ~FetchUnit(); + + /** Tick the fetch unit. Retrieves and pre-decodes the instruction at the + * current program counter. */ + void tick(); + + /** Function handle to retrieve branch that represents loop boundary. */ + void registerLoopBoundary(uint64_t branchAddress); + + /** Check whether the program has ended. Returns `true` if the current PC is + * outside of instruction memory. */ + bool hasHalted() const; + + /** Update the program counter to the specified address. */ + void updatePC(uint64_t address); + + /** Request instructions at the current program counter for a future cycle. */ + void requestFromPC(); + + /** Retrieve the number of cycles fetch terminated early due to a predicted + * branch. */ + uint64_t getBranchStalls() const; + + /** Clear the loop buffer. */ + void flushLoopBuffer(); + + /** */ + void flushPredictor(uint64_t address); + + private: + /** An output buffer connecting this unit to the decode unit. */ + PipelineBuffer& output_; + + /** The current program counter. */ + uint64_t pc_ = 0; + + /** An interface to the instruction memory. */ + MemoryInterface& instructionMemory_; + + /** The length of the available instruction memory. */ + uint64_t programByteLength_; + + /** Reference to the currently used ISA. */ + const arch::Architecture& isa_; + + /** Reference to the current branch predictor. */ + BranchPredictor& branchPredictor_; + + /** A loop buffer to supply a detected loop instruction stream. */ + std::deque loopBuffer_; + + /** State of the loop buffer. */ + LoopBufferState loopBufferState_ = LoopBufferState::IDLE; + + /** The branch instruction that forms the loop. */ + uint64_t loopBoundaryAddress_ = 0; + + /** The current program halt state. 
Set to `true` when the PC leaves the + * instruction memory region, and set back to `false` if the PC is returned to + * the instruction region. */ + bool hasHalted_ = false; + + bool waitSCEval_ = false; + + /** The number of cycles fetch terminated early due to a predicted branch. */ + uint64_t branchStalls_ = 0; + + /** The size of a fetch block, in bytes. */ + uint8_t blockSize_; + + /** A mask of the bits of the program counter to use for obtaining the block + * address to fetch. */ + uint64_t blockMask_; + + /** The buffer used to hold fetched instruction data. */ + uint8_t* fetchBuffer_; + + /** The amount of data currently in the fetch buffer. */ + uint8_t bufferedBytes_ = 0; +}; + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/include/simeng/pipeline_hi/LoadStoreQueue.hh b/src/include/simeng/pipeline_hi/LoadStoreQueue.hh new file mode 100644 index 0000000000..211b1ef72d --- /dev/null +++ b/src/include/simeng/pipeline_hi/LoadStoreQueue.hh @@ -0,0 +1,235 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "simeng/Instruction.hh" +#include "simeng/MemoryInterface.hh" +#include "simeng/pipeline_hi/PipelineBuffer.hh" + +namespace simeng { +namespace pipeline_hi { + +/** The memory access types which are processed. */ +enum accessType { LOAD = 0, STORE }; + +/** A requestQueue_ entry. */ +struct requestEntry { + /** The memory address(es) to be accessed. */ + std::queue reqAddresses; + /** The instruction sending the request(s). */ + std::shared_ptr insn; +}; +/** A requestQueue_ entry. */ +struct requestEntry1 { + /** The memory address(es) to be accessed. */ + std::queue reqAddresses; + /** The memory address(es) to be accessed. */ + std::queue data; + /** The instruction sending the request(s). */ + std::shared_ptr insn; + accessType type; + uint64_t reqtick; + bool isMisAligned; +}; +/** A load store queue (known as "load/store buffers" or "memory order buffer"). + * Holds in-flight memory access requests to ensure load/store consistency. */ +class LoadStoreQueue { + public: + /** Constructs a combined load/store queue model, simulating a shared queue + * for both load and store instructions, supplying completion slots for loads + * and an operand forwarding handler. */ + LoadStoreQueue( + unsigned int maxCombinedSpace, MemoryInterface& memory, + span>> completionSlots, + std::function, span)> forwardOperands, + bool exclusive = false, uint16_t loadBandwidth = UINT16_MAX, + uint16_t storeBandwidth = UINT16_MAX, + uint16_t permittedRequests = UINT16_MAX, + uint16_t permittedLoads = UINT16_MAX, + uint16_t permittedStores = UINT16_MAX); + + /** Constructs a split load/store queue model, simulating discrete queues for + * load and store instructions, supplying completion slots for loads and an + * operand forwarding handler. */ + LoadStoreQueue( + unsigned int maxLoadQueueSpace, unsigned int maxStoreQueueSpace, + MemoryInterface& memory, + span>> completionSlots, + std::function, span)> forwardOperands, + bool exclusive = false, uint16_t loadBandwidth = UINT16_MAX, + uint16_t storeBandwidth = UINT16_MAX, + uint16_t permittedRequests = UINT16_MAX, + uint16_t permittedLoads = UINT16_MAX, + uint16_t permittedStores = UINT16_MAX); + + /** Retrieve the available space for load uops. For combined queue this is the + * total remaining space. */ + unsigned int getLoadQueueSpace() const; + + /** Retrieve the available space for store uops. For a combined queue this is + * the total remaining space. 
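blockMask_ is derived from the fetch block size so that requestFromPC() can ask instruction memory for whole, aligned blocks. A short sketch of that relationship; blockSize_ is assumed to be a power of two, as in the existing fetch unit:

    // In the constructor:
    blockMask_ = ~static_cast<uint64_t>(blockSize_ - 1);

    // In requestFromPC(): fetch the aligned block containing the current PC.
    uint64_t blockAddress = pc_ & blockMask_;
    instructionMemory_.requestRead({blockAddress, blockSize_});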
*/ + unsigned int getStoreQueueSpace() const; + + /** Retrieve the available space for any memory uops. For a split queue this + * is the sum of the space in both queues. */ + unsigned int getTotalSpace() const; + + /** Add a load uop to the queue. */ + void addLoad(const std::shared_ptr& insn); + + /** Add a store uop to the queue. */ + void addStore(const std::shared_ptr& insn); + + /** Add the load instruction's memory requests to the requestQueue_. */ + void startLoad(const std::shared_ptr& insn); + + /** Supply the data to be stored by a store operation. */ + void supplyStoreData(const std::shared_ptr& insn); + + /** Commit and write the oldest store instruction to memory, removing it from + * the store queue. Returns `true` if memory disambiguation has discovered a + * memory order violation during the commit. */ + bool commitStore(const std::shared_ptr& uop); + + /** Remove the oldest load instruction from the load queue. */ + void commitLoad(const std::shared_ptr& uop); + + /** Remove all flushed instructions from the queues. */ + void purgeFlushed(); + + /** Whether this is a combined load/store queue. */ + bool isCombined() const; + + /** Process received load data and send any completed loads for writeback. */ + void tick(); + + /** Retrieve the load instruction associated with the most recently discovered + * memory order violation. */ + std::shared_ptr getViolatingLoad() const; + + void processResponse(); + + bool activeMisAlignedOpr() const; + + bool isBusy() const; + + float getAvgLdLat() const { return (totalLdLatency)/numLoads; }; + + uint32_t getMaxLdLat() const { return maxLdLatency; }; + uint32_t getMinLdLat() const { return minLdLatency; }; + + private: + /** The load queue: holds in-flight load instructions. */ + std::deque> loadQueue_; + + /** The store queue: holds in-flight store instructions with its associated + * data. */ + std::deque, + span>> + storeQueue_; + + /** Slots to write completed load instructions into for writeback. */ + span>> completionSlots_; + + /** Map of loads that have requested their data, keyed by sequence ID. */ + std::unordered_map> requestedLoads_; + + /** Map of loads that have requested their data, keyed by sequence ID. */ + std::unordered_map latencyLoads_; + + /** A function handler to call to forward the results of a completed load. */ + std::function, span)> forwardOperands_; + + /** The maximum number of loads that can be in-flight. Undefined if this + * is a combined queue. */ + unsigned int maxLoadQueueSpace_; + + /** The maximum number of stores that can be in-flight. Undefined if this is a + * combined queue. */ + unsigned int maxStoreQueueSpace_; + + /** The maximum number of memory ops that can be in-flight. Undefined if this + * is a split queue. */ + unsigned int maxCombinedSpace_; + + /** Whether this queue is combined or split. */ + bool combined_; + + /** Retrieve the load queue space for a split queue. */ + unsigned int getLoadQueueSplitSpace() const; + + /** Retrieve the store queue space for a split queue. */ + unsigned int getStoreQueueSplitSpace() const; + + /** Retrieve the total memory uop space available for a combined queue. */ + unsigned int getCombinedSpace() const; + + /** A pointer to process memory. */ + MemoryInterface& memory_; + + /** The load instruction associated with the most recently discovered memory + * order violation. */ + std::shared_ptr violatingLoad_ = nullptr; + + /** The number of times this unit has been ticked. 
*/ + uint64_t tickCounter_ = 0; + + /** A map to hold load instructions that are stalled due to a detected + * memory reordering confliction. First key is a store's sequence id and the + * second key the conflicting address. The value takes the form of a vector of + * pairs containing a pointer to the conflicted load and the size of the data + * needed at that address by the load. */ + std::unordered_map< + uint64_t, + std::unordered_map< + uint64_t, + std::vector, uint16_t>>>> + conflictionMap_; + + /** A map between LSQ cycles and load requests ready on that cycle. */ + std::map> requestLoadQueue_; + + /** A map between LSQ cycles and store requests ready on that cycle. */ + std::map> requestStoreQueue_; + + /** A queue of completed loads ready for writeback. */ + std::queue> completedLoads_; + + /** Whether the LSQ can only process loads xor stores within a cycle. */ + bool exclusive_; + + /** The amount of data readable from the L1D cache per cycle. */ + uint16_t loadBandwidth_; + + /** The amount of data writable to the L1D cache per cycle. */ + uint16_t storeBandwidth_; + + /** The combined limit of loads and store requests permitted per cycle. */ + uint16_t totalLimit_; + + /** The number of loads and stores permitted per cycle. */ + std::array reqLimits_; + + /** A map between LSQ cycles and load or store requests ready on that cycle. */ + std::deque requestQueue_; + + /* Identifier for request to memory*/ + uint8_t busReqId = 0; + + //bool activeMisAlignedStore = false; + + //Stats + uint64_t numLoads = 0; + double totalLdLatency = 0; + uint32_t maxLdLatency = 0; + uint32_t minLdLatency = 0xFFFF; + float averageAccessLdLatency = 0.0; +}; + + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/include/simeng/pipeline_hi/PipelineBuffer.hh b/src/include/simeng/pipeline_hi/PipelineBuffer.hh new file mode 100644 index 0000000000..dd2ed70ce7 --- /dev/null +++ b/src/include/simeng/pipeline_hi/PipelineBuffer.hh @@ -0,0 +1,107 @@ +#pragma once + +#include +#include +#include + +namespace simeng { +namespace pipeline_hi { + +/** A tickable pipelined buffer. Values are shifted from the tail slot to the + * head slot each time `tick()` is called. */ +template +class PipelineBuffer { + public: + /** Construct a pipeline buffer of width `width`, and fill all slots with + * `initialValue`. */ + PipelineBuffer(int width, const T& initialValue) + : width(width), buffer(width * defaultLength_, initialValue), + length_(defaultLength_), headIndex_(defaultLength_-1), + tailIndex_(0) {} + + PipelineBuffer(int width, const T& initialValue, int length) + : width(width), buffer(width * length, initialValue), length_(length), + headIndex_(length_-1), tailIndex_(0) { + assert(length_ != 0 && "Pipeline buffer length cannot be 0"); + } + + /** Tick the buffer and move head/tail pointers, or do nothing if it's + * stalled. */ + void tick() { + if (isStalled_) return; + + //length ==1 shortcut? condition check cost + + if (headIndex_) { // when headIndex != 0 + headIndex_--; + } else { + headIndex_ = length_ - 1; + } + if (tailIndex_) { // when tailIndex != 0 + tailIndex_--; + } else { + tailIndex_ = length_ - 1; + } + } + + /** Get a tail slots pointer. */ + T* getTailSlots() { + T* ptr = buffer.data(); + return &ptr[tailIndex_ * width]; + } + + /** Get a const tail slots pointer. */ + const T* getTailSlots() const { + const T* ptr = buffer.data(); + return &ptr[tailIndex_ * width]; + } + + /** Get a head slots pointer. 
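This variable-length buffer makes each extra slot cost one cycle: a value written at the tail reaches the head after length - 1 tick() calls. A small usage sketch based on the constructor and tick() shown above:

    // Width 1, initial value 0, length 3: a two-cycle stage-to-stage delay.
    simeng::pipeline_hi::PipelineBuffer<int> buf(1, 0, 3);
    buf.getTailSlots()[0] = 42;
    buf.tick();                        // value is one slot away from the head
    buf.tick();                        // value reaches the head slot
    int out = buf.getHeadSlots()[0];   // out == 42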
*/ + T* getHeadSlots() { + T* ptr = buffer.data(); + return &ptr[headIndex_ * width]; + } + + /** Get a const head slots pointer. */ + const T* getHeadSlots() const { + const T* ptr = buffer.data(); + return &ptr[headIndex_ * width]; + } + + /** Check if the buffer is stalled. */ + bool isStalled() const { return isStalled_; } + + /** Set the buffer's stall flag to `stalled`. */ + void stall(bool stalled) { isStalled_ = stalled; } + + /** Fill the buffer with a specified value. */ + void fill(const T& value) { std::fill(buffer.begin(), buffer.end(), value); } + + /** Get the width of the buffer slots. */ + unsigned short getWidth() const { return width; } + + private: + /** The width of each row of slots. */ + unsigned short width; + + /** The buffer. */ + std::vector buffer; + + /** Whether the buffer is stalled or not. */ + bool isStalled_ = false; + + /** Buffer length */ + const unsigned int length_; + + /** */ + unsigned int headIndex_; + + /** */ + unsigned int tailIndex_; + + /** The number of stages in the pipeline. */ + static const unsigned int defaultLength_ = 2; +}; + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/include/simeng/pipeline_hi/PipelineBuffer1.hh b/src/include/simeng/pipeline_hi/PipelineBuffer1.hh new file mode 100644 index 0000000000..dfb465a33c --- /dev/null +++ b/src/include/simeng/pipeline_hi/PipelineBuffer1.hh @@ -0,0 +1,133 @@ +#pragma once + +#include +#include +#include + +namespace simeng { +namespace pipeline_hi { + +// TODO: Extend to allow specifying the number of cycles it will take for +// information to move from tail to head (currently fixed at 1 by +// implementation) + +/** A tickable pipelined buffer. Values are shifted from the tail slot to the + * head slot each time `tick()` is called. */ +template +class PipelineBuffer { + public: + /** Construct a pipeline buffer of width `width`, and fill all slots with + * `initialValue`. */ + PipelineBuffer(int width, const T& initialValue) + : width(width), buffer(width * defaultLength_, initialValue), + length_(defaultLength_) {} + + //TODO:currently length > 2 is not working, oscillate between 0 and 1 + PipelineBuffer(int width, const T& initialValue, int length) + : width(width), buffer(width * length, initialValue), length_(length), + useDefaultLength_(false) { + assert(length_ != 0 && "Pipeline buffer length cannot be 0"); + } + + /** Tick the buffer and move head/tail pointers, or do nothing if it's + * stalled. */ + void tick() { + if (useDefaultLength_) { + if (isStalled_) return; + + headIsStart = !headIsStart; + } else { + if (length_ == 1) { + return; + } else if (length_ > 2) { + //TODO + } + } + } + + /** Get a tail slots pointer. */ + T* getTailSlots() { + T* ptr = buffer.data(); + if (useDefaultLength_) { + return &ptr[headIsStart * width]; + } else { + if (length_ == 1) { + return &ptr[0]; + } + } + } + + /** Get a const tail slots pointer. */ + const T* getTailSlots() const { + const T* ptr = buffer.data(); + if (useDefaultLength_) { + return &ptr[headIsStart * width]; + } else { + if (length_ == 1) { + return &ptr[0]; + } + } + } + + + /** Get a head slots pointer. */ + T* getHeadSlots() { + T* ptr = buffer.data(); + if (useDefaultLength_) { + return &ptr[!headIsStart * width]; + } else { + if (length_ == 1) { + return &ptr[0]; + } + } + } + + /** Get a const head slots pointer. 
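In this PipelineBuffer1.hh variant a length of 1 collapses the buffer: head and tail alias the same slot and tick() does nothing, which is the zero-delay case useful for merging adjacent stages. A small usage sketch:

    // Width 1, initial value 0, length 1: producer and consumer see the same
    // slot in the same cycle (zero delay, effectively merged stages).
    PipelineBuffer<int> buf(1, 0, 1);
    buf.getTailSlots()[0] = 7;
    buf.tick();                        // no-op when length == 1
    int out = buf.getHeadSlots()[0];   // out == 7, same cycle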
*/ + const T* getHeadSlots() const { + const T* ptr = buffer.data(); + if (useDefaultLength_) { + return &ptr[!headIsStart * width]; + } else { + if (length_ == 1) { + return &ptr[0]; + } + } + } + + /** Check if the buffer is stalled. */ + bool isStalled() const { return isStalled_; } + + /** Set the buffer's stall flag to `stalled`. */ + void stall(bool stalled) { isStalled_ = stalled; } + + /** Fill the buffer with a specified value. */ + void fill(const T& value) { std::fill(buffer.begin(), buffer.end(), value); } + + /** Get the width of the buffer slots. */ + unsigned short getWidth() const { return width; } + + private: + /** The width of each row of slots. */ + unsigned short width; + + /** The buffer. */ + std::vector buffer; + + /** The offset of the head pointer; either 0 or 1. */ + bool headIsStart = 0; + + /** Whether the buffer is stalled or not. */ + bool isStalled_ = false; + + /** Buffer length */ + const unsigned int length_; + + /** True if using default length (== 2) */ + bool useDefaultLength_ = true; + + /** The number of stages in the pipeline. */ + static const unsigned int defaultLength_ = 2; +}; + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/include/simeng/pipeline_hi/PortAllocator.hh b/src/include/simeng/pipeline_hi/PortAllocator.hh new file mode 100644 index 0000000000..bc985c0aaa --- /dev/null +++ b/src/include/simeng/pipeline_hi/PortAllocator.hh @@ -0,0 +1,43 @@ +#pragma once + +#include +#include + +namespace simeng { +namespace pipeline_hi { + +namespace PortType { +/** Instructions have to match the exact group(s) in set. */ +const uint8_t COMPULSORY = 0; +/** Instructions can optional match group(s) in set. */ +const uint8_t OPTIONAL = 1; +} // namespace PortType + +/** An abstract execution port allocator interface. */ +class PortAllocator { + public: + virtual ~PortAllocator(){}; + + /** Allocate a port for the specified instruction group; returns the allocated + * port. */ + virtual uint16_t allocate(const std::vector& ports) = 0; + + /** Inform the allocator that an instruction was issued to the specified port. + */ + virtual void issued(uint16_t port) = 0; + + /** Inform the allocator that an instruction will not issue to its + * allocated port. */ + virtual void deallocate(uint16_t port) = 0; + + /** Set function from DispatchIssueUnit to retrieve reservation + * station sizes during execution. */ + virtual void setRSSizeGetter( + std::function&)> rsSizes) = 0; + + /** Tick the port allocator to allow it to process internal tasks. */ + virtual void tick() = 0; +}; + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/include/simeng/pipeline_hi/RegDepMap.hh b/src/include/simeng/pipeline_hi/RegDepMap.hh new file mode 100644 index 0000000000..7145fd1903 --- /dev/null +++ b/src/include/simeng/pipeline_hi/RegDepMap.hh @@ -0,0 +1,57 @@ +#pragma once + +#include +#include +#include +#include + +#include "simeng/Instruction.hh" + +namespace simeng { +namespace pipeline_hi { + +typedef std::shared_ptr InstrPtr; +class RegDepMap +{ + public: + RegDepMap(const std::vector registerFileStructures, + const RegisterFileSet& registerFileSet); + ~RegDepMap(); + + /** Clear the Entire Map */ + void clear(); + + /** Insert all of a instruction's destination registers into map*/ + void insert(InstrPtr instr); + + /** Remove all of a instruction's destination registers into map*/ + void remove(InstrPtr instr); + + /** Is the current instruction able to read from this + * destination register? 
+ */ + bool canRead(InstrPtr instr); + + /** Is the current instruction able to write to this + * destination register? + */ + bool canWrite(InstrPtr instr); + + /* Is there any instr that can forward the data for this instr. If yes, set + * the data*/ + bool canForward(InstrPtr instr); + + void purgeFlushed(); + + void dump(); + + private: + const std::vector registerFileStructures_; + const RegisterFileSet& registerFileSet_; + typedef std::vector > DepMap; + std::vector regMap_; + uint32_t outstandingDep_ = 0; +}; + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/include/simeng/pipeline_hi/RegisterAliasTable.hh b/src/include/simeng/pipeline_hi/RegisterAliasTable.hh new file mode 100644 index 0000000000..1b2327fc52 --- /dev/null +++ b/src/include/simeng/pipeline_hi/RegisterAliasTable.hh @@ -0,0 +1,69 @@ +#pragma once + +#include + +#include "simeng/RegisterFileSet.hh" + +namespace simeng { +namespace pipeline_hi { + +/** A Register Alias Table (RAT) implementation. Contains information on + * the current register renaming state. */ +class RegisterAliasTable { + public: + /** Construct a RAT, supplying a description of the architectural register + * structure, and the corresponding numbers of physical registers that should + * be available. */ + RegisterAliasTable(std::vector architecturalStructure, + std::vector physicalStructure); + + /** Retrieve the current physical register assigned to the provided + * architectural register. */ + Register getMapping(Register architectural) const; + + /** Determine whether it's possible to allocate `quantity` physical registers + * of type `type` this cycle. */ + bool canAllocate(uint8_t type, unsigned int quantity) const; + + /** Check whether registers of type `type` can be renamed by this RAT. */ + bool canRename(uint8_t type) const; + + /** Allocate a physical register for the provided architectural register. */ + Register allocate(Register architectural); + + /** Get the number of free registers available for allocation this cycle. */ + unsigned int freeRegistersAvailable(uint8_t type) const; + + /** Commit the provided physical register. This register now holds the + * committed state of the corresponding architectural register, and previous + * physical register is freed. */ + void commit(Register physical); + + /** Rewind the allocation of a physical register. The former physical register + * is reinstated to the mapping table, and the provided register is freed. */ + void rewind(Register physical); + + /** Free the provided physical register. */ + void free(Register physical); + + private: + /** The register mapping tables. Holds a map of architectural -> physical + * register mappings for each register type. */ + std::vector> mappingTable_; + + /** The register history tables. Each table holds an entry for each physical + * register, recording the physical register formerly assigned to its + * architectural register; one table is available per register type. */ + std::vector> historyTable_; + + /** The register destination tables. Holds a map of physical -> architectural + * register mappings for each register type. Used for rewind behaviour. */ + std::vector> destinationTable_; + + /** The free register queues. Holds a list of unallocated physical registers + * for each register type. 
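The RAT interface above follows the usual allocate/commit/rewind lifecycle; a short usage sketch, with register indices chosen arbitrarily for illustration:

    Register arch = {0, 5};                 // an architectural register, type 0
    if (rat.canAllocate(arch.type, 1)) {
      Register phys = rat.allocate(arch);   // new speculative mapping
      // ... the instruction writes phys, then either retires or is flushed ...
      rat.commit(phys);                     // retire: frees the old mapping
      // rat.rewind(phys);                  // flush: restores the old mapping
    }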
*/ + std::vector> freeQueues_; +}; + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/include/simeng/pipeline_hi/ReorderBuffer.hh b/src/include/simeng/pipeline_hi/ReorderBuffer.hh new file mode 100644 index 0000000000..1e5fd8408c --- /dev/null +++ b/src/include/simeng/pipeline_hi/ReorderBuffer.hh @@ -0,0 +1,136 @@ +#pragma once + +#include +#include + +#include "simeng/Instruction.hh" +#include "simeng/pipeline_hi/LoadStoreQueue.hh" +#include "simeng/pipeline_hi/RegisterAliasTable.hh" + +namespace simeng { +namespace pipeline_hi { + +/** A branch prediction outcome with an associated instruction address. */ +struct latestBranch { + /** Branch instruction address. */ + uint64_t address; + + /** Outcome of the branch. */ + BranchPrediction outcome; + + /** The related instructionsCommitted_ value that this instruction was + * committed on. */ + uint64_t commitNumber; +}; + +/** A Reorder Buffer (ROB) implementation. Contains an in-order queue of + * in-flight instructions. */ +class ReorderBuffer { + public: + /** Constructs a reorder buffer of maximum size `maxSize`, supplying a + * reference to the register alias table. */ + ReorderBuffer( + unsigned int maxSize, RegisterAliasTable& rat, LoadStoreQueue& lsq, + std::function&)> raiseException, + std::function sendLoopBoundary, + BranchPredictor& predictor, uint16_t loopBufSize, + uint16_t loopDetectionThreshold); + + /** Add the provided instruction to the ROB. */ + void reserve(const std::shared_ptr& insn); + + void commitMicroOps(uint64_t insnId); + + /** Commit and remove up to `maxCommitSize` instructions. */ + unsigned int commit(unsigned int maxCommitSize); + + /** Flush all instructions with a sequence ID greater than `afterSeqId`. */ + void flush(uint64_t afterSeqId); + + /** Retrieve the current size of the ROB. */ + unsigned int size() const; + + /** Retrieve the current amount of free space in the ROB. */ + unsigned int getFreeSpace() const; + + /** Query whether a memory order violation was discovered in the most recent + * cycle. */ + bool shouldFlush() const; + + /** Retrieve the instruction address associated with the most recently + * discovered memory order violation. */ + uint64_t getFlushAddress() const; + + /** Retrieve the sequence ID associated with the most recently discovered + * memory order violation. */ + uint64_t getFlushSeqId() const; + + /** Get the number of instructions the ROB has committed. */ + uint64_t getInstructionsCommittedCount() const; + + /** Get the number of speculated loads which violated load-store ordering. */ + uint64_t getViolatingLoadsCount() const; + + private: + /** A reference to the register alias table. */ + RegisterAliasTable& rat_; + + /** A reference to the load/store queue. */ + LoadStoreQueue& lsq_; + + /** The maximum size of the ROB. */ + unsigned int maxSize_; + + /** A function to call upon exception generation. */ + std::function)> raiseException_; + + /** A function to send an instruction at a detected loop boundary. */ + std::function sendLoopBoundary_; + + /** Whether or not a loop has been detected. */ + bool loopDetected_ = false; + + /** A reference to the current branch predictor. */ + BranchPredictor& predictor_; + + /** The buffer containing in-flight instructions. */ + std::deque> buffer_; + + /** Whether the core should be flushed after the most recent commit. */ + bool shouldFlush_ = false; + + /** The target instruction address the PC should be reset to after the most + * recent commit. 
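
The latestBranch record above, together with the loop-detection threshold passed to the constructor, lets the ROB notice when the same branch keeps retiring back-to-back and signal a loop boundary for the loop buffer. A rough sketch of that counting rule only; the exact conditions SimEng checks (targets, interruptions by other branches) are not reproduced here.

#include <cstdint>

struct LoopDetector {
  uint64_t lastAddress = 0;
  uint64_t streak = 0;
  uint16_t threshold;  // retirements of the same branch needed to call it a loop

  explicit LoopDetector(uint16_t t) : threshold(t) {}

  // Called as each taken branch retires; true once it looks like a loop.
  bool onRetiredBranch(uint64_t address) {
    if (address == lastAddress) {
      ++streak;
    } else {
      lastAddress = address;
      streak = 1;
    }
    return streak >= threshold;
  }
};
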
+ */ + uint64_t pc_; + + /** The sequence ID of the youngest instruction that should remain after the + * current flush. */ + uint64_t flushAfter_; + + /** Latest retired branch outcome with a counter. */ + std::pair branchCounter_ = {{0, {false, 0}, 0}, 0}; + + /** Loop buffer size. */ + uint16_t loopBufSize_; + + /** Amount of times a branch must be seen without interruption for it to be + * considered a loop. */ + uint16_t loopDetectionThreshold_; + + /** The next available sequence ID. */ + uint64_t seqId_ = 0; + + /** The next available instruction ID. Used to identify in-order groups of + * micro-operations. */ + uint64_t insnId_ = 0; + + /** The number of instructions committed. */ + uint64_t instructionsCommitted_ = 0; + + /** The number of speculatived loads which violated load-store ordering. */ + uint64_t loadViolations_ = 0; +}; + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/include/simeng/pipeline_hi/StaticPredictor.hh b/src/include/simeng/pipeline_hi/StaticPredictor.hh new file mode 100644 index 0000000000..d8923dc23c --- /dev/null +++ b/src/include/simeng/pipeline_hi/StaticPredictor.hh @@ -0,0 +1,53 @@ +#pragma once + +#include + +#include "simeng/BranchPredictor.hh" +#include "yaml-cpp/yaml.h" + +namespace simeng { +namespace pipeline_hi { + +/** A static branch predictor; configurable in YAML config + */ +class StaticPredictor : public BranchPredictor { + public: + StaticPredictor(uint8_t sType); //TODO: temp constructor, get rid of yaml, delete it later + StaticPredictor(YAML::Node config); + ~StaticPredictor(); + + BranchPrediction predict(uint64_t address, BranchType type, + uint64_t knownTarget, uint8_t byteLength) override; + + /** Generate a branch prediction for the specified instruction address; will + * behave based on the configuration */ + BranchPrediction predict(uint64_t address, BranchType type, + uint64_t knownTarget) override; + + /** Provide branch results to update the prediction model for the specified + * instruction address. As this model is static, this does nothing. */ + void update(uint64_t address, bool taken, uint64_t targetAddress, + BranchType type) override; + + /** Provide flush logic for branch prediction scheme. The behaviour will + * be based on the configuration */ + void flush(uint64_t address) override; + + private: + /** Decide which static predictor will be in use */ + uint8_t staticType_; + + /** A return address stack. */ + std::deque ras_; + + /** RAS history with instruction address as the keys. A non-zero value + * represents the target prediction for a return instruction and a 0 entry for + * a branch-and-link instruction. */ + std::map rasHistory_; + + /** The size of the RAS. */ + uint64_t rasSize_ = 1000; +}; + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/include/simeng/pipeline_hi/WritebackUnit.hh b/src/include/simeng/pipeline_hi/WritebackUnit.hh new file mode 100644 index 0000000000..0816d3b5dc --- /dev/null +++ b/src/include/simeng/pipeline_hi/WritebackUnit.hh @@ -0,0 +1,62 @@ +#pragma once + +#include + +#include "simeng/Instruction.hh" +#include "simeng/pipeline_hi/PipelineBuffer.hh" +#include + +namespace simeng { +namespace pipeline_hi { + +/** A writeback pipeline unit. Responsible for writing instruction results to + * the register files. */ +class WritebackUnit { + public: + /** Constructs a writeback unit with references to an input buffer and + * register file to write to. 
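
StaticPredictor above picks a fixed policy per configuration rather than learning from outcomes. A minimal sketch of two common static policies, always-taken and backward-taken/forward-not-taken; the mapping of staticType_ values to these policies is an assumption, not taken from the source.

#include <cstdint>

struct StaticOutcome { bool taken; uint64_t target; };

// offset is the signed branch displacement; byteLength the instruction size.
StaticOutcome staticPredict(uint64_t address, int64_t offset,
                            uint8_t byteLength, bool alwaysTaken) {
  // Backward branches (negative offset) usually close loops, so the
  // backward-taken heuristic predicts them taken; forward ones not taken.
  bool taken = alwaysTaken || offset < 0;
  uint64_t fallThrough = address + byteLength;
  return {taken, taken ? address + offset : fallThrough};
}
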
*/ + WritebackUnit(std::vector>>& + completionSlots, + RegisterFileSet& registerFileSet, + std::function flagMicroOpCommits, + std::function&)> removeDep, + std::function&)> removeInstrOrderQ); + + /** Tick the writeback unit to perform its operation for this cycle. */ + void tick(); + + /** Retrieve a count of the number of instructions retired. */ + uint64_t getInstructionsWrittenCount() const; + + /** Retrieve instruction(s) to be printed out to the trace */ + std::vector> getInstsForTrace(); + + /** Clear the container for tracing */ + void traceFinished(); //Might be safer to update trace within WritebackUnit + + private: + /** Buffers of completed instructions to process. */ + std::vector>>& completionSlots_; + + /** The register file set to write results into. */ + RegisterFileSet& registerFileSet_; + + /** A function handle called to determine if uops associated to an instruction + * ID can now be committed. */ + std::function flagMicroOpCommits_; + + /** A function to remove the commited instruction from dependency queue. */ + std::function&)> removeDep_; + + /** A function to remove the commited instruction from ordering queue. */ + std::function&)> removeInstrOrderQ_; + + /** The number of instructions processed and retired by this stage. */ + uint64_t instructionsWritten_ = 0; + + /** Instruction(s) to be printed out to the trace */ + std::deque> committedInstsForTrace_; +}; + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt index 1fbf286509..97de63eb46 100644 --- a/src/lib/CMakeLists.txt +++ b/src/lib/CMakeLists.txt @@ -9,6 +9,7 @@ set(SIMENG_SOURCES arch/aarch64/MicroDecoder.cc arch/riscv/Architecture.cc arch/riscv/ExceptionHandler.cc + arch/riscv/SystemRegister.cc arch/riscv/Instruction.cc arch/riscv/Instruction_address.cc arch/riscv/Instruction_decode.cc @@ -18,6 +19,7 @@ set(SIMENG_SOURCES kernel/LinuxProcess.cc models/emulation/Core.cc models/inorder/Core.cc + models/mcu/Core.cc models/outoforder/Core.cc pipeline/A64FXPortAllocator.cc pipeline/BalancedPortAllocator.cc @@ -32,6 +34,16 @@ set(SIMENG_SOURCES pipeline/RenameUnit.cc pipeline/ReorderBuffer.cc pipeline/WritebackUnit.cc + pipeline_hi/DecodeUnit.cc + pipeline_hi/DispatchIssueUnit.cc + pipeline_hi/ExecuteUnit.cc + pipeline_hi/FetchUnit.cc + pipeline_hi/LoadStoreQueue.cc + pipeline_hi/RegDepMap.cc + pipeline_hi/RegisterAliasTable.cc + pipeline_hi/ReorderBuffer.cc + pipeline_hi/StaticPredictor.cc + pipeline_hi/WritebackUnit.cc AlwaysNotTakenPredictor.cc ArchitecturalRegisterFileSet.cc CMakeLists.txt diff --git a/src/lib/CoreInstance.cc b/src/lib/CoreInstance.cc index 8ba06c8e08..e8f91d3450 100644 --- a/src/lib/CoreInstance.cc +++ b/src/lib/CoreInstance.cc @@ -90,6 +90,10 @@ void CoreInstance::setSimulationMode() { "outoforder") { mode_ = SimulationMode::OutOfOrder; modeString_ = "Out-of-Order"; + } else if (config_["Core"]["Simulation-Mode"].as() == + "mcu") { + mode_ = SimulationMode::MCU; + modeString_ = "MCU"; } return; @@ -236,7 +240,7 @@ void CoreInstance::createCore() { if (config_["Core"]["ISA"].as() == "rv64" || config_["Core"]["ISA"].as() == "rv32") { arch_ = - std::make_unique(kernel_, config_); + std::make_unique(kernel_, config_,dataMemory_); } else if (config_["Core"]["ISA"].as() == "AArch64") { arch_ = std::make_unique(kernel_, config_); @@ -244,6 +248,9 @@ void CoreInstance::createCore() { // Construct branch predictor object predictor_ = std::make_unique(config_); + if (mode_ == SimulationMode::MCU) { + predictor_ = std::make_unique(2); 
//config_ + } // Extract port arrangement from config file auto config_ports = config_["Ports"]; @@ -268,6 +275,10 @@ void CoreInstance::createCore() { core_ = std::make_shared( *instructionMemory_, *dataMemory_, processMemorySize_, entryPoint, *arch_, *predictor_); + } else if (mode_ == SimulationMode::MCU) { + core_ = std::make_shared( + *instructionMemory_, *dataMemory_, processMemorySize_, entryPoint, + *arch_, *predictor_, config_); } else if (mode_ == SimulationMode::OutOfOrder) { core_ = std::make_shared( *instructionMemory_, *dataMemory_, processMemorySize_, entryPoint, diff --git a/src/lib/Elf.cc b/src/lib/Elf.cc index 6281598403..901f370eec 100644 --- a/src/lib/Elf.cc +++ b/src/lib/Elf.cc @@ -2,6 +2,7 @@ #include #include +#include namespace simeng { @@ -13,7 +14,8 @@ namespace simeng { * https://man7.org/linux/man-pages/man5/elf.5.html */ -Elf::Elf(std::string path, char** imagePointer) { +Elf::Elf(std::string path, char** imagePointer, std::unordered_map& symbols) +{ std::ifstream file(path, std::ios::binary); if (!file.is_open()) { @@ -174,120 +176,69 @@ Elf::Elf(std::string path, char** imagePointer) { } } } else { - /** - * Starting from the 24th byte of the ELF header a 32-bit value - * represents the virtual address to which the system first transfers - * control, thus starting the process. - * In `elf32_hdr` this value maps to the member `Elf32_Addr e_entry`. - */ + file.seekg(0); - // Seek to the entry point of the file. - // The information in between is discarded - file.seekg(0x18); - file.read(reinterpret_cast(&entryPoint32_), sizeof(entryPoint32_)); + Elf32_Ehdr eheader; + file.read(reinterpret_cast(&eheader), sizeof(eheader)); - /** - * Starting from the 32nd byte of the ELF Header a 64-bit value - * represents the offset of the ELF Program header or - * Program header table in the ELF file. - * In `elf32_hdr` this value maps to the member `Elf32_Addr e_phoff`. - */ - - // Seek to the byte representing the start of the header offset table. - uint32_t headerOffset; - file.read(reinterpret_cast(&headerOffset), sizeof(headerOffset)); - - /** - * Starting 42th byte of the ELF Header a 16-bit value indicates - * the size of each entry in the ELF Program header. In the `elf32_hdr` - * struct this value maps to the member `Elf32_Half e_phentsize`. All - * header entries have the same size. - * Starting from the 44th byte a 16-bit value represents the number - * of header entries in the ELF Program header. In the `elf32_hdr` - * struct this value maps to `Elf32_Half e_phnum`. - */ - - // Seek to the byte representing header entry size. - file.seekg(0x2a); - uint16_t headerEntrySize; - file.read(reinterpret_cast(&headerEntrySize), sizeof(headerEntrySize)); - uint16_t headerEntries; - file.read(reinterpret_cast(&headerEntries), sizeof(headerEntries)); - - // Resize the header to equal the number of header entries. - headers32_.resize(headerEntries); + entryPoint32_ = eheader.e_entry; + processImageSize_ = 0; - // Loop over all headers and extract them. - for (size_t i = 0; i < headerEntries; i++) { - // Since all headers entries have the same size. - // We can extract the nth header using the header offset - // and header entry size. - file.seekg(headerOffset + (i * headerEntrySize)); - auto& header = headers32_[i]; + // Loop over pheaders and extract them. 
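
The rewritten 32-bit path reads the whole Elf32_Ehdr in one go and uses its named fields instead of the hard-coded seeks and sequential reads of the old code. A standalone sketch of that pattern, with error handling reduced to a single sanity check; <elf.h> supplies the struct on Linux.

#include <elf.h>
#include <fstream>

// e_entry, e_phoff, e_phentsize and e_phnum replace the old magic offsets
// (0x18, 0x2a, ...) the previous implementation seeked to.
bool readElf32Header(std::ifstream& file, Elf32_Ehdr& eheader) {
  file.seekg(0);
  file.read(reinterpret_cast<char*>(&eheader), sizeof(eheader));
  return file.good() && eheader.e_phentsize == sizeof(Elf32_Phdr);
}
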
+ file.seekg(eheader.e_phoff); + std::vector pheaders(eheader.e_phnum); + for (auto &ph : pheaders) { + file.read(reinterpret_cast(&ph), sizeof(ph)); + if ((ph.p_type == PT_LOAD) && (ph.p_vaddr+ph.p_memsz > processImageSize_)) + processImageSize_ = ph.p_vaddr+ph.p_memsz; + } - /** - * Like the ELF Header, the ELF Program header is also defined - * using a struct: - * typedef struct { - * uint32_t p_type; - * Elf32_Off p_offset; - * Elf32_Addr p_vaddr; - * Elf32_Addr p_paddr; - * uint32_t p_filesz; - * uint32_t p_memsz; - * uint32_t p_flags; - * uint32_t p_align; - * } Elf32_Phdr; - * - * The ELF Program header table is an array of structures, - * each describing a segment or other information the system - * needs to prepare the program for execution. A segment - * contains one or more sections (ELF Program Section). - * - * The `p_vaddr` field holds the virtual address at which the first - * byte of the segment resides in memory and the `p_memsz` field - * holds the number of bytes in the memory image of the segment. - * It may be zero. The `p_offset` member holds the offset from the - * beginning of the file at which the first byte of the segment resides. - */ + *imagePointer = (char*)malloc(processImageSize_ * sizeof(char)); - // Each address-related field is 4 bytes in a 32-bit ELF file - const int fieldBytes = 4; - file.read(reinterpret_cast(&(header.type)), sizeof(header.type)); - file.read(reinterpret_cast(&(header.offset)), fieldBytes); - file.read(reinterpret_cast(&(header.virtualAddress)), fieldBytes); - file.read(reinterpret_cast(&(header.physicalAddress)), fieldBytes); - file.read(reinterpret_cast(&(header.fileSize)), fieldBytes); - file.read(reinterpret_cast(&(header.memorySize)), fieldBytes); + for (const auto& ph : pheaders) { + if (ph.p_type == PT_LOAD) { + file.seekg(ph.p_offset); + // Read `fileSize` bytes from `file` into the appropriate place in process memory + file.read(*imagePointer+ph.p_vaddr, ph.p_filesz); - // To construct the process we look for the largest virtual address and - // add it to the memory size of the header. This way we obtain a very - // large array which can hold data at large virtual address. - // However, this way we end up creating a sparse array, in which most - // of the entries are unused. Also SimEng internally treats these - // virtual address as physical addresses to index into this large array. - if (header.virtualAddress + header.memorySize > processImageSize_) { - processImageSize_ = header.virtualAddress + header.memorySize; + if (ph.p_memsz>ph.p_filesz) + // Need to padd the rest of the section memory with zeros + memset(*imagePointer+ph.p_vaddr+ph.p_filesz, 0, ph.p_memsz-ph.p_filesz); } } - *imagePointer = (char*)malloc(processImageSize_ * sizeof(char)); - /** - * The ELF Program header has a member called `p_type`, which represents - * the kind of data or memory segments described by the program header. - * The value PT_LOAD=1 represents a loadable segment. In other words, - * it contains initialized data that contributes to the program's - * memory image. 
- */ - - // Process headers; only observe LOAD sections for this basic implementation - for (const auto& header : headers32_) { - if (header.type == 1) { // LOAD - file.seekg(header.offset); - // Read `fileSize` bytes from `file` into the appropriate place in process - // memory - file.read(*imagePointer + header.virtualAddress, header.fileSize); + // read section headers + Elf32_Shdr* sh_strtab = NULL; + Elf32_Shdr* sh_symtab = NULL; + file.seekg(eheader.e_shoff); + std::vector sheaders(eheader.e_shnum); + unsigned int sh_idx = 0; + for (auto &sh : sheaders) { + file.read(reinterpret_cast(&sh), sizeof(sh)); + + // find section header for strings to use for symbol table. + if (sh.sh_type==SHT_SYMTAB) + sh_symtab = &sh; + else if (sh.sh_type==SHT_STRTAB && sh_idx!=eheader.e_shstrndx) + sh_strtab = &sh; + sh_idx++; + }; + + // Read strings table + file.seekg(sh_strtab->sh_offset); + std::vector strtab(sh_strtab->sh_size); + file.read(&strtab[0], sh_strtab->sh_size); + + // Read symbols tables + file.seekg(sh_symtab->sh_offset); + unsigned num_symbols = sh_symtab->sh_size/sh_symtab->sh_entsize; + Elf32_Sym sym; + while(num_symbols--) { + file.read(reinterpret_cast(&sym), sizeof(sym)); + if (strtab[sym.st_name]) { + std::string name(&strtab[sym.st_name]); + symbols[name] = sym.st_value; } } } diff --git a/src/lib/GenericPredictor.cc b/src/lib/GenericPredictor.cc index 2539d7ae59..4b93d832bc 100644 --- a/src/lib/GenericPredictor.cc +++ b/src/lib/GenericPredictor.cc @@ -110,4 +110,11 @@ void GenericPredictor::flush(uint64_t address) { } } + +BranchPrediction GenericPredictor::predict(uint64_t address, BranchType type, + uint64_t knownTarget, + uint8_t byteLength) { + return predict(address, type, knownTarget); +} + } // namespace simeng diff --git a/src/lib/Instruction.cc b/src/lib/Instruction.cc index ac923c11b2..d1b7b112c5 100644 --- a/src/lib/Instruction.cc +++ b/src/lib/Instruction.cc @@ -57,5 +57,8 @@ bool Instruction::isLastMicroOp() const { return isLastMicroOp_; } void Instruction::setWaitingCommit() { waitingCommit_ = true; } bool Instruction::isWaitingCommit() const { return waitingCommit_; } int Instruction::getMicroOpIndex() const { return microOpIndex_; } +bool Instruction::isDiv() const { return isDiv_; } +bool Instruction::isMul() const { return isMul_; } +bool Instruction::isSysCall() const { return isSysCall_; } } // namespace simeng diff --git a/src/lib/ModelConfig.cc b/src/lib/ModelConfig.cc index 88cc1f7d59..342476347c 100644 --- a/src/lib/ModelConfig.cc +++ b/src/lib/ModelConfig.cc @@ -69,7 +69,7 @@ void ModelConfig::validate() { configFile_[root][subFields[0]], subFields[0], std::vector({"AArch64", "rv64", "rv32"}), ExpectedValue::String); nodeChecker(configFile_[root][subFields[1]], subFields[1], - {"emulation", "inorderpipelined", "outoforder"}, + {"emulation", "inorderpipelined", "mcu", "outoforder"}, ExpectedValue::String); nodeChecker(configFile_[root][subFields[2]], subFields[2], std::make_pair(0.f, 10.f), ExpectedValue::Float); diff --git a/src/lib/arch/aarch64/Architecture.cc b/src/lib/arch/aarch64/Architecture.cc index 23ebf86ae3..5ad11c70d1 100644 --- a/src/lib/arch/aarch64/Architecture.cc +++ b/src/lib/arch/aarch64/Architecture.cc @@ -281,11 +281,13 @@ ProcessStateChange Architecture::getInitialState() const { uint8_t Architecture::getMaxInstructionSize() const { return 4; } +uint8_t Architecture::getMinInstructionSize() const { return 4; } + uint64_t Architecture::getVectorLength() const { return VL_; } uint64_t Architecture::getStreamingVectorLength() const { 
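
The symbol-table walk added to Elf.cc above boils down to indexing the string table with each symbol's st_name offset and recording named symbols in a name-to-value map (later used to find `tohost`). The same logic in isolation; variable names here are illustrative.

#include <cstdint>
#include <elf.h>
#include <string>
#include <unordered_map>
#include <vector>

// st_name is an offset into the string table, not the name itself; only
// named symbols are recorded.
void collectSymbols(const std::vector<char>& strtab,
                    const std::vector<Elf32_Sym>& symtab,
                    std::unordered_map<std::string, uint64_t>& symbols) {
  for (const auto& sym : symtab) {
    if (sym.st_name < strtab.size() && strtab[sym.st_name] != '\0') {
      symbols[std::string(&strtab[sym.st_name])] = sym.st_value;
    }
  }
}
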
return SVL_; } -void Architecture::updateSystemTimerRegisters(RegisterFileSet* regFile, +int16_t Architecture::updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations) const { // Update the Processor Cycle Counter to total cycles completed. regFile->set(PCCreg_, iterations); @@ -293,6 +295,8 @@ void Architecture::updateSystemTimerRegisters(RegisterFileSet* regFile, if (iterations % (uint64_t)vctModulo_ == 0) { regFile->set(VCTreg_, regFile->get(VCTreg_).get() + 1); } + // interrupts NYI + return -1; } std::vector diff --git a/src/lib/arch/aarch64/Instruction.cc b/src/lib/arch/aarch64/Instruction.cc index 909f5263ae..602bdc7fb3 100644 --- a/src/lib/arch/aarch64/Instruction.cc +++ b/src/lib/arch/aarch64/Instruction.cc @@ -43,7 +43,7 @@ Instruction::Instruction(const Instruction& insn) branchAddress_ = insn.branchAddress_; branchTaken_ = insn.branchTaken_; branchType_ = insn.branchType_; - knownTarget_ = insn.knownTarget_; + knownOffset_ = insn.knownOffset_; sequenceId_ = insn.sequenceId_; flushed_ = insn.flushed_; latency_ = insn.latency_; @@ -182,7 +182,7 @@ std::tuple Instruction::checkEarlyBranchMisprediction() const { BranchType Instruction::getBranchType() const { return branchType_; } -uint64_t Instruction::getKnownTarget() const { return knownTarget_; } +uint64_t Instruction::getKnownOffset() const { return knownOffset_; } uint16_t Instruction::getGroup() const { // Use identifiers to decide instruction group diff --git a/src/lib/arch/aarch64/Instruction_decode.cc b/src/lib/arch/aarch64/Instruction_decode.cc index e3f0529ee9..ca86962958 100644 --- a/src/lib/arch/aarch64/Instruction_decode.cc +++ b/src/lib/arch/aarch64/Instruction_decode.cc @@ -364,7 +364,7 @@ void Instruction::decode() { switch (metadata.opcode) { case Opcode::AArch64_B: // b label branchType_ = BranchType::Unconditional; - knownTarget_ = metadata.operands[0].imm; + knownOffset_ = metadata.operands[0].imm; break; case Opcode::AArch64_BR: { // br xn branchType_ = BranchType::Unconditional; @@ -372,7 +372,7 @@ void Instruction::decode() { } case Opcode::AArch64_BL: // bl #imm branchType_ = BranchType::SubroutineCall; - knownTarget_ = metadata.operands[0].imm; + knownOffset_ = metadata.operands[0].imm; break; case Opcode::AArch64_BLR: { // blr xn branchType_ = BranchType::SubroutineCall; @@ -383,7 +383,7 @@ void Instruction::decode() { branchType_ = BranchType::LoopClosing; else branchType_ = BranchType::Conditional; - knownTarget_ = metadata.operands[0].imm; + knownOffset_ = metadata.operands[0].imm; break; } case Opcode::AArch64_CBNZW: // cbnz wn, #imm @@ -397,7 +397,7 @@ void Instruction::decode() { branchType_ = BranchType::LoopClosing; else branchType_ = BranchType::Conditional; - knownTarget_ = metadata.operands[1].imm; + knownOffset_ = metadata.operands[1].imm; break; } case Opcode::AArch64_TBNZW: // tbnz wn, #imm, label @@ -411,7 +411,7 @@ void Instruction::decode() { branchType_ = BranchType::LoopClosing; else branchType_ = BranchType::Conditional; - knownTarget_ = metadata.operands[2].imm; + knownOffset_ = metadata.operands[2].imm; break; } case Opcode::AArch64_RET: { // ret {xr} diff --git a/src/lib/arch/riscv/Architecture.cc b/src/lib/arch/riscv/Architecture.cc index d1a18777e8..84afcc0996 100644 --- a/src/lib/arch/riscv/Architecture.cc +++ b/src/lib/arch/riscv/Architecture.cc @@ -4,6 +4,7 @@ #include #include #include +#include #include "InstructionMetadata.hh" @@ -14,8 +15,10 @@ namespace riscv { std::unordered_map Architecture::decodeCache; std::forward_list 
Architecture::metadataCache; -Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) - : linux_(kernel) { +Architecture::Architecture(kernel::Linux& kernel, YAML::Node config, std::shared_ptr& dataMemory) +: + linux_(kernel) +{ is32Bit_ = ARCH_64BIT; if (config["Core"]["ISA"].as() == "rv32") { is32Bit_ = ARCH_32BIT; @@ -46,14 +49,39 @@ Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) // Generate zero-indexed system register map systemRegisterMap_[SYSREG_MSTATUS] = systemRegisterMap_.size(); + systemRegisterMap_[SYSREG_MIE] = systemRegisterMap_.size(); + systemRegisterMap_[SYSREG_MTVEC] = systemRegisterMap_.size(); systemRegisterMap_[SYSREG_MSTATUSH] = systemRegisterMap_.size(); + systemRegisterMap_[SYSREG_MSCRATCH] = systemRegisterMap_.size(); systemRegisterMap_[SYSREG_MEPC] = systemRegisterMap_.size(); systemRegisterMap_[SYSREG_MCAUSE] = systemRegisterMap_.size(); systemRegisterMap_[SYSREG_MHARTID] = systemRegisterMap_.size(); + systemRegisterMap_[SYSREG_MXCPTSC] = systemRegisterMap_.size(); systemRegisterMap_[SYSREG_CYCLE] = systemRegisterMap_.size(); systemRegisterMap_[SYSREG_TIME] = systemRegisterMap_.size(); systemRegisterMap_[SYSREG_INSTRRET] = systemRegisterMap_.size(); + // Memory Mapped System Register Blocks + + // if elf file includes the label tohost then assume that this binary supports HTIF protocol (used by spike) and include an HTI block + uint64_t htifAddress; + if (linux_.lookupSymbolValue("tohost",htifAddress)) + { + std::cout << "[SimEng] HTIF detected at: " << std::hex << htifAddress << std::endl; + htif = std::make_shared(*this); + memoryMappedSystemRegisterBlocks[htifAddress] = htif.get(); + } + + // Install CLINT into memort map, this is optional + clint = std::make_shared(*this); + memoryMappedSystemRegisterBlocks[Clint::CLINT_BASE] = clint.get(); + + if (!memoryMappedSystemRegisterBlocks.empty()) + { + systemRegisterMemoryInterface = std::make_shared(dataMemory, memoryMappedSystemRegisterBlocks); + dataMemory = systemRegisterMemoryInterface; + } + // Instantiate an executionInfo entry for each group in the InstructionGroup // namespace. for (int i = 0; i < NUM_GROUPS; i++) { @@ -145,7 +173,7 @@ Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) } } } - if (config["Core"]["Trace"].as()) { + if (config["Core"]["Trace"].IsDefined() && config["Core"]["Trace"].as()) { traceFile_ = new std::ofstream(); traceFile_->open("./trace.log"); traceOn_ = true; @@ -164,6 +192,7 @@ Architecture::~Architecture() { uint8_t Architecture::predecode(const void* ptr, uint8_t bytesAvailable, uint64_t instructionAddress, MacroOp& output) const { + // Check that instruction address is 4-byte aligned as required by RISC-V // 2-byte when Compressed ISA is supported if (instructionAddress & constants_.alignMask) { @@ -221,9 +250,11 @@ uint8_t Architecture::predecode(const void* ptr, uint8_t bytesAvailable, output.resize(1); auto& uop = output[0]; - // Retrieve the cached instruction and write to output - uop = std::make_shared(iter->second); + // Retrieve the cached instruction + auto newinsn = std::make_shared(iter->second); + // write to output + uop = newinsn; uop->setInstructionAddress(instructionAddress); return iter->second.getMetadata().lenBytes; @@ -265,8 +296,19 @@ int32_t Architecture::getSystemRegisterTag(uint16_t reg) const { // Check below is done for speculative instructions that may be passed into // the function but will not be executed. 
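
With HTIF and the CLINT installed, the wrapping memory interface only has to decide whether an address falls inside one of the registered blocks; everything else passes through to ordinary data memory. A sketch of that address-claim test, using the same upper_bound-then-step-back pattern as the lookup code later in this patch; Block and its size field are placeholders.

#include <cstdint>
#include <map>

struct Block {
  uint64_t size;  // bytes covered by this register block
};

// Returns the block containing `address`, or nullptr if no block claims it.
Block* findBlock(std::map<uint64_t, Block*>& blocks, uint64_t address) {
  auto it = blocks.upper_bound(address);  // first block starting beyond address
  if (it == blocks.begin()) return nullptr;
  --it;                                   // candidate block at or below address
  if (address - it->first < it->second->size) return it->second;
  return nullptr;
}
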
If such invalid speculative // instructions get through they can cause an out-of-range error. - if (!systemRegisterMap_.count(reg)) return 0; - return systemRegisterMap_.at(reg); + if (systemRegisterMap_.count(reg)) + return systemRegisterMap_.at(reg); + else + return -1; +} + +/** Returns a System Register index from a system register tag. + reverse lookup slow but only used in printing so will be fine */ +uint16_t Architecture::getSystemRegisterIdFromTag(int32_t tag) const { + for (auto it = systemRegisterMap_.begin();it != systemRegisterMap_.end();it++) + if (it->second == tag) + return it->first; + assert(0 && "Tag not found in systemRegisterMap"); } ProcessStateChange Architecture::getInitialState() const { @@ -289,6 +331,8 @@ ProcessStateChange Architecture::getInitialState() const { uint8_t Architecture::getMaxInstructionSize() const { return 4; } +uint8_t Architecture::getMinInstructionSize() const { return 2; } + std::vector Architecture::getConfigPhysicalRegisterStructure(YAML::Node config) const { return {{constants_.regWidth, config["Register-Set"]["GeneralPurpose-Count"].as()}, @@ -306,9 +350,21 @@ uint16_t Architecture::getNumSystemRegisters() const { return static_cast(systemRegisterMap_.size()); } -// Left blank as no implementation necessary -void Architecture::updateSystemTimerRegisters(RegisterFileSet* regFile, +int16_t Architecture::updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations) const { + int16_t interruptId = -1; + + if (htif) + { + interruptId = htif->updateSystemTimerRegisters(regFile, iterations); + if (interruptId>=0) + return interruptId; + } + + if (clint) + interruptId = clint->updateSystemTimerRegisters(regFile, iterations); + + return interruptId; } void Architecture::updateInstrTrace(const std::shared_ptr& instruction, @@ -346,7 +402,7 @@ void Architecture::updateInstrTrace(const std::shared_ptr& } else if(reg.type == RegisterType::FLOAT) { s << "f" << std::dec << std::setfill('0') << std::setw(2) << reg.tag << "=0x"; } else if(reg.type == RegisterType::SYSTEM) { - s << "csr_0x" << std::hex << std::setfill('0') << std::setw(3) << metadata.csr << "=0x"; + s << "csr_0x" << std::hex << std::setfill('0') << std::setw(3) << getSystemRegisterIdFromTag(reg.tag) << "=0x"; } s << std::hex << std::setfill('0') << std::setw(8) << regFile->get(reg).get(); if(i < (num_dest-1)) { @@ -364,7 +420,7 @@ void Architecture::updateInstrTrace(const std::shared_ptr& } else if(reg.type == RegisterType::FLOAT) { s << "f" << std::dec << std::setfill('0') << std::setw(2) << reg.tag << "=0x"; } else if(reg.type == RegisterType::SYSTEM) { - s << "csr_0x" << std::hex << std::setfill('0') << std::setw(3) << metadata.csr << "=0x"; + s << "csr_0x" << std::hex << std::setfill('0') << std::setw(3) << getSystemRegisterIdFromTag(reg.tag) << "=0x"; } s << std::hex << std::setfill('0') << std::setw(8) << regFile->get(reg).get(); if(i < (num_src-1)) { diff --git a/src/lib/arch/riscv/ExceptionHandler.cc b/src/lib/arch/riscv/ExceptionHandler.cc index c88448048d..9ba2200858 100644 --- a/src/lib/arch/riscv/ExceptionHandler.cc +++ b/src/lib/arch/riscv/ExceptionHandler.cc @@ -1,5 +1,5 @@ +#include "simeng/arch/riscv/Architecture.hh" #include "simeng/arch/riscv/ExceptionHandler.hh" - #include #include @@ -646,6 +646,18 @@ bool ExceptionHandler::init() { } return concludeSyscall(stateChange); + + } else if (exception == InstructionException::SecureMonitorCall) { + printException(instruction_); + takeException(CAUSE_BREAKPOINT); + return true; + } else if (exception == 
InstructionException::Interrupt) { + printException(instruction_); + if (instruction_.getInterruptId() == static_cast(InterruptId::HALT)) + return fatal(); + uint64_t mcause_val = static_cast(instruction_.getInterruptId()) | (1<<(8*instruction_.getArchRegWidth()-1)); + takeException(mcause_val); + return true; } printException(instruction_); @@ -745,6 +757,45 @@ void ExceptionHandler::readLinkAt(span path) { concludeSyscall(stateChange); } +void ExceptionHandler::takeException(uint64_t causecode) +{ + const auto& registerFileSet = core.getArchitecturalRegisterFileSet(); + auto& architecture = instruction_.getArchitecture(); + uint16_t mtvec_tag = static_cast(architecture.getSystemRegisterTag(SYSREG_MTVEC)); + uint16_t mstatus_tag = static_cast(architecture.getSystemRegisterTag(SYSREG_MSTATUS)); + uint16_t mepc_tag = static_cast(architecture.getSystemRegisterTag(SYSREG_MEPC)); + uint16_t mcause_tag = static_cast(architecture.getSystemRegisterTag(SYSREG_MCAUSE)); + uint64_t mcause_val = static_cast(causecode); + + auto mstatus_bits = registerFileSet.get( { RegisterType::SYSTEM, mstatus_tag } ).get(); + + // mpie=mie, mie=0 + mstatus_bits &= ~MSTATUS_MPIE_MASK; + if (mstatus_bits & MSTATUS_MIE_MASK) + mstatus_bits |= MSTATUS_MPIE_MASK; + mstatus_bits &= ~MSTATUS_MIE_MASK; + + RegisterValue mstatus (mstatus_bits, architecture.getConstants().regWidth); + RegisterValue mepc (instruction_.getInstructionAddress(), architecture.getConstants().regWidth); + RegisterValue mcause (mcause_val, architecture.getConstants().regWidth); + + uint64_t mtvec = registerFileSet.get( { RegisterType::SYSTEM, mtvec_tag } ).get(); + + ProcessStateChange changes = { + ChangeType::REPLACEMENT, + { + { RegisterType::SYSTEM, mstatus_tag }, + { RegisterType::SYSTEM, mepc_tag }, + { RegisterType::SYSTEM, mcause_tag } + }, + {mstatus, mepc, mcause} + }; + + result_ = {false, mtvec, changes}; + //result_ = {false, instruction_.getInstructionAddress(), changes}; +} + + bool ExceptionHandler::readBufferThen(uint64_t ptr, uint64_t length, std::function then, bool firstCall) { @@ -827,6 +878,9 @@ void ExceptionHandler::printException(const Instruction& insn) const { case InstructionException::NoAvailablePort: std::cout << "unsupported execution port"; break; + case InstructionException::Interrupt: + std::cout << "interrupt (id: " << insn.getInterruptId() << ")"; + break; case InstructionException::UnmappedSysReg: std::cout << "unmapped system register"; break; diff --git a/src/lib/arch/riscv/Instruction.cc b/src/lib/arch/riscv/Instruction.cc index 6cfc173b9d..e292b889b9 100644 --- a/src/lib/arch/riscv/Instruction.cc +++ b/src/lib/arch/riscv/Instruction.cc @@ -131,7 +131,7 @@ std::tuple Instruction::checkEarlyBranchMisprediction() const { BranchType Instruction::getBranchType() const { return branchType_; } -uint64_t Instruction::getKnownTarget() const { return knownTarget_; } +uint64_t Instruction::getKnownOffset() const { return knownOffset_; } uint16_t Instruction::getGroup() const { uint16_t base = InstructionGroups::INT; @@ -171,6 +171,10 @@ void Instruction::setArchRegWidth(uint8_t len) { archRegWidth_ = len; } uint8_t Instruction::getArchRegWidth() const { return archRegWidth_; } +const Architecture& Instruction::getArchitecture() const { + return architecture_; +} + } // namespace riscv } // namespace arch } // namespace simeng diff --git a/src/lib/arch/riscv/InstructionMetadata.cc b/src/lib/arch/riscv/InstructionMetadata.cc index f2b5a9b736..d293bc7fdb 100644 --- a/src/lib/arch/riscv/InstructionMetadata.cc +++ 
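
The takeException() routine above performs the standard machine-mode trap entry: MIE is saved into MPIE and cleared, the faulting PC goes to mepc, the cause to mcause (with the top bit set for interrupts), and execution resumes at mtvec. A self-contained sketch of just the register arithmetic; the bit positions follow the RISC-V privileged spec (MIE = bit 3, MPIE = bit 7), which is what the MSTATUS_*_MASK constants used above are assumed to encode.

#include <cstdint>

constexpr uint32_t MSTATUS_MIE  = 1u << 3;  // machine interrupt enable
constexpr uint32_t MSTATUS_MPIE = 1u << 7;  // previous interrupt enable

struct TrapEntry { uint32_t mstatus, mepc, mcause; };

TrapEntry enterTrap(uint32_t mstatus, uint32_t faultingPc, uint32_t cause,
                    bool isInterrupt) {
  uint32_t s = mstatus & ~MSTATUS_MPIE;
  if (s & MSTATUS_MIE) s |= MSTATUS_MPIE;   // mpie <- mie
  s &= ~MSTATUS_MIE;                        // mie <- 0 while in the handler
  uint32_t mcause = cause;
  if (isInterrupt) mcause |= 1u << 31;      // interrupt bit is the MSB on RV32
  return {s, faultingPc, mcause};
}
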
b/src/lib/arch/riscv/InstructionMetadata.cc @@ -264,7 +264,10 @@ void InstructionMetadata::alterPseudoInstructions(const cs_insn& insn) { csr = ((uint32_t)encoding[3] << 4) | ((uint32_t)encoding[2] >> 4); //If there are less than 2 operands provided add necessary x0 operand if(operandCount == 1) { - if(strcmp(mnemonic, "csrr") == 0) { //csrrs rd,csr,x0 + if((strcmp(mnemonic, "rdinstret") == 0) || + (strcmp(mnemonic, "rdcycle") == 0) || + (strcmp(mnemonic, "rdtime") == 0) || + (strcmp(mnemonic, "csrr") == 0)) { //csrrs rd,csr,x0 operands[1].type = RISCV_OP_REG; operands[1].reg = 1; } else { //csrrxx x0,csr,rs/imm diff --git a/src/lib/arch/riscv/Instruction_decode.cc b/src/lib/arch/riscv/Instruction_decode.cc index 8bdd5041eb..9efa7f5c4c 100644 --- a/src/lib/arch/riscv/Instruction_decode.cc +++ b/src/lib/arch/riscv/Instruction_decode.cc @@ -143,6 +143,24 @@ void Instruction::decode() { case Opcode::RISCV_SD: isStore_ = true; break; + //identify MULs/DIVs + case Opcode::RISCV_MUL: + case Opcode::RISCV_MULH: + case Opcode::RISCV_MULHU: + case Opcode::RISCV_MULHSU: + case Opcode::RISCV_MULW: + isMultiply_ = true; + isMul_ = true; //this one is for simeng/Instruction.hh + break; + case Opcode::RISCV_DIV: + case Opcode::RISCV_DIVU: + case Opcode::RISCV_DIVUW: + case Opcode::RISCV_DIVW: + isDivide_ = true; + isDiv_ = true; //this one is for simeng/Instruction.hh + break; + case Opcode::RISCV_ECALL: + isSysCall_ = true; } if (Opcode::RISCV_AMOADD_D <= metadata.opcode && @@ -257,6 +275,16 @@ void Instruction::decode() { isCompare_ = true; } + if (Opcode::RISCV_MRET == metadata.opcode) { + uint16_t mepc_tag = static_cast(architecture_.getSystemRegisterTag(SYSREG_MEPC)); + uint16_t mstatus_tag = static_cast(architecture_.getSystemRegisterTag(SYSREG_MSTATUS)); + sourceRegisters[sourceRegisterCount++] = { RegisterType::SYSTEM, mepc_tag }; + sourceRegisters[sourceRegisterCount++] = { RegisterType::SYSTEM, mstatus_tag }; + destinationRegisters[destinationRegisterCount++] = { RegisterType::SYSTEM, mstatus_tag }; + operandsPending += 2; + isBranch_ = true; + } + // Set branch type switch (metadata.opcode) { case Opcode::RISCV_BEQ: @@ -266,12 +294,24 @@ void Instruction::decode() { case Opcode::RISCV_BGE: case Opcode::RISCV_BGEU: branchType_ = BranchType::Conditional; - knownTarget_ = instructionAddress_ + metadata.operands[2].imm; + knownOffset_ = metadata.operands[2].imm; break; case Opcode::RISCV_JAL: + branchType_ = BranchType::SubroutineCall; + knownOffset_ = metadata.operands[1].imm; + break; case Opcode::RISCV_JALR: - branchType_ = BranchType::Unconditional; - knownTarget_ = instructionAddress_ + metadata.operands[1].imm; + { + //jalr x0, 0(x1) == ret + if (metadata.operands[0].reg == RISCV_REG_X0 && metadata.operands[1].reg == RISCV_REG_X1 && metadata.operands[2].imm == 0) { + branchType_ = BranchType::Return; + } else { + branchType_ = BranchType::SubroutineCall; + } + break; + } + case Opcode::RISCV_MRET: + branchType_ = BranchType::Unknown; //TODO: think which type it fits / create new type break; } } @@ -292,10 +332,14 @@ bool Instruction::decode16() { "Invalid operand for JR,JALR:- CR instructions"); sourceRegisters[sourceRegisterCount++] = csRegToRegister(metadata.operands[0].reg); operandsPending++; + branchType_ = BranchType::SubroutineCall; if (metadata.opcode == Opcode::RISCV_C_JALR) { destinationRegisters[destinationRegisterCount++] = Instruction::RA_REGISTER; + } else { //case C_JR + if (metadata.operands[0].reg == RISCV_REG_X1 ) { + branchType_ = BranchType::Return; + } } - branchType_ = 
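
The decode change above applies the standard RISC-V calling-convention hint: `jalr x0, 0(x1)` is the canonical `ret`, so the patch classifies it as a Return for the return address stack, while every other jalr is treated as a subroutine call. As a standalone rule:

#include <cstdint>

enum class BType { SubroutineCall, Return };

// rd/rs1 are architectural register numbers; imm is the I-type displacement.
BType classifyJalr(unsigned rd, unsigned rs1, int32_t imm) {
  if (rd == 0 && rs1 == 1 && imm == 0) return BType::Return;  // jalr x0, 0(ra)
  return BType::SubroutineCall;
}
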
BranchType::Unconditional; break; case Opcode::RISCV_C_MV: instFormat_ = CIF_CR; @@ -309,7 +353,7 @@ bool Instruction::decode16() { sourceRegisters[sourceRegisterCount++] = csRegToRegister(metadata.operands[1].reg); operandsPending++; break; - case Opcode::RISCV_C_EBREAK://TODO + case Opcode::RISCV_C_EBREAK: instFormat_ = CIF_CR; break; case Opcode::RISCV_C_ADD: @@ -410,7 +454,7 @@ bool Instruction::decode16() { operandsPending++; c_imm = metadata.operands[1].imm; branchType_ = BranchType::Conditional; - knownTarget_ = instructionAddress_ + metadata.operands[1].imm; + knownOffset_ = metadata.operands[1].imm; break; case Opcode::RISCV_C_FLD: case Opcode::RISCV_C_FLW: @@ -503,9 +547,11 @@ bool Instruction::decode16() { c_imm = metadata.operands[0].imm; if (metadata.opcode == Opcode::RISCV_C_JAL) { destinationRegisters[destinationRegisterCount++] = Instruction::RA_REGISTER; + branchType_ = BranchType::SubroutineCall; + } else { // case C_J + branchType_ = BranchType::Unconditional; } - branchType_ = BranchType::Unconditional; - knownTarget_ = instructionAddress_ + metadata.operands[0].imm; + knownOffset_ = metadata.operands[0].imm; break; case Opcode::RISCV_C_UNIMP: break; @@ -523,7 +569,7 @@ bool Instruction::decodeCsr() { } isCsr_ = true; - uint32_t sysRegTag = architecture_.getSystemRegisterTag(metadata.csr); + int32_t sysRegTag = architecture_.getSystemRegisterTag(metadata.csr); if (sysRegTag == -1) { exceptionEncountered_ = true; exception_ = InstructionException::UnmappedSysReg; @@ -539,16 +585,16 @@ bool Instruction::decodeCsr() { destinationRegisters[destinationRegisterCount++] = { RegisterType::SYSTEM, static_cast(sysRegTag)}; - // First operand from metadata is rd, second operand from metadata is rs1 - if (csRegToRegister(metadata.operands[1].reg) != Instruction::ZERO_REGISTER) { + // First operand (0) from metadata is rd, second operand (1) from metadata is rs1 + if (csRegToRegister(metadata.operands[0].reg) != Instruction::ZERO_REGISTER) { destinationRegisters[destinationRegisterCount++] = - csRegToRegister(metadata.operands[1].reg); + csRegToRegister(metadata.operands[0].reg); } - if(metadata.operands[0].type == RISCV_OP_IMM) { - c_imm = metadata.operands[0].imm; - } else if (metadata.operands[0].type == RISCV_OP_REG) { - sourceRegisters[sourceRegisterCount] = csRegToRegister(metadata.operands[0].reg); + if(metadata.operands[1].type == RISCV_OP_IMM) { + c_imm = metadata.operands[1].imm; + } else if (metadata.operands[1].type == RISCV_OP_REG) { + sourceRegisters[sourceRegisterCount] = csRegToRegister(metadata.operands[1].reg); if (sourceRegisters[sourceRegisterCount] == Instruction::ZERO_REGISTER) { // Catch zero register references and pre-complete those operands diff --git a/src/lib/arch/riscv/Instruction_execute.cc b/src/lib/arch/riscv/Instruction_execute.cc index b7a4a822b4..a37d3750fd 100644 --- a/src/lib/arch/riscv/Instruction_execute.cc +++ b/src/lib/arch/riscv/Instruction_execute.cc @@ -4,6 +4,7 @@ #include "InstructionMetadata.hh" #include "simeng/arch/riscv/Instruction.hh" +#include "simeng/arch/riscv/SystemRegister.hh" namespace simeng { namespace arch { @@ -358,7 +359,9 @@ void Instruction::execute() { } case Opcode::RISCV_SLTIU: { // SLTIU rd,rs1,imm const uint64_t rs1 = operands[0].get(); - const uint64_t imm = static_cast(metadata.operands[2].imm); + uint64_t imm = metadata.operands[2].imm; + if (archRegWidth_==4) + imm = static_cast(imm); if (rs1 < imm) { results[0] = RegisterValue(static_cast(1), archRegWidth_); } else { @@ -460,13 +463,28 @@ void 
Instruction::execute() { results[0] = RegisterValue(instructionAddress_ + 4, archRegWidth_); break; } - // TODO EBREAK + case Opcode::RISCV_EBREAK: { // EBREAK // used to return control to a debugging environment pg27 20191213 + exceptionEncountered_ = true; + exception_ = InstructionException::SecureMonitorCall; + break; + } case Opcode::RISCV_ECALL: { // ECALL exceptionEncountered_ = true; exception_ = InstructionException::SupervisorCall; break; } + case Opcode::RISCV_MRET: { // MRET + branchAddress_ = (operands[0].get()) & ~1; // Set LSB of result to 0 + branchTaken_ = true; + + auto mstatus = operands[1].get(); + if (mstatus & MSTATUS_MPIE_MASK) + mstatus |= MSTATUS_MIE_MASK; + + results[0] = RegisterValue(mstatus, archRegWidth_); + break; + } case Opcode::RISCV_FENCE: { // FENCE // TODO currently modelled as a NOP as all codes are currently single // threaded "Informally, no other RISC-V hart or external device can @@ -709,28 +727,49 @@ void Instruction::execute() { results[0] = RegisterValue(static_cast(rs1 * rs2), archRegWidth_); break; } - // case Opcode::RISCV_MULH: {//MULH rd,rs1,rs2 - // return executionNYI(); - // - // const int64_t rs1 = operands[0].get(); - // const int64_t rs2 = operands[1].get(); - // results[0] = RegisterValue(mulhiss(rs1, rs2); - // break; - // } + case Opcode::RISCV_MULH: {//MULH rd,rs1,rs2 + int64_t result; + if (archRegWidth_==4) + { + const int64_t rs1 = operands[0].get(); + const int64_t rs2 = operands[1].get(); + result = (rs1*rs2)>>32; + } else { + const int64_t rs1 = operands[0].get(); + const int64_t rs2 = operands[1].get(); + //result = mulhiss(rs1, rs2); + return executionNYI(); + } + results[0] = RegisterValue(result, archRegWidth_); + break; + } case Opcode::RISCV_MULHU: { // MULHU rd,rs1,rs2 const uint64_t rs1 = operands[0].get(); const uint64_t rs2 = operands[1].get(); - results[0] = RegisterValue(mulhiuu(rs1, rs2), archRegWidth_); + uint64_t result; + if (archRegWidth_==4) + result = (rs1*rs2)>>32; + else + result = mulhiuu(rs1, rs2); + results[0] = RegisterValue(result, archRegWidth_); + break; + } + case Opcode::RISCV_MULHSU: {//MULHSU rd,rs1,rs2 + int64_t result; + if (archRegWidth_==4) + { + const int64_t rs1 = operands[0].get(); + const uint64_t rs2 = operands[1].get(); + result = (rs1*rs2)>>32; + } else { + const int64_t rs1 = operands[0].get(); + const uint64_t rs2 = operands[1].get(); + //result = mulhisu(rs1, rs2); + return executionNYI(); + } + results[0] = RegisterValue(result, archRegWidth_); break; } - // case Opcode::RISCV_MULHSU: {//MULHSU rd,rs1,rs2 - // return executionNYI(); - // - // const int64_t rs1 = operands[0].get(); - // const uint64_t rs2 = operands[1].get(); - // results[0] = RegisterValue(mulhisu(rs1, rs2); - // break; - // } case Opcode::RISCV_MULW: { // MULW rd,rs1,rs2 const uint32_t rs1 = operands[0].get(); const uint32_t rs2 = operands[1].get(); @@ -852,12 +891,14 @@ void Instruction::execute() { uint32_t new_csr_value = old_csr_value & ~(operands[1].get()); results[0] = RegisterValue(new_csr_value, 4); results[1] = RegisterValue(old_csr_value, 4); + break; } case Opcode::RISCV_CSRRCI: { uint32_t old_csr_value = operands[0].get(); uint32_t new_csr_value = old_csr_value & ~(c_imm); results[0] = RegisterValue(new_csr_value, 4); results[1] = RegisterValue(old_csr_value, 4); + break; } case Opcode::RISCV_CSRRS: { uint32_t old_csr_value = operands[0].get(); @@ -938,8 +979,12 @@ void Instruction::execute() { } break; } - case Opcode::RISCV_C_EBREAK: + case Opcode::RISCV_C_EBREAK: { + // used to return control to 
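
The RV32 MULH family above only needs a 64-bit intermediate product: widen both operands with the signedness each variant requires, multiply, and keep the upper 32 bits. The same arithmetic in isolation; for MULHSU the product always fits in a signed 64-bit value, so converting the unsigned operand is value-preserving.

#include <cstdint>

int32_t mulh32(int32_t a, int32_t b) {
  return static_cast<int32_t>((static_cast<int64_t>(a) * b) >> 32);
}

uint32_t mulhu32(uint32_t a, uint32_t b) {
  return static_cast<uint32_t>((static_cast<uint64_t>(a) * b) >> 32);
}

int32_t mulhsu32(int32_t a, uint32_t b) {
  // b converts to int64_t without loss, so the product is exact in 64 bits.
  return static_cast<int32_t>(
      (static_cast<int64_t>(a) * static_cast<int64_t>(b)) >> 32);
}
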
a debugging environment pg27 20191213 + exceptionEncountered_ = true; + exception_ = InstructionException::SecureMonitorCall; break; + } case Opcode::RISCV_C_FLD: break; case Opcode::RISCV_C_FLDSP: diff --git a/src/lib/arch/riscv/SystemRegister.cc b/src/lib/arch/riscv/SystemRegister.cc new file mode 100644 index 0000000000..05de188dcd --- /dev/null +++ b/src/lib/arch/riscv/SystemRegister.cc @@ -0,0 +1,124 @@ +#include "simeng/arch/riscv/Architecture.hh" + +namespace simeng { +namespace arch { +namespace riscv { + +bool MemoryMappedSystemRegisterBlock::put(uint16_t offset, const RegisterValue& value) +{ + auto it = memoryMappedSystemRegisters.upper_bound(offset); + if (it != memoryMappedSystemRegisters.begin() ) + { + it--; + if (offset-it->first < it->second->size()) { + it->second->put(value); + return true; + } + return false; + } + return false; +} + +bool MemoryMappedSystemRegisterBlock::get(uint16_t offset, RegisterValue& value) +{ + auto it = memoryMappedSystemRegisters.upper_bound(offset); + if (it != memoryMappedSystemRegisters.begin() ) + { + it--; + if (offset-it->first < it->second->size()) { + value = it->second->get(); + return true; + } + return false; + } + return false; +} + +/** Put/Get Memory Mapped Registers */ +bool SystemRegisterMemoryInterface::putMemoryMappedSystemRegister(uint64_t address, const RegisterValue& value) +{ + auto it = memoryMappedSystemRegisterBlocks_.upper_bound(address); + if (it != memoryMappedSystemRegisterBlocks_.begin() ) + { + it--; + if (address-it->first < it->second->size()) { + it->second->put(static_cast(address-it->first),value); + return true; + } + return false; + } + return false; +} + +bool SystemRegisterMemoryInterface::getMemoryMappedSystemRegister(uint64_t address, RegisterValue& value) +{ + auto it = memoryMappedSystemRegisterBlocks_.upper_bound(address); + if (it != memoryMappedSystemRegisterBlocks_.begin() ) + { + it--; + if (address-it->first < it->second->size()) { + it->second->get(static_cast(address-it->first),value); + return true; + } + return false; + } + return false; +} + +bool HostTargetInterface::put(uint16_t offset, const RegisterValue&value) +{ + switch(offset) { + case PAYLOAD_OFFSET : + { + char ch = value.getAsVector()[0]; + if (ch==3 || ch==1) + isHalted_ = true; + else + putchar(ch); + return true; + } + default : + return MemoryMappedSystemRegisterBlock::put(offset, value); + } +} + +int16_t Clint::updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations) +{ + uint64_t ticks = iterations-last_tick; + uint64_t mtime_val = mtime_.get().get(); + bool ticked = false; + + last_tick = iterations; + + // if large time passed then multiple timer ticks might be needed + while (ticks>=mtime_count) + { + ticks -= mtime_count; + mtime_count = mtime_freq; + mtime_val += 1; + ticked = true; + } + + // any remaining ticks taken of mtime countdown + if (ticks) + mtime_count -= ticks; + + mtime_.put(mtime_val); + + if (ticked) + { + // to improve execution speed only do interrupt checks when the timer ticks + // check if interrupts enabled + uint16_t mstatus_tag = static_cast(architecture_.getSystemRegisterTag(SYSREG_MSTATUS)); + auto mstatus_bits = regFile->get( { RegisterType::SYSTEM, mstatus_tag } ).get(); + if (mstatus_bits & MSTATUS_MIE_MASK) + if (mtime_val >= mtimecmp_.get().get()) + return static_cast(InterruptId::TIMER); + } + + return -1; +} + +} // namespace riscv +} // namespace arch +} // namespace simeng diff --git a/src/lib/kernel/Linux.cc b/src/lib/kernel/Linux.cc index 
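
The Clint model above divides core ticks down to mtime increments and only checks for a pending timer interrupt on the cycles where mtime actually advances. The countdown logic on its own; field names are illustrative, and the real class also consults mstatus.MIE before raising the interrupt.

#include <cstdint>

struct TinyClint {
  uint64_t mtime = 0, mtimecmp = 0;
  uint64_t divider;    // core ticks per mtime increment
  uint64_t countdown;  // ticks remaining until the next increment

  explicit TinyClint(uint64_t ticksPerIncrement)
      : divider(ticksPerIncrement), countdown(ticksPerIncrement) {}

  // Advance by the ticks elapsed since the last call; returns true when a
  // timer interrupt should be raised (mtime has reached mtimecmp).
  bool advance(uint64_t elapsedTicks) {
    bool ticked = false;
    while (elapsedTicks >= countdown) {  // a large gap may span several ticks
      elapsedTicks -= countdown;
      countdown = divider;
      ++mtime;
      ticked = true;
    }
    countdown -= elapsedTicks;
    return ticked && mtime >= mtimecmp;
  }
};
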
02de895080..bc060bbae4 100644 --- a/src/lib/kernel/Linux.cc +++ b/src/lib/kernel/Linux.cc @@ -29,10 +29,12 @@ void Linux::createProcess(const LinuxProcess& process) { .currentBrk = process.getHeapStart(), .initialStackPointer = process.getStackPointer(), .mmapRegion = process.getMmapStart(), - .pageSize = process.getPageSize()}); + .pageSize = process.getPageSize(), + }); processStates_.back().fileDescriptorTable.push_back(STDIN_FILENO); processStates_.back().fileDescriptorTable.push_back(STDOUT_FILENO); processStates_.back().fileDescriptorTable.push_back(STDERR_FILENO); + processStates_.back().process = &process; // Define vector of all currently supported special file paths & files. supportedSpecialFiles_.insert( @@ -649,5 +651,11 @@ int64_t Linux::writev(int64_t fd, const void* iovdata, int iovcnt) { return ::writev(hfd, reinterpret_cast(iovdata), iovcnt); } +/** Lookup symbol value from table in elf file. */ +bool Linux::lookupSymbolValue(const std::string symbol, uint64_t& value) +{ + processStates_[0].process->lookupSymbolValue(symbol,value); +} + } // namespace kernel } // namespace simeng diff --git a/src/lib/kernel/LinuxProcess.cc b/src/lib/kernel/LinuxProcess.cc index 31e36d7f48..3279652a91 100644 --- a/src/lib/kernel/LinuxProcess.cc +++ b/src/lib/kernel/LinuxProcess.cc @@ -24,7 +24,7 @@ LinuxProcess::LinuxProcess(const std::vector& commandLine, // Parse ELF file assert(commandLine.size() > 0); char* unwrappedProcImgPtr; - Elf elf(commandLine[0], &unwrappedProcImgPtr); + Elf elf(commandLine[0], &unwrappedProcImgPtr,symbols_); if (!elf.isValid()) { return; } @@ -178,5 +178,17 @@ void LinuxProcess::createStack(char** processImage) { (*processImage) + stackPointer_); } +bool LinuxProcess::lookupSymbolValue(const std::string symbol, uint64_t& value) const +{ + auto lookup = symbols_.find(symbol); + if (lookup==symbols_.end()) + return false; + else + { + value = lookup->second; + return true; + } +} + } // namespace kernel } // namespace simeng diff --git a/src/lib/models/emulation/Core.cc b/src/lib/models/emulation/Core.cc index 0eff31d5a5..d9268da25f 100644 --- a/src/lib/models/emulation/Core.cc +++ b/src/lib/models/emulation/Core.cc @@ -20,7 +20,8 @@ Core::Core(MemoryInterface& instructionMemory, MemoryInterface& dataMemory, isa_(isa), pc_(entryPoint), registerFileSet_(isa.getRegisterFileStructures()), - architecturalRegisterFileSet_(registerFileSet_) { + architecturalRegisterFileSet_(registerFileSet_), + interruptId_(-1) { // Pre-load the first instruction instructionMemory_.requestRead({pc_, FETCH_SIZE}); @@ -144,11 +145,16 @@ void Core::tick() { } execute(uop); - isa_.updateSystemTimerRegisters(®isterFileSet_, ticks_); + + interruptId_ = isa_.updateSystemTimerRegisters(®isterFileSet_, ticks_); } void Core::execute(std::shared_ptr& uop) { - uop->execute(); + + if (interruptId_>=0) + uop->raiseInterrupt(interruptId_); + else + uop->execute(); if (uop->exceptionEncountered()) { instructionsExecuted_++; diff --git a/src/lib/models/mcu/Core.cc b/src/lib/models/mcu/Core.cc new file mode 100644 index 0000000000..a085d7a3b4 --- /dev/null +++ b/src/lib/models/mcu/Core.cc @@ -0,0 +1,515 @@ +#include "simeng/models/mcu/Core.hh" + +#include +#include +#include +#include + +#include "simeng/arch/riscv/SystemRegister.hh" + +namespace simeng { +namespace models { +namespace mcu { + +// TODO: Replace with config options +const unsigned int blockSize = 16; +const unsigned int clockFrequency = 2.5 * 1e9; + +Core::Core(MemoryInterface& instructionMemory, MemoryInterface& dataMemory, + uint64_t 
processMemorySize, uint64_t entryPoint, + const arch::Architecture& isa, BranchPredictor& branchPredictor, YAML::Node config) + : dataMemory_(dataMemory), + isa_(isa), + registerFileSet_(isa.getRegisterFileStructures()), + architecturalRegisterFileSet_(registerFileSet_), + fetchToDecodeBuffer_(1, {}), + decodeToExecuteBuffer_(1, nullptr, 1), + completionSlots_(2, {1, nullptr}), + regDepMap_(isa.getRegisterFileStructures(), registerFileSet_), + fetchUnit_(fetchToDecodeBuffer_, instructionMemory, processMemorySize, + entryPoint, blockSize, isa, branchPredictor), + decodeUnit_(fetchToDecodeBuffer_, decodeToExecuteBuffer_, + branchPredictor, + [this](auto instruction) { return canIssue(instruction); }), + writebackUnit_(completionSlots_, registerFileSet_, [](auto insnId) {}, + [this](auto instruction) {removeDep(instruction);}, + [this](auto instruction) { return removeInstrOrderQ(instruction); }), + loadStoreQueue_(4, dataMemory, { completionSlots_.data()+1, 1 }, [this](auto regs, auto values) { forwardOperands(regs, values); }, false, 4, 4, 2, 1, 1), + executeUnit_( + decodeToExecuteBuffer_, completionSlots_[0], + [this](auto regs, auto values) { forwardOperands(regs, values); }, + [this](auto instruction) { loadStoreQueue_.addLoad(instruction); }, + [this](auto instruction) { loadStoreQueue_.addStore(instruction); }, + [this](auto instruction) { raiseException(instruction); }, + [this](auto instruction) { addInstrOrderQ(instruction); }, + [this]() { return isInterruptPending(); }, + branchPredictor, false), + interruptId_(-1) { + // Query and apply initial state + auto state = isa.getInitialState(); + applyStateChange(state); + + maxStallCycleTimeout = -1; + maxSimCycleTimeout = -1; + maxInstrTimeout = -1; + if(config["Core"]["EnableHaltCheck"].IsDefined() && config["Core"]["EnableHaltCheck"].as()) { + enableHaltCheck = true; + if(config["Core"]["MaxStallCycleTimeout"].IsDefined()) { + maxStallCycleTimeout = config["Core"]["MaxStallCycleTimeout"].as(); + } + if(config["Core"]["MaxSimCycleTimeout"].IsDefined()) { + maxSimCycleTimeout = config["Core"]["MaxSimCycleTimeout"].as(); + } + if(config["Core"]["MaxInstrTimeout"].IsDefined()) { + maxInstrTimeout = config["Core"]["MaxInstrTimeout"].as(); + } + } +}; + +void Core::checkHalting() { + if(!enableHaltCheck) return; + + if (((ticks_ - lastCommitTick_) > maxStallCycleTimeout)) { + std::cout << std::dec << "[SimEng:Core] Max Pipeline stall cycle timeout reached at tick: " << (ticks_ - lastCommitTick_) << std::endl; + hasHalted_ = true; + } + + if((ticks_ > maxSimCycleTimeout)) { + std::cout << std::dec << "[SimEng:Core] Max Simulation cycle timeout reached at tick: " << ticks_ << std::endl; + hasHalted_ = true; + } + + if((getInstructionsRetiredCount() > maxInstrTimeout)) { + std::cout << std::dec << "[SimEng:Core] Max Instruction count timeout reached at tick: " << ticks_ << std::endl; + hasHalted_ = true; + } +} + +void Core::tick() { + ticks_++; + + checkHalting(); + + if (hasHalted_) return; + + if (exceptionHandler_ != nullptr) { + processExceptionHandler(); + return; + } + + // Writeback must be ticked at start of cycle, to ensure decode reads the + // correct values + // writebackUnit_.tick(); + // for(std::shared_ptr inst: writebackUnit_.getInstsForTrace()) { + // uint16_t sysreg_instrret = isa_.getSystemRegisterTag(arch::riscv32::riscv_sysreg::SYSREG_INSTRRET); + // uint16_t sysreg_cycle = isa_.getSystemRegisterTag(arch::riscv32::riscv_sysreg::SYSREG_CYCLE); + // registerFileSet_.set(Register{0x2, sysreg_instrret}, 
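
The halt checks above act as a watchdog for the MCU model: the run is stopped if commits stall for too long, if the total cycle count exceeds a limit, or if too many instructions retire, with each limit coming from the optional Core config keys read in the constructor. Reduced to its decision rule, under the assumption that the limits are unsigned so the -1 defaults wrap to the maximum value and effectively disable a check.

#include <cstdint>

struct HaltWatchdog {
  uint64_t maxStallCycles, maxSimCycles, maxInstructions;

  bool shouldHalt(uint64_t ticks, uint64_t lastCommitTick,
                  uint64_t retired) const {
    if (ticks - lastCommitTick > maxStallCycles) return true;  // pipeline stalled
    if (ticks > maxSimCycles) return true;                     // ran too long
    if (retired > maxInstructions) return true;                // too many instrs
    return false;
  }
};
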
RegisterValue(static_cast(writebackUnit_.getInstructionsWrittenCount()), 4)); + // registerFileSet_.set(Register{0x2, sysreg_cycle}, RegisterValue(static_cast(ticks_), 4)); + // isa_.updateInstrTrace(inst, ®isterFileSet_, ticks_); + // if(inst->isLoad()) { + // loadStoreQueue_.commitLoad(inst); + // } else if(inst->isStoreData()) { + // loadStoreQueue_.commitStore(inst); + // } + // lastCommitTick_ = ticks_; + // } + // writebackUnit_.traceFinished(); + + + loadStoreQueue_.processResponse(); + completionSlots_[1].tick(); + + // Tick units + fetchUnit_.tick(); + decodeUnit_.tick(); + executeUnit_.tick(); + + // Wipe any data read responses, as they will have been handled by this point + //dataMemory_.clearCompletedReads(); + + loadStoreQueue_.tick(); + // Writeback must be ticked at start of cycle, to ensure decode reads the + // correct values + writebackUnit_.tick(); + for(std::shared_ptr inst: writebackUnit_.getInstsForTrace()) { + uint16_t sysreg_instrret = isa_.getSystemRegisterTag(arch::riscv::riscv_sysreg::SYSREG_INSTRRET); + uint16_t sysreg_cycle = isa_.getSystemRegisterTag(arch::riscv::riscv_sysreg::SYSREG_CYCLE); + registerFileSet_.set(Register{0x2, sysreg_instrret}, RegisterValue(static_cast(writebackUnit_.getInstructionsWrittenCount()), 4)); + registerFileSet_.set(Register{0x2, sysreg_cycle}, RegisterValue(static_cast(ticks_), 4)); + isa_.updateInstrTrace(inst, ®isterFileSet_, ticks_); + if(inst->isLoad()) { + loadStoreQueue_.commitLoad(inst); + } else if(inst->isStoreData()) { + loadStoreQueue_.commitStore(inst); + } + lastCommitTick_ = ticks_; + } + // writebackUnit_.traceFinished(); + // Read pending registers for ready-to-execute uop; must happen after execute + // to allow operand forwarding to take place first + // readRegisters(); + + // Tick buffers + // Each unit must have wiped the entries at the head of the buffer after use, + // as these will now loop around and become the tail. + fetchToDecodeBuffer_.tick(); + decodeToExecuteBuffer_.tick(); + completionSlots_[0].tick(); + // for (auto& buffer : completionSlots_) { + // buffer.tick(); + // } + + // if (exceptionGenerated_) { + // handleException(); + // //fetchUnit_.requestFromPC(); + // return; + // } + + // Check for flush + if (executeUnit_.shouldFlush()) { + // Flush was requested at execute stage + // Update PC and wipe younger buffers (Fetch/Decode, Decode/Execute) + auto targetAddress = executeUnit_.getFlushAddress(); + + fetchUnit_.flushLoopBuffer(); + fetchUnit_.updatePC(targetAddress); + fetchUnit_.flushPredictor(targetAddress); + // Ensure instructions in the buffer if any are set to be flushed before being removed, this helps with removing the respective dependencies if any + decodeUnit_.purgeFlushed(); + executeUnit_.purgeFlushed(); + fetchToDecodeBuffer_.fill({}); + decodeToExecuteBuffer_.fill(nullptr); + loadStoreQueue_.purgeFlushed(); + regDepMap_.purgeFlushed(); + + flushes_++; + } else if (decodeUnit_.shouldFlush()) { + assert(false && "Decode unit should not generate flush"); + // Flush was requested at decode stage + // Update PC and wipe Fetch/Decode buffer. 
+ auto targetAddress = decodeUnit_.getFlushAddress(); + + fetchUnit_.flushLoopBuffer(); + fetchUnit_.updatePC(targetAddress); + fetchToDecodeBuffer_.fill({}); + + flushes_++; + } + + if (exceptionGenerated_) { + handleException(); + //fetchUnit_.requestFromPC(); + return; + } + + fetchUnit_.requestFromPC(); + interruptId_ = isa_.updateSystemTimerRegisters(®isterFileSet_, ticks_); +} + +bool Core::hasHalted() const { + if (hasHalted_) { + return true; + } + + // Core is considered to have halted when the fetch unit has halted, there + // are no uops at the head of any buffer, and no exception is currently being + // handled. + bool decodePending = fetchToDecodeBuffer_.getHeadSlots()[0].size() > 0; + bool executePending = decodeToExecuteBuffer_.getHeadSlots()[0] != nullptr; + bool writebackPending = completionSlots_[0].getHeadSlots()[0] != nullptr; + writebackPending |= completionSlots_[1].getHeadSlots()[0] != nullptr; + + return (fetchUnit_.hasHalted() && !decodePending && !writebackPending && + !executePending && exceptionHandler_ == nullptr); +} + +const ArchitecturalRegisterFileSet& Core::getArchitecturalRegisterFileSet() + const { + return architecturalRegisterFileSet_; +} + +uint64_t Core::getInstructionsRetiredCount() const { + return writebackUnit_.getInstructionsWrittenCount(); +} + +uint64_t Core::getSystemTimer() const { + // TODO: This will need to be changed if we start supporting DVFS. + return ticks_ / (clockFrequency / 1e9); +} + +std::map Core::getStats() const { + auto retired = writebackUnit_.getInstructionsWrittenCount(); + auto ipc = retired / static_cast(ticks_); + std::ostringstream ipcStr; + ipcStr << std::setprecision(2) << ipc; + + // Sum up the branch stats reported across the execution units. + uint64_t totalBranchesExecuted = 0; + uint64_t totalBranchMispredicts = 0; + totalBranchesExecuted += executeUnit_.getBranchExecutedCount(); + totalBranchMispredicts += executeUnit_.getBranchMispredictedCount(); + auto branchMissRate = 100.0f * static_cast(totalBranchMispredicts) / + static_cast(totalBranchesExecuted); + std::ostringstream branchMissRateStr; + branchMissRateStr << std::setprecision(3) << branchMissRate << "%"; + + return {{"cycles", std::to_string(ticks_)}, + {"retired", std::to_string(retired)}, + {"ipc", ipcStr.str()}, + {"flushes", std::to_string(flushes_)}, + {"branch.executed", std::to_string(totalBranchesExecuted)}, + {"branch.mispredict", std::to_string(totalBranchMispredicts)}, + {"branch.missrate", branchMissRateStr.str()}, + {"lsu.ldminlatency", std::to_string(loadStoreQueue_.getMinLdLat())}, + {"lsu.ldmaxlatency", std::to_string(loadStoreQueue_.getMaxLdLat())}, + {"lsu.ldavglatency", std::to_string(loadStoreQueue_.getAvgLdLat())}}; +} + +void Core::raiseException(const std::shared_ptr& instruction) { + exceptionGenerated_ = true; + exceptionGeneratingInstruction_ = instruction; +} + +void Core::handleException() { + exceptionGenerated_ = false; + + exceptionHandler_ = + isa_.handleException(exceptionGeneratingInstruction_, *this, dataMemory_); + + processExceptionHandler(); +// isa_.updateInstrTrace(exceptionGeneratingInstruction_, ®isterFileSet_, ticks_); +// lastCommitTick_ = ticks_; +// assert(removeInstrOrderQ(exceptionGeneratingInstruction_) && "Unexpected instruction at the top of inorder instr queue on exception"); + + //TODO: This is not a good point to flush the pipeline if the exception is not changing the PC. 
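+  // Sketch only (nothing below is executed; `result` mirrors the value used in
+  // processExceptionHandler() further down, and `nextSequentialPC` is a
+  // hypothetical name for the address of the following instruction): a later
+  // refinement could flush the younger stages only when the handler actually
+  // redirects control, e.g.
+  //   const auto& result = exceptionHandler_->getResult();
+  //   if (result.instructionAddress != nextSequentialPC) {
+  //     // purge decode/execute, wipe the pipeline buffers, purge LSQ and regDepMap_
+  //   }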
+ + // Flush pipeline +// decodeUnit_.purgeFlushed(); +// executeUnit_.purgeFlushed(); +// fetchToDecodeBuffer_.fill({}); +// decodeToExecuteBuffer_.fill(nullptr); +// loadStoreQueue_.purgeFlushed(); +// completionSlots_[0].fill(nullptr); +// completionSlots_[1].fill(nullptr); +// regDepMap_.purgeFlushed(); +} + +void Core::processExceptionHandler() { + assert(exceptionHandler_ != nullptr && + "Attempted to process an exception handler that wasn't present"); + if (dataMemory_.hasPendingRequests()) { + // Must wait for all memory requests to complete before processing the + // exception + return; + } + + auto success = exceptionHandler_->tick(); + if (!success) { + // Exception handler requires further ticks to complete + return; + } + + const auto& result = exceptionHandler_->getResult(); + + if (result.fatal) { + hasHalted_ = true; + std::cout << "[SimEng:Core] Halting due to fatal exception" << std::endl; + } else { + //fetchUnit_.flushLoopBuffer(); + fetchUnit_.updatePC(result.instructionAddress); + applyStateChange(result.stateChange); + } + + exceptionHandler_ = nullptr; +} + +void Core::loadData(const std::shared_ptr& instruction) { + const auto& addresses = instruction->getGeneratedAddresses(); + for (const auto& target : addresses) { + dataMemory_.requestRead(target); + } + + // NOTE: This model only supports zero-cycle data memory models, and will not + // work unless data requests are handled synchronously. + for (const auto& response : dataMemory_.getCompletedReads()) { + instruction->supplyData(response.target.address, response.data); + } + + assert(instruction->hasAllData() && + "Load instruction failed to obtain all data this cycle"); + + instruction->execute(); + + if (instruction->isStoreData()) { + storeData(instruction); + } +} + +void Core::storeData(const std::shared_ptr& instruction) { + if (instruction->isStoreAddress()) { + auto addresses = instruction->getGeneratedAddresses(); + for (auto const& target : addresses) { + previousAddresses_.push(target); + } + } + if (instruction->isStoreData()) { + const auto data = instruction->getData(); + for (size_t i = 0; i < data.size(); i++) { + dataMemory_.requestWrite(previousAddresses_.front(), data[i]); + previousAddresses_.pop(); + } + } +} + +void Core::forwardOperands(const span& registers, + const span& values) { + return; + // assert(registers.size() == values.size() && + // "Mismatched register and value vector sizes"); + + // const auto& uop = decodeToExecuteBuffer_.getTailSlots()[0]; + // if (uop == nullptr) { + // return; + // } + + // auto sourceRegisters = uop->getOperandRegisters(); + // for (size_t i = 0; i < registers.size(); i++) { + // // Check each forwarded register vs source operands and supply for each + // // match + // for (size_t operand = 0; operand < sourceRegisters.size(); operand++) { + // const auto& sourceReg = sourceRegisters[operand]; + // if (uop->canExecute()) { + // return; + // } + // if (sourceReg == registers[i] && !uop->isOperandReady(operand)) { + // // Supply the operand + // uop->supplyOperand(operand, values[i]); + // } + // } + // } +} + +bool Core::canIssue(const std::shared_ptr& uop) { + if (uop->isSysCall() && inorderIQ_.size() > 0) { + return false; + } + if((uop->isLoad() || uop->isStoreData()) && loadStoreQueue_.isBusy()) { + return false; + } + if (regDepMap_.canRead(uop) && regDepMap_.canWrite(uop)) { + regDepMap_.insert(uop); + return true; + } + return false; +} + +void Core::removeDep(const std::shared_ptr& uop) { + regDepMap_.remove(uop); +} + +void 
Core::readRegisters() { + if (decodeToExecuteBuffer_.isStalled()) { + return; + } + + const auto& uop = decodeToExecuteBuffer_.getTailSlots()[0]; + if (uop == nullptr) { + return; + } + + // Register read + // Identify missing registers and supply values + const auto& sourceRegisters = uop->getOperandRegisters(); + for (size_t i = 0; i < sourceRegisters.size(); i++) { + const auto& reg = sourceRegisters[i]; + if (!uop->isOperandReady(i)) { + uop->supplyOperand(i, registerFileSet_.get(reg)); + } + } +} + +void Core::applyStateChange(const arch::ProcessStateChange& change) { + // Update registers in accoradance with the ProcessStateChange type + switch (change.type) { + case arch::ChangeType::INCREMENT: { + for (size_t i = 0; i < change.modifiedRegisters.size(); i++) { + registerFileSet_.set( + change.modifiedRegisters[i], + registerFileSet_.get(change.modifiedRegisters[i]).get() + + change.modifiedRegisterValues[i].get()); + } + break; + } + case arch::ChangeType::DECREMENT: { + for (size_t i = 0; i < change.modifiedRegisters.size(); i++) { + registerFileSet_.set( + change.modifiedRegisters[i], + registerFileSet_.get(change.modifiedRegisters[i]).get() - + change.modifiedRegisterValues[i].get()); + } + break; + } + default: { // arch::ChangeType::REPLACEMENT + // If type is ChangeType::REPLACEMENT, set new values + for (size_t i = 0; i < change.modifiedRegisters.size(); i++) { + registerFileSet_.set(change.modifiedRegisters[i], + change.modifiedRegisterValues[i]); + } + break; + } + } + + // Update memory + // TODO: Analyse if ChangeType::INCREMENT or ChangeType::DECREMENT case is + // required for memory changes + for (size_t i = 0; i < change.memoryAddresses.size(); i++) { + dataMemory_.requestWrite(change.memoryAddresses[i], + change.memoryAddressValues[i]); + } +} + +void Core::handleLoad(const std::shared_ptr& instruction) { + loadData(instruction); + if (instruction->exceptionEncountered()) { + raiseException(instruction); + return; + } + + forwardOperands(instruction->getDestinationRegisters(), + instruction->getResults()); + // Manually add the instruction to the writeback input buffer + completionSlots_[1].getTailSlots()[0] = instruction; +} + +void Core::addInstrOrderQ(const std::shared_ptr& insn) { + //std::cout << std::dec << ticks_ << ": Adding instruction at address: 0x" << std::hex << insn->getInstructionAddress() << std::endl; + inorderIQ_.push_back(insn); +} + +bool Core::removeInstrOrderQ(const std::shared_ptr& insn) { + if (insn == inorderIQ_.front()) { + //std::cout << std::dec << ticks_ << ": Removing instruction at address: 0x" << std::hex << insn->getInstructionAddress() << std::endl; + // if(insn->exceptionEncountered()) { + // exceptionGenerated_ = true; + // exceptionGeneratingInstruction_ = insn; + // handleException(); + // } + inorderIQ_.pop_front(); + return true; + } else { + return false; + } +} + +int16_t Core::isInterruptPending() { + if (interruptId_>=0) { + std::cout << std::dec << "[SimEng:Core] Interrupt Pending id: " << interruptId_ << ", at tick: " << ticks_ << std::endl; + return interruptId_; + } else { + return -1; + } +} + +} // namespace mcu +} // namespace models +} // namespace simeng diff --git a/src/lib/pipeline/FetchUnit.cc b/src/lib/pipeline/FetchUnit.cc index ade3d307c0..28d2eaba51 100644 --- a/src/lib/pipeline/FetchUnit.cc +++ b/src/lib/pipeline/FetchUnit.cc @@ -129,7 +129,7 @@ void FetchUnit::tick() { BranchPrediction prediction = {false, 0}; if (macroOp[0]->isBranch()) { prediction = branchPredictor_.predict(pc_, 
macroOp[0]->getBranchType(), - macroOp[0]->getKnownTarget()); + macroOp[0]->getKnownOffset()); macroOp[0]->setBranchPrediction(prediction); } diff --git a/src/lib/pipeline_hi/DecodeUnit.cc b/src/lib/pipeline_hi/DecodeUnit.cc new file mode 100644 index 0000000000..86a298a1a3 --- /dev/null +++ b/src/lib/pipeline_hi/DecodeUnit.cc @@ -0,0 +1,117 @@ +#include "simeng/pipeline_hi/DecodeUnit.hh" + +#include + +namespace simeng { +namespace pipeline_hi { + +DecodeUnit::DecodeUnit(PipelineBuffer& input, + PipelineBuffer>& output, + BranchPredictor& predictor, + std::function&)> canIssue) + : input_(input), output_(output), predictor_(predictor), canIssue_(canIssue){}; + +void DecodeUnit::tick() { + // Stall if output buffer is stalled + if (output_.isStalled()) { + input_.stall(true); + return; + } + + shouldFlush_ = false; + input_.stall(false); + + // Stall if internal uop is overpopulated, otherwise add uops from input to + // internal buffer + if (microOps_.size() >= output_.getWidth()) { + input_.stall(true); + } else { + // Populate uop buffer with newly fetched macro-ops + for (size_t slot = 0; slot < input_.getWidth(); slot++) { + auto& macroOp = input_.getHeadSlots()[slot]; + + if (macroOp.size() == 0) { + // Nothing to process for this macro-op + continue; + } + + for (uint8_t index = 0; index < macroOp.size(); index++) { + microOps_.push_back(std::move(macroOp[index])); + } + + input_.getHeadSlots()[slot].clear(); + } + } + + // Process uops in buffer + for (size_t slot = 0; slot < output_.getWidth(); slot++) { + // If there's no more uops to decode, exit loop early + if (!microOps_.size()) break; + + //Check for dependencies before forwarding to next stage + //Stop-gap implementation + if (!canIssue_(microOps_.front())) break; + + // Move uop to output buffer and remove from internal buffer + auto& uop = (output_.getTailSlots()[slot] = std::move(microOps_.front())); + microOps_.pop_front(); + + // Check preliminary branch prediction results now that the instruction is + // decoded. 
Identifies: + // - Non-branch instructions mistakenly predicted as branches + // - Incorrect targets for immediate branches + // auto [misprediction, correctAddress] = uop->checkEarlyBranchMisprediction(); + // if (misprediction) { + // earlyFlushes_++; + // shouldFlush_ = true; + // pc_ = correctAddress; + + // if (!uop->isBranch()) { + // // Non-branch incorrectly predicted as a branch; let the predictor know + // predictor_.update(uop->getInstructionAddress(), false, pc_, + // uop->getBranchType()); + // } + // // Remove macro-operations in microOps_ buffer after macro-operation + // // decoded in this cycle + // auto uopIt = microOps_.begin(); + // // Find first microOps_ entry not belonging to same address as flushing + // // instruction + // while (uopIt != microOps_.end()) { + // if ((*uopIt)->getInstructionAddress() != uop->getInstructionAddress()) { + // break; + // } else { + // uopIt++; + // } + // } + // // Remove all entries after first macro-operation in buffer + // while (uopIt != microOps_.end()) { + // uopIt = microOps_.erase(uopIt); + // } + + // // Skip processing remaining uops, as they need to be flushed + // break; + // } + } +} + +bool DecodeUnit::shouldFlush() const { return shouldFlush_; } +uint64_t DecodeUnit::getFlushAddress() const { return pc_; } +uint64_t DecodeUnit::getEarlyFlushes() const { return earlyFlushes_; }; + +void DecodeUnit::purgeFlushed() { + if (output_.getTailSlots()[0] != nullptr) { + output_.getTailSlots()[0]->setFlushed(); + } + + if (input_.getHeadSlots()[0].size() != 0) { + input_.getHeadSlots()[0][0]->setFlushed(); + } + + if (microOps_.size()) + microOps_.front()->setFlushed(); + microOps_.clear(); + input_.stall(false); +} + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/lib/pipeline_hi/DispatchIssueUnit.cc b/src/lib/pipeline_hi/DispatchIssueUnit.cc new file mode 100644 index 0000000000..93ce9fa324 --- /dev/null +++ b/src/lib/pipeline_hi/DispatchIssueUnit.cc @@ -0,0 +1,269 @@ +#include "simeng/pipeline_hi/DispatchIssueUnit.hh" + +#include +#include + +namespace simeng { +namespace pipeline_hi { + +DispatchIssueUnit::DispatchIssueUnit( + PipelineBuffer>& fromRename, + std::vector>>& issuePorts, + const RegisterFileSet& registerFileSet, PortAllocator& portAllocator, + const std::vector& physicalRegisterStructure, YAML::Node config) + : input_(fromRename), + issuePorts_(issuePorts), + registerFileSet_(registerFileSet), + scoreboard_(physicalRegisterStructure.size()), + dependencyMatrix_(physicalRegisterStructure.size()), + portAllocator_(portAllocator) { + // Initialise scoreboard + for (size_t type = 0; type < physicalRegisterStructure.size(); type++) { + scoreboard_[type].assign(physicalRegisterStructure[type], true); + dependencyMatrix_[type].resize(physicalRegisterStructure[type]); + } + // Create set of reservation station structs with correct issue port + // mappings + for (size_t i = 0; i < config["Reservation-Stations"].size(); i++) { + // Iterate over each reservation station in config + auto reservation_station = config["Reservation-Stations"][i]; + // Create ReservationStation struct to be stored + ReservationStation rs = { + reservation_station["Size"].as(), + reservation_station["Dispatch-Rate"].as(), + 0, + {}}; + // Resize rs port attribute to match what's defined in config file + rs.ports.resize(reservation_station["Ports"].size()); + for (size_t j = 0; j < reservation_station["Ports"].size(); j++) { + // Iterate over issue ports in config + uint16_t issue_port = reservation_station["Ports"][j].as(); + 
rs.ports[j].issuePort = issue_port; + // Add port mapping entry, resizing vector if needed + if ((issue_port + 1) > portMapping_.size()) { + portMapping_.resize((issue_port + 1)); + } + portMapping_[issue_port] = {i, j}; + } + reservationStations_.push_back(rs); + } + for (uint16_t i = 0; i < reservationStations_.size(); i++) + flushed_.emplace(i, std::initializer_list>{}); +} + +void DispatchIssueUnit::tick() { + input_.stall(false); + + /** Stores the number of instructions dispatched for each + * reservation station. */ + std::vector dispatches( + static_cast(reservationStations_.size()), 0); + + for (size_t slot = 0; slot < input_.getWidth(); slot++) { + auto& uop = input_.getHeadSlots()[slot]; + if (uop == nullptr) { + continue; + } + + const std::vector& supportedPorts = uop->getSupportedPorts(); + if (uop->exceptionEncountered()) { + // Exception; mark as ready to commit, and remove from pipeline + uop->setCommitReady(); + input_.getHeadSlots()[slot] = nullptr; + continue; + } + // Allocate issue port to uop + uint16_t port = portAllocator_.allocate(supportedPorts); + uint16_t RS_Index = portMapping_[port].first; + uint16_t RS_Port = portMapping_[port].second; + assert(RS_Index < reservationStations_.size() && + "Allocated port inaccessible"); + ReservationStation& rs = reservationStations_[RS_Index]; + + // When appropriate, stall uop or input buffer if stall buffer full + if (rs.currentSize == rs.capacity || + dispatches[RS_Index] == rs.dispatchRate) { + // Deallocate port given + portAllocator_.deallocate(port); + input_.stall(true); + rsStalls_++; + return; + } + + // Assume the uop will be ready + bool ready = true; + + // Register read + // Identify remaining missing registers and supply values + auto& sourceRegisters = uop->getOperandRegisters(); + for (uint16_t i = 0; i < sourceRegisters.size(); i++) { + const auto& reg = sourceRegisters[i]; + + if (!uop->isOperandReady(i)) { + // The operand hasn't already been supplied + if (scoreboard_[reg.type][reg.tag]) { + // The scoreboard says it's ready; read and supply the register value + uop->supplyOperand(i, registerFileSet_.get(reg)); + } else { + // This register isn't ready yet. 
Register this uop to the dependency + // matrix for a more efficient lookup later + dependencyMatrix_[reg.type][reg.tag].push_back({uop, port, i}); + ready = false; + } + } + } + + // Set scoreboard for all destination registers as not ready + auto& destinationRegisters = uop->getDestinationRegisters(); + for (const auto& reg : destinationRegisters) { + scoreboard_[reg.type][reg.tag] = false; + } + + // Increment dispatches made and RS occupied entries size + dispatches[RS_Index]++; + rs.currentSize++; + + if (ready) { + rs.ports[RS_Port].ready.push_back(std::move(uop)); + } + + input_.getHeadSlots()[slot] = nullptr; + } +} + +void DispatchIssueUnit::issue() { + int issued = 0; + // Check the ready queues, and issue an instruction from each if the + // corresponding port isn't blocked + for (size_t i = 0; i < issuePorts_.size(); i++) { + ReservationStation& rs = reservationStations_[portMapping_[i].first]; + auto& queue = rs.ports[portMapping_[i].second].ready; + if (issuePorts_[i].isStalled()) { + if (queue.size() > 0) { + portBusyStalls_++; + } + continue; + } + + if (queue.size() > 0) { + auto& uop = queue.front(); + issuePorts_[i].getTailSlots()[0] = std::move(uop); + queue.pop_front(); + + // Inform the port allocator that an instruction issued + portAllocator_.issued(i); + issued++; + + assert(rs.currentSize > 0); + rs.currentSize--; + } + } + + if (issued == 0) { + for (const auto& rs : reservationStations_) { + if (rs.currentSize != 0) { + backendStalls_++; + return; + } + } + frontendStalls_++; + } +} + +void DispatchIssueUnit::forwardOperands(const span& registers, + const span& values) { + assert(registers.size() == values.size() && + "Mismatched register and value vector sizes"); + + for (size_t i = 0; i < registers.size(); i++) { + const auto& reg = registers[i]; + // Flag scoreboard as ready now result is available + scoreboard_[reg.type][reg.tag] = true; + + // Supply the value to all dependent uops + const auto& dependents = dependencyMatrix_[reg.type][reg.tag]; + for (auto& entry : dependents) { + entry.uop->supplyOperand(entry.operandIndex, values[i]); + if (entry.uop->canExecute()) { + // Add the now-ready instruction to the relevant ready queue + auto rsInfo = portMapping_[entry.port]; + reservationStations_[rsInfo.first].ports[rsInfo.second].ready.push_back( + std::move(entry.uop)); + } + } + + // Clear the dependency list + dependencyMatrix_[reg.type][reg.tag].clear(); + } +} + +void DispatchIssueUnit::setRegisterReady(Register reg) { + scoreboard_[reg.type][reg.tag] = true; +} + +void DispatchIssueUnit::purgeFlushed() { + for (size_t i = 0; i < reservationStations_.size(); i++) { + // Search the ready queues for flushed instructions and remove them + auto& rs = reservationStations_[i]; + for (auto& port : rs.ports) { + // Ready queue + auto readyIter = port.ready.begin(); + while (readyIter != port.ready.end()) { + auto& uop = *readyIter; + if (uop->isFlushed()) { + portAllocator_.deallocate(port.issuePort); + readyIter = port.ready.erase(readyIter); + assert(rs.currentSize > 0); + rs.currentSize--; + } else { + readyIter++; + } + } + } + } + + // Collect flushed instructions and remove them from the dependency matrix + for (auto& it : flushed_) it.second.clear(); + for (auto& registerType : dependencyMatrix_) { + for (auto& dependencyList : registerType) { + auto it = dependencyList.begin(); + while (it != dependencyList.end()) { + auto& entry = *it; + if (entry.uop->isFlushed()) { + auto rsIndex = portMapping_[entry.port].first; + if 
(!flushed_[rsIndex].count(entry.uop)) { + flushed_[rsIndex].insert(entry.uop); + portAllocator_.deallocate(entry.port); + } + it = dependencyList.erase(it); + } else { + it++; + } + } + } + } + + // Update reservation station size + for (uint8_t i = 0; i < reservationStations_.size(); i++) { + assert(reservationStations_[i].currentSize >= flushed_[i].size()); + reservationStations_[i].currentSize -= flushed_[i].size(); + } +} + +uint64_t DispatchIssueUnit::getRSStalls() const { return rsStalls_; } +uint64_t DispatchIssueUnit::getFrontendStalls() const { + return frontendStalls_; +} +uint64_t DispatchIssueUnit::getBackendStalls() const { return backendStalls_; } +uint64_t DispatchIssueUnit::getPortBusyStalls() const { + return portBusyStalls_; +} + +void DispatchIssueUnit::getRSSizes(std::vector& sizes) const { + for (auto& rs : reservationStations_) { + sizes.push_back(rs.capacity - rs.currentSize); + } +} + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/lib/pipeline_hi/ExecuteUnit.cc b/src/lib/pipeline_hi/ExecuteUnit.cc new file mode 100644 index 0000000000..e3b5089d5c --- /dev/null +++ b/src/lib/pipeline_hi/ExecuteUnit.cc @@ -0,0 +1,255 @@ +#include "simeng/pipeline_hi/ExecuteUnit.hh" + +#include +#include + +namespace simeng { +namespace pipeline_hi { + +ExecuteUnit::ExecuteUnit( + PipelineBuffer>& input, + PipelineBuffer>& output, + std::function, span)> forwardOperands, + std::function&)> handleLoad, + std::function&)> handleStore, + std::function&)> raiseException, + std::function&)> addInstrOrderQ, + std::function isInterruptPending, + BranchPredictor& predictor, bool pipelined, + const std::vector& blockingGroups) + : input_(input), + output_(output), + forwardOperands_(forwardOperands), + handleLoad_(handleLoad), + handleStore_(handleStore), + raiseException_(raiseException), + addInstrOrderQ_(addInstrOrderQ), + isInterruptPending_(isInterruptPending), + predictor_(predictor), + pipelined_(pipelined), + blockingGroups_(blockingGroups) {} + +void ExecuteUnit::tick() { + tickCounter_++; + shouldFlush_ = false; + + if (stallUntil_ <= tickCounter_) { + input_.stall(false); + // Input isn't stalled; process instruction and add to pipeline + + auto& uop = input_.getHeadSlots()[0]; + if (uop != nullptr) { + if (!uop->isFlushed()) { + // Retrieve execution latency from the instruction + auto latency = uop->getLatency(); + cycles_++; + // Block uop execution if appropriate + if (std::find(blockingGroups_.begin(), blockingGroups_.end(), + uop->getGroup()) != blockingGroups_.end()) { + if (operationsStalled_.size() == 0) { + // Add uop to pipeline + pipeline_.push_back({nullptr, tickCounter_ + latency - 1}); + pipeline_.back().insn = std::move(uop); + operationsStalled_.push_back(pipeline_.back().insn); + } else { + // Stall execution start cycle + operationsStalled_.push_back(nullptr); + operationsStalled_.back() = std::move(uop); + } + } else if (latency == 1 && pipeline_.size() == 0) { + // Pipeline is empty and insn will execute this cycle; bypass + execute(uop); + } else { + // This instruction may take more than a single cycle; check for a + // stall. For unpipelined units, the unit will stall for the full + // instruction duration. + auto stallCycles = + pipelined_ ? 
uop->getStallCycles() : uop->getLatency(); + if (stallCycles > 1) { + stallUntil_ = tickCounter_ + stallCycles - 1; + input_.stall(true); + } + + // Add insn to pipeline + pipeline_.push_back({nullptr, tickCounter_ + latency - 1}); + pipeline_.back().insn = std::move(uop); + } + } + input_.getHeadSlots()[0] = nullptr; + } + } + + if (pipeline_.size() == 0) { + return; + } + + auto& head = pipeline_.front(); + if (head.readyAt <= tickCounter_) { + // Check if the completion of an operation would unblock + // another stalled operation. + if (std::find(blockingGroups_.begin(), blockingGroups_.end(), + head.insn->getGroup()) != blockingGroups_.end()) { + operationsStalled_.pop_front(); + if (operationsStalled_.size() > 0) { + // Add uop to pipeline + auto& uop = operationsStalled_.front(); + pipeline_.push_back({nullptr, tickCounter_ + uop->getLatency() - 1}); + pipeline_.back().insn = std::move(uop); + operationsStalled_.front() = pipeline_.back().insn; + } + } + execute(head.insn); + pipeline_.pop_front(); + } +} + +void ExecuteUnit::execute(std::shared_ptr& uop) { + assert(uop->canExecute() && + "Attempted to execute an instruction before it was ready"); + + int16_t pendingInterruptId = isInterruptPending_(); + if(pendingInterruptId>=0) { + //std::cout << std::hex << "Execution encountered pending interrupt, PC 0x" << uop->getInstructionAddress() << std::endl; + uop->raiseInterrupt(pendingInterruptId); + uop->setFlushed(); + raiseException_(uop); + shouldFlush_ = true; + return; + } + + addInstrOrderQ_(uop); + if (uop->exceptionEncountered()) { + // Exception encountered prior to execution + // TODO: Identify whether this can be removed; executing an + // exception-encountered uop would have to be guaranteed to be safe + raiseException_(uop); + return; + } + + if (uop->isLoad()) { + uop->generateAddresses(); + if (uop->exceptionEncountered()) { + // Exception; don't pass handle load function + raiseException_(uop); + return; + } + handleLoad_(uop); + return; + } else if (uop->isStoreAddress() || uop->isStoreData()) { + if (uop->isStoreAddress()) { + uop->generateAddresses(); + } + if (uop->isStoreData()) { + uop->execute(); + } + handleStore_(uop); + } else { + uop->execute(); + } + + if (uop->exceptionEncountered()) { + // Exception; don't forward results, don't pass uop forward + raiseException_(uop); + shouldFlush_ = true; + //TODO: Let the instruction go into writeback stage + // return; + } + + if (uop->isBranch()) { + pc_ = uop->getBranchAddress(); + + // Update branch predictor with branch results + predictor_.update(uop->getInstructionAddress(), uop->wasBranchTaken(), pc_, + uop->getBranchType()); + + // Update the branch instruction counter + branchesExecuted_++; + + if (uop->wasBranchMispredicted()) { + //std::cout << std::dec << tickCounter_ << std::hex << ": Misprediction iaddr: 0x" << uop->getInstructionAddress() << ", " << uop->getBranchPrediction().taken << std::endl; + // Misprediction; flush the pipeline + shouldFlush_ = true; + flushAfter_ = uop->getInstructionId(); + // Update the branch misprediction counter + branchMispredicts_++; + } + } + + // Operand forwarding; allows a dependent uop to execute next cycle + //if (!uop->isMul() && !uop->isDiv()) { + // forwardOperands_(uop->getDestinationRegisters(), uop->getResults()); + //} + + output_.getTailSlots()[0] = std::move(uop); +} + +bool ExecuteUnit::shouldFlush() const { return shouldFlush_; } +uint64_t ExecuteUnit::getFlushAddress() const { return pc_; } +uint64_t ExecuteUnit::getFlushSeqId() const { return 
flushAfter_; } + +void ExecuteUnit::purgeFlushed() { + auto& uop = input_.getHeadSlots()[0]; + if (uop != nullptr) { + if (!uop->isFlushed()) { + uop->setFlushed(); + } + } + + if (pipeline_.size() == 0) { + return; + } + + // If the newest instruction has been flushed, clear any stalls. + if (pipeline_.back().insn->isFlushed()) { + stallUntil_ = tickCounter_; + } + + // Iterate over the pipeline and remove flushed instructions + auto it = pipeline_.begin(); + while (it != pipeline_.end()) { + auto& entry = *it; + if (entry.insn->isFlushed()) { + it = pipeline_.erase(it); + } else { + it++; + } + } + + // If first blocking in-flight instruction is flushed, ensure another + // non-flushed stalled instruction takes it place in the pipeline if + // available. + bool replace = false; + if (operationsStalled_.size() > 0 && + operationsStalled_.front()->isFlushed()) { + replace = true; + } + auto itStall = operationsStalled_.begin(); + while (itStall != operationsStalled_.end()) { + auto& entry = *itStall; + if (entry->isFlushed()) { + itStall = operationsStalled_.erase(itStall); + } else { + itStall++; + } + } + + if (replace && operationsStalled_.size() > 0) { + // Add uop to pipeline + auto& uop = operationsStalled_.front(); + pipeline_.push_back({nullptr, tickCounter_ + uop->getLatency() - 1}); + pipeline_.back().insn = std::move(uop); + operationsStalled_.front() = pipeline_.back().insn; + } +} + +uint64_t ExecuteUnit::getBranchExecutedCount() const { + return branchesExecuted_; +} +uint64_t ExecuteUnit::getBranchMispredictedCount() const { + return branchMispredicts_; +} + +uint64_t ExecuteUnit::getCycles() const { return cycles_; } + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/lib/pipeline_hi/FetchUnit.cc b/src/lib/pipeline_hi/FetchUnit.cc new file mode 100644 index 0000000000..4de190efca --- /dev/null +++ b/src/lib/pipeline_hi/FetchUnit.cc @@ -0,0 +1,265 @@ +#include "simeng/pipeline_hi/FetchUnit.hh" + +namespace simeng { +namespace pipeline_hi { + +FetchUnit::FetchUnit(PipelineBuffer& output, + MemoryInterface& instructionMemory, + uint64_t programByteLength, uint64_t entryPoint, + uint8_t blockSize, const arch::Architecture& isa, + BranchPredictor& branchPredictor) + : output_(output), + pc_(entryPoint), + instructionMemory_(instructionMemory), + programByteLength_(programByteLength), + isa_(isa), + branchPredictor_(branchPredictor), + blockSize_(blockSize), + blockMask_(~(blockSize_ - 1)) { + assert(blockSize_ >= isa_.getMaxInstructionSize() && + "fetch block size must be larger than the largest instruction"); + fetchBuffer_ = new uint8_t[2 * blockSize_]; + requestFromPC(); +} + +FetchUnit::~FetchUnit() { delete[] fetchBuffer_; } + +void FetchUnit::tick() { + if (output_.isStalled()) { + return; + } + + if (hasHalted_ || waitSCEval_) { + return; + } + + // If loop buffer has been filled, fill buffer to decode + // if (loopBufferState_ == LoopBufferState::SUPPLYING) { + // auto outputSlots = output_.getTailSlots(); + // for (size_t slot = 0; slot < output_.getWidth(); slot++) { + // auto& macroOp = outputSlots[slot]; + // auto bytesRead = isa_.predecode(&(loopBuffer_.front().encoding), + // loopBuffer_.front().instructionSize, + // loopBuffer_.front().address, macroOp); + + // assert(bytesRead != 0 && "predecode failure for loop buffer entry"); + + // // Set prediction to recorded value during loop buffer filling + // if (macroOp[0]->isBranch()) { + // macroOp[0]->setBranchPrediction(loopBuffer_.front().prediction); + // } + + // // Cycle queue by moving front 
entry to back + // loopBuffer_.push_back(loopBuffer_.front()); + // loopBuffer_.pop_front(); + // } + // return; + // } + + // Pointer to the instruction data to decode from + const uint8_t* buffer; + uint8_t bufferOffset; + + // Check if more instruction data is required + if (bufferedBytes_ < isa_.getMaxInstructionSize()) { + // Calculate the address of the next fetch block + uint64_t blockAddress; + if (bufferedBytes_ > 0) { + // There is already some data in the buffer, so check for the next block + bufferOffset = 0; + blockAddress = pc_ + bufferedBytes_; + assert((blockAddress & ~blockMask_) == 0 && "misaligned fetch buffer"); + } else { + // Fetch buffer is empty, so start from the PC + blockAddress = pc_ & blockMask_; + bufferOffset = pc_ - blockAddress; + } + + // Find fetched memory that matches the desired block + const auto& fetched = instructionMemory_.getCompletedReads(); + + size_t fetchIndex; + for (fetchIndex = 0; fetchIndex < fetched.size(); fetchIndex++) { + if (fetched[fetchIndex].target.address == blockAddress) { + break; + } + } + if (fetchIndex == fetched.size()) { + // Need to wait for fetched instructions + return; + } + + // TODO: Handle memory faults + assert(fetched[fetchIndex].data && "Memory read failed"); + const uint8_t* fetchData = fetched[fetchIndex].data.getAsVector(); + + // Copy fetched data to fetch buffer after existing data + std::memcpy(fetchBuffer_ + bufferedBytes_, fetchData + bufferOffset, + blockSize_ - bufferOffset); + + bufferedBytes_ += blockSize_ - bufferOffset; + buffer = fetchBuffer_; + // Decoding should start from the beginning of the fetchBuffer_. + bufferOffset = 0; + } else { + // There is already enough data in the fetch buffer, so use that + buffer = fetchBuffer_; + bufferOffset = 0; + } + + // Check we have enough data to begin decoding + if (bufferedBytes_ == isa_.getMinInstructionSize()) { + //Check if those bytes points to a instruction with minimum size or more data is required. If more data is required return + // TODO: this is not generic solution, just trying to make it work + uint16_t rawBits; + memcpy(&rawBits, buffer + bufferOffset, 2); + if((rawBits & 0x3) == 0x3) { + //std::cout << std::hex << "Only 2 bytes left in fetch buffer and not compresses instr type, current PC: 0x" << pc_ << std::endl; + return; + } + } + + auto outputSlots = output_.getTailSlots(); + for (size_t slot = 0; slot < output_.getWidth(); slot++) { + auto& macroOp = outputSlots[slot]; + + auto bytesRead = + isa_.predecode(buffer + bufferOffset, bufferedBytes_, pc_, macroOp); + + // If predecode fails, bail and wait for more data + if (bytesRead == 0) { + assert(bufferedBytes_ < isa_.getMinInstructionSize() && + "unexpected predecode failure"); + break; + } + + // Create branch prediction after identifing instruction type + // (e.g. RET, BL, etc). + BranchPrediction prediction = {false, 0}; + if (macroOp[0]->isBranch()) { + prediction = branchPredictor_.predict(pc_, macroOp[0]->getBranchType(), + macroOp[0]->getKnownOffset(), + (uint8_t)bytesRead); + macroOp[0]->setBranchPrediction(prediction); + } + + // if (loopBufferState_ == LoopBufferState::FILLING) { + // // Record instruction fetch information in loop body + // uint32_t encoding; + // memcpy(&encoding, buffer + bufferOffset, sizeof(uint32_t)); + // loopBuffer_.push_back( + // {encoding, bytesRead, pc_, macroOp[0]->getBranchPrediction()}); + + // if (pc_ == loopBoundaryAddress_) { + // // loopBoundaryAddress_ has been fetched whilst filling the loop buffer. 
+ // // Stop filling as loop body has been recorded and begin to supply + // // decode unit with instructions from the loop buffer + // loopBufferState_ = LoopBufferState::SUPPLYING; + // bufferedBytes_ = 0; + // break; + // } + // } else if (loopBufferState_ == LoopBufferState::WAITING && + // pc_ == loopBoundaryAddress_) { + // // Once set loopBoundaryAddress_ is fetched, start to fill loop buffer + // loopBufferState_ = LoopBufferState::FILLING; + // } + + assert(bytesRead <= bufferedBytes_ && + "Predecode consumed more bytes than were available"); + // Increment the offset, decrement available bytes + bufferOffset += bytesRead; + bufferedBytes_ -= bytesRead; + + if (!prediction.taken) { + // Predicted as not taken; increment PC to next instruction + pc_ += bytesRead; + } else { + // Predicted as taken; set PC to predicted target address + pc_ = prediction.target; + } +// std::cout << std::hex << "PC: 0x" << pc_ << ", PBL: 0x" << programByteLength_ << std::endl; + if (pc_ == 0 && (macroOp[0]->getBranchType() == BranchType::SubroutineCall)) { + waitSCEval_ = true; + break; + } + + if (pc_ >= programByteLength_) { + hasHalted_ = true; + break; + } + + if (prediction.taken) { + if (slot + 1 < output_.getWidth()) { + branchStalls_++; + } + // Can't continue fetch immediately after a branch + bufferedBytes_ = 0; + break; + } + + // Too few bytes remaining in buffer to continue + if (bufferedBytes_ == 0) { + break; + } + } + + if (bufferedBytes_ > 0) { + // Move start of fetched data to beginning of fetch buffer + std::memmove(fetchBuffer_, buffer + bufferOffset, bufferedBytes_); + } + + instructionMemory_.clearCompletedReads(); +} + +void FetchUnit::registerLoopBoundary(uint64_t branchAddress) { + // Set branch which forms the loop as the loopBoundaryAddress_ and place loop + // buffer in state to begin filling once the loopBoundaryAddress_ has been + // fetched + loopBufferState_ = LoopBufferState::WAITING; + loopBoundaryAddress_ = branchAddress; +} + +bool FetchUnit::hasHalted() const { return hasHalted_; } + +void FetchUnit::updatePC(uint64_t address) { + pc_ = address; + bufferedBytes_ = 0; + hasHalted_ = (pc_ >= programByteLength_); + waitSCEval_ = false; +} + +void FetchUnit::requestFromPC() { + // Do nothing if buffer already contains enough data + if (bufferedBytes_ >= isa_.getMaxInstructionSize()) return; + + // Do nothing if unit has halted to avoid invalid speculative memory reads + // beyond the programByteLength_ + if (hasHalted_ || waitSCEval_) return; + + uint64_t blockAddress; + if (bufferedBytes_ > 0) { + // There's already some data in the buffer, so fetch the next block + blockAddress = pc_ + bufferedBytes_; + assert((blockAddress & ~blockMask_) == 0 && "misaligned fetch buffer"); + } else { + // Fetch buffer is empty, so fetch from the PC + blockAddress = pc_ & blockMask_; + } + + instructionMemory_.requestRead({blockAddress, blockSize_}); +} + +uint64_t FetchUnit::getBranchStalls() const { return branchStalls_; } + +void FetchUnit::flushLoopBuffer() { + // loopBuffer_.clear(); + // loopBufferState_ = LoopBufferState::IDLE; + // loopBoundaryAddress_ = 0; +} + +void FetchUnit::flushPredictor(uint64_t address) { + branchPredictor_.flush(address); +} + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/lib/pipeline_hi/LoadStoreQueue.cc b/src/lib/pipeline_hi/LoadStoreQueue.cc new file mode 100644 index 0000000000..c0b752e8af --- /dev/null +++ b/src/lib/pipeline_hi/LoadStoreQueue.cc @@ -0,0 +1,315 @@ +#include "simeng/pipeline_hi/LoadStoreQueue.hh" + +#include 
+#include +#include +#include + +namespace simeng { +namespace pipeline_hi { + +/** Check whether requests `a` and `b` overlap. */ +bool requestsOverlap(MemoryAccessTarget a, MemoryAccessTarget b) { + // Check whether one region ends before the other begins, implying no overlap, + // and negate + return !(a.address + a.size <= b.address || b.address + b.size <= a.address); +} + +LoadStoreQueue::LoadStoreQueue( + unsigned int maxCombinedSpace, MemoryInterface& memory, + span>> completionSlots, + std::function, span)> forwardOperands, + bool exclusive, uint16_t loadBandwidth, uint16_t storeBandwidth, + uint16_t permittedRequests, uint16_t permittedLoads, + uint16_t permittedStores) + : completionSlots_(completionSlots), + forwardOperands_(forwardOperands), + maxCombinedSpace_(maxCombinedSpace), + combined_(true), + memory_(memory), + exclusive_(exclusive), + loadBandwidth_(loadBandwidth), + storeBandwidth_(storeBandwidth), + totalLimit_(permittedRequests), + // Set per-cycle limits for each request type + reqLimits_{permittedLoads, permittedStores} {}; + +LoadStoreQueue::LoadStoreQueue( + unsigned int maxLoadQueueSpace, unsigned int maxStoreQueueSpace, + MemoryInterface& memory, + span>> completionSlots, + std::function, span)> forwardOperands, + bool exclusive, uint16_t loadBandwidth, uint16_t storeBandwidth, + uint16_t permittedRequests, uint16_t permittedLoads, + uint16_t permittedStores) + : completionSlots_(completionSlots), + forwardOperands_(forwardOperands), + maxLoadQueueSpace_(maxLoadQueueSpace), + maxStoreQueueSpace_(maxStoreQueueSpace), + combined_(false), + memory_(memory), + exclusive_(exclusive), + loadBandwidth_(loadBandwidth), + storeBandwidth_(storeBandwidth), + totalLimit_(permittedRequests), + // Set per-cycle limits for each request type + reqLimits_{permittedLoads, permittedStores} {}; + +unsigned int LoadStoreQueue::getLoadQueueSpace() const { + if (combined_) { + return getCombinedSpace(); + } else { + return getLoadQueueSplitSpace(); + } +} +unsigned int LoadStoreQueue::getStoreQueueSpace() const { + if (combined_) { + return getCombinedSpace(); + } else { + return getStoreQueueSplitSpace(); + } +} +unsigned int LoadStoreQueue::getTotalSpace() const { + if (combined_) { + return getCombinedSpace(); + } else { + return getLoadQueueSplitSpace() + getStoreQueueSplitSpace(); + } +} + +unsigned int LoadStoreQueue::getLoadQueueSplitSpace() const { + return maxLoadQueueSpace_ - loadQueue_.size(); +} +unsigned int LoadStoreQueue::getStoreQueueSplitSpace() const { + return maxStoreQueueSpace_ - storeQueue_.size(); +} +unsigned int LoadStoreQueue::getCombinedSpace() const { + return maxCombinedSpace_ - loadQueue_.size() - storeQueue_.size(); +} + +bool isMisAligned(uint64_t addr, uint8_t sz) { + if(((addr & 0x1) && sz==2) || ((addr & 0x3) && sz==4)) { + return true; + } + return false; +} + +void LoadStoreQueue::addLoad(const std::shared_ptr& insn) { + + const auto& addresses = insn->getGeneratedAddresses(); + + assert(addresses.size()==1 && "Expecting only 1 address in load request"); + // Do something to split into multiple requests if alignment is required for case like crossing 4 byte boundary. 
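+  // Sketch of one possible split for an access that does cross a 4-byte boundary
+  // (illustrative only; the MemoryAccessTarget aggregate initialisation matches its
+  // use elsewhere in this file, everything else is an assumption):
+  //   uint64_t boundary  = (addresses[0].address & ~uint64_t(3)) + 4;
+  //   uint8_t  firstSize = static_cast<uint8_t>(boundary - addresses[0].address);
+  //   MemoryAccessTarget lower{addresses[0].address, firstSize};
+  //   MemoryAccessTarget upper{boundary, static_cast<uint8_t>(addresses[0].size - firstSize)};
+  //   // ...then queue `lower` and `upper` as two request entries instead of one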
+  loadQueue_.push_back(insn);
+  uint64_t add_tick = 1;
+  bool isMisAlign = false;
+  if (isMisAligned(addresses[0].address, addresses[0].size)) {
+    add_tick+=1;
+    isMisAlign=true;
+  }
+  requestQueue_.push_back({{}, {}, insn, LOAD, (tickCounter_+add_tick) + insn->getLSQLatency(), isMisAlign});
+  // Queue the read request; it is issued to the memory interface once its
+  // scheduled tick is reached in tick()
+
+  for (size_t i = 0; i < addresses.size(); i++) {
+    //memory_.requestWrite(addresses[i], data[i]);
+    // Still add addresses to requestQueue_ to ensure contention of resources is
+    // correctly simulated
+    requestQueue_.back().reqAddresses.push(addresses[i]);
+  }
+
+  //loadQueue_.push_back(insn);
+  //startLoad(insn);
+}
+
+void LoadStoreQueue::addStore(const std::shared_ptr<Instruction>& insn) {
+  const auto& addresses = insn->getGeneratedAddresses();
+  span<const RegisterValue> data = insn->getData();
+
+  assert(addresses.size()==1 && "Expecting only 1 address in store request");
+  // Do something to split into multiple requests if alignment is required for
+  // cases like crossing a 4-byte boundary.
+
+  storeQueue_.push_back({insn, data});
+
+  uint64_t add_tick = 1;
+  bool isMisAlign = false;
+  if (isMisAligned(addresses[0].address, addresses[0].size)) {
+    add_tick+=1;
+    isMisAlign = true;
+  }
+
+  requestQueue_.push_back({{}, {}, insn, STORE, (tickCounter_+add_tick) + insn->getLSQLatency(), isMisAlign});
+  // Submit request write to memory interface early as the architectural state
+  // considers the store to be retired and thus its operation complete
+
+  for (size_t i = 0; i < addresses.size(); i++) {
+    //memory_.requestWrite(addresses[i], data[i]);
+    // Still add addresses to requestQueue_ to ensure contention of resources is
+    // correctly simulated
+    requestQueue_.back().reqAddresses.push(addresses[i]);
+    requestQueue_.back().data.push(data[i]);
+  }
+  //storeQueue_.push_back({insn, {}});
+  //supplyStoreData(insn);
+  //commitStore(insn);
+}
+
+void LoadStoreQueue::startLoad(const std::shared_ptr<Instruction>& insn) {
+  return;
+}
+
+void LoadStoreQueue::supplyStoreData(const std::shared_ptr<Instruction>& insn) {
+  return;
+}
+
+bool LoadStoreQueue::commitStore(const std::shared_ptr<Instruction>& uop) {
+  if (storeQueue_.front().first == uop) {
+    storeQueue_.pop_front();
+  } else {
+    assert(false && "The committed store is not the one at the front of the storeQueue_");
+  }
+  return true;
+}
+
+void LoadStoreQueue::commitLoad(const std::shared_ptr<Instruction>& uop) {
+  if (loadQueue_.front() == uop) {
+    loadQueue_.pop_front();
+  } else {
+    assert(false && "The committed load is not the one at the front of the loadQueue_");
+  }
+  return;
+}
+
+void LoadStoreQueue::purgeFlushed() {
+  return;
+}
+
+bool LoadStoreQueue::isBusy() const {
+  // TODO: This is just to allow only 1 outstanding request to be used for SST integration.
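+  // Worked example of the intended limit (drawn from the condition below): with
+  // one in-flight load and one in-flight store the queue reports busy
+  // (1 + 1 >= 2); with a single in-flight load it does not, unless a misaligned
+  // request is about to issue (see activeMisAlignedOpr()).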
+  //if (activeMisAlignedOpr() || loadQueue_.size()>=1 || storeQueue_.size()>=1) {
+  if (activeMisAlignedOpr() || (loadQueue_.size()+storeQueue_.size())>=2) {
+    return true;
+  }
+  return false;
+}
+
+void LoadStoreQueue::tick() {
+  tickCounter_++;
+
+  // The request at the front of the queue should be sent to memory first.
+  // Ensure it is only issued once its scheduled tick has been reached.
+  if (requestQueue_.size() > 0) {
+    requestEntry1& oldestreq = requestQueue_.front();
+    if (tickCounter_ >= oldestreq.reqtick) {
+      if(oldestreq.type == LOAD) {
+        memory_.requestRead(oldestreq.reqAddresses.front(), (uint64_t) busReqId);
+        oldestreq.reqAddresses.pop();
+        if (oldestreq.reqAddresses.size() == 0) { // All requests sent
+          requestQueue_.pop_front();
+        }
+        requestedLoads_.emplace(busReqId, oldestreq.insn);
+        numLoads++;
+        latencyLoads_.emplace(busReqId, tickCounter_);
+        busReqId++;
+      } else if(oldestreq.type == STORE) {
+        memory_.requestWrite(oldestreq.reqAddresses.front(), oldestreq.data.front());
+        oldestreq.reqAddresses.pop();
+        oldestreq.data.pop();
+        if (oldestreq.reqAddresses.size() == 0) { // All requests sent
+          requestQueue_.pop_front();
+          // Verify same instruction and remove from the storeQueue_ as well
+          //storeQueue_.pop_front();//No need
+        }
+      } else {
+        assert(false && "Unknown request type to be scheduled to memory");
+      }
+    }
+  }
+
+  //processResponse();
+}
+
+void LoadStoreQueue::processResponse() {
+  // Process completed read requests
+  for (const auto& response : memory_.getCompletedReads()) {
+    const auto& address = response.target.address;
+    const auto& data = response.data;
+
+    // TODO: Detect and handle non-fatal faults (e.g. page fault)
+
+    // Find instruction that requested the memory read
+    const auto& itr = requestedLoads_.find(response.requestId);
+    if (itr == requestedLoads_.end()) {
+      continue;
+    } else {
+      uint32_t ldLatency = ((tickCounter_ + 1) - latencyLoads_.at(response.requestId));
+      if (ldLatency > maxLdLatency) {
+        maxLdLatency = ldLatency;
+      }
+      if (ldLatency < minLdLatency) {
+        minLdLatency = ldLatency;
+      }
+      totalLdLatency += ldLatency;
+      //std::cout << std::dec << "Total Ld latency: " << totalLdLatency << ", numLoads: " << numLoads << std::endl;
+      latencyLoads_.erase(response.requestId);
+    }
+    // Supply data to the instruction and execute if it is ready.
+    // Copy the instruction out before erasing its entry, as the erase would
+    // otherwise invalidate the iterator being dereferenced here.
+    const auto load = itr->second;
+    requestedLoads_.erase(response.requestId);
+    load->supplyData(address, data);
+    if (load->hasAllData()) {
+      // This load has completed
+      load->execute();
+      /*if (load->isStoreData()) {
+        supplyStoreData(load);
+      }*/
+      completedLoads_.push(load);
+    }
+  }
+  memory_.clearCompletedReads();
+
+  // Pop from the front of the completed loads queue and send to writeback
+  size_t count = 0;
+  while (completedLoads_.size() > 0 && count < completionSlots_.size()) {
+    const auto& insn = completedLoads_.front();
+
+    // Don't process load instruction if it has been flushed
+    if (insn->isFlushed()) {
+      completedLoads_.pop();
+      continue;
+    }
+
+    // Forward the results
+    // forwardOperands_(insn->getDestinationRegisters(), insn->getResults());
+
+    completionSlots_[count].getTailSlots()[0] = std::move(insn);
+
+    completedLoads_.pop();
+
+    count++;
+  }
+}
+
+std::shared_ptr<Instruction> LoadStoreQueue::getViolatingLoad() const {
+  return violatingLoad_;
+}
+
+// Clean up is required!
+bool LoadStoreQueue::activeMisAlignedOpr() const {
+  // If the front of the request queue has a misaligned request that has not yet
+  // been sent to the bus then it is better to halt the LSU from taking new requests.
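+  // Worked example (illustrative only, assuming a zero LSQ latency): a misaligned
+  // request queued at tick 9 gets reqtick = 9 + 2 = 11, so at tickCounter_ == 10
+  // the expression below is true and the LSU stalls for the single cycle before
+  // the split request is issued.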
+ // if(storeQueue_.size() > 0 && activeMisAlignedStore) { + // return true; + // } + return (requestQueue_.size() > 0 && requestQueue_.front().isMisAligned && ((requestQueue_.front().reqtick-tickCounter_)==1)); +} + +bool LoadStoreQueue::isCombined() const { return combined_; } + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/lib/pipeline_hi/RegDepMap.cc b/src/lib/pipeline_hi/RegDepMap.cc new file mode 100644 index 0000000000..4ab004bfdb --- /dev/null +++ b/src/lib/pipeline_hi/RegDepMap.cc @@ -0,0 +1,143 @@ +#include "simeng/pipeline_hi/RegDepMap.hh" + +#include + +//#define RDMDEBUG +#ifdef RDMDEBUG +#define DEBUG(x) std::cout << "Core: " << std::hex << x << std::endl; +#else +#define DEBUG(x) do { } while (false); +#endif + +namespace simeng { +namespace pipeline_hi { + +const Register l_ZERO_REGISTER = {0, 0}; + +RegDepMap::RegDepMap(const std::vector registerFileStructures, + const RegisterFileSet& registerFileSet) : + registerFileStructures_(registerFileStructures), + registerFileSet_(registerFileSet) { + regMap_.resize(registerFileStructures_.size());//Just for Integer Register File for now + for (size_t type=0; typegetDestinationRegisters(); + for(const auto& reg: destinationRegisters) { + if(reg != l_ZERO_REGISTER) { //Not X0 + outstandingDep_++; + DEBUG("Adding Depencency: addr, 0x" << instr->getInstructionAddress() << std::dec << ", dest: " << reg << ", outstanding: " << outstandingDep_); + regMap_[reg.type][reg.tag].push_back(instr); + } + } +} + +void RegDepMap::remove(InstrPtr instr) +{ + auto& destinationRegisters = instr->getDestinationRegisters(); + for(const auto& reg: destinationRegisters) { + auto it = regMap_[reg.type][reg.tag].begin(); + while (it != regMap_[reg.type][reg.tag].end()) { + if(*it == instr) { + outstandingDep_--; + DEBUG("Removing Depencency: addr, 0x" << instr->getInstructionAddress() << std::dec << ", dest: " << reg << ", outstanding: " << outstandingDep_); + it = regMap_[reg.type][reg.tag].erase(it); + break; + } else { + it++; + } + } + } +} + +bool RegDepMap::canRead(InstrPtr instr) +{ + bool dependency = false; + auto& sourceRegisters = instr->getOperandRegisters(); + for (uint16_t i = 0; i < sourceRegisters.size(); i++) { + const auto& srcReg = sourceRegisters[i]; + + if (!instr->isOperandReady(i)) { + // The operand hasn't already been supplied + if (regMap_[srcReg.type][srcReg.tag].size() == 0) {//pick up value from register file + instr->supplyOperand(i, registerFileSet_.get(srcReg)); + } else if (regMap_[srcReg.type][srcReg.tag].back()->hasExecuted() && + !(regMap_[srcReg.type][srcReg.tag].back()->isMul() || regMap_[srcReg.type][srcReg.tag].back()->isDiv() || + (regMap_[srcReg.type][srcReg.tag].back()->isLoad() && !instr->isStoreData()))) {//pick up value from last executed instruction + const auto& destRegisters = regMap_[srcReg.type][srcReg.tag].back()->getDestinationRegisters(); + const auto& destValues = regMap_[srcReg.type][srcReg.tag].back()->getResults(); + for (size_t j = 0; j < destRegisters.size(); j++) { + const auto& destReg = destRegisters[j]; + if (destReg == srcReg) { + instr->supplyOperand(i, destValues[j]); + break; + } + } + } else { + dependency = true; + } + } + } + + return !dependency; +} + +bool RegDepMap::canWrite(InstrPtr instr) +{ + bool dependency = false; + auto& destRegisters = instr->getDestinationRegisters(); + for(uint16_t i = 0; i < destRegisters.size(); i++) { + const auto& destReg = destRegisters[i]; + if (regMap_[destReg.type][destReg.tag].size() > 0 && + 
!regMap_[destReg.type][destReg.tag].back()->hasExecuted()) { + dependency = true; + break; + } + } + return !dependency || (instr->isLoad()); +} + +//Clean up the options logic to ensure all of them work well together +bool RegDepMap::canForward(InstrPtr instr) +{ + return true; +} + +void RegDepMap::purgeFlushed() { + for (auto& registerType : regMap_) { + for (auto& dependencyList : registerType) { + auto it = dependencyList.begin(); + while (it != dependencyList.end()) { + DEBUG("Purge entry present at addr: 0x" << (*it)->getInstructionAddress()); + if ((*it)->isFlushed()) { + outstandingDep_--; + it = dependencyList.erase(it); + } else { + it++; + } + } + } + } +} + +void RegDepMap::dump() +{ +} + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/lib/pipeline_hi/RegisterAliasTable.cc b/src/lib/pipeline_hi/RegisterAliasTable.cc new file mode 100644 index 0000000000..0c813a6f7c --- /dev/null +++ b/src/lib/pipeline_hi/RegisterAliasTable.cc @@ -0,0 +1,110 @@ +#include "simeng/pipeline_hi/RegisterAliasTable.hh" + +#include + +namespace simeng { +namespace pipeline_hi { + +RegisterAliasTable::RegisterAliasTable( + std::vector architecturalStructure, + std::vector physicalRegisterCounts) + : mappingTable_(architecturalStructure.size()), + historyTable_(architecturalStructure.size()), + destinationTable_(architecturalStructure.size()), + freeQueues_(architecturalStructure.size()) { + assert(architecturalStructure.size() == physicalRegisterCounts.size() && + "The number of physical register types does not match the number of " + "architectural register types"); + + for (size_t type = 0; type < architecturalStructure.size(); type++) { + auto archCount = architecturalStructure[type].quantity; + auto physCount = physicalRegisterCounts[type]; + assert(archCount <= physCount && + "Cannot have fewer physical registers than architectural registers"); + + // Set up the initial mapping table state for this register type + mappingTable_[type].resize(archCount); + + for (size_t tag = 0; tag < archCount; tag++) { + // Pre-assign a physical register to each architectural register + mappingTable_[type][tag] = tag; + } + + // Add remaining physical registers to free queue + for (size_t tag = archCount; tag < physCount; tag++) { + freeQueues_[type].push(tag); + } + + // Set up history/destination tables + historyTable_[type].resize(physCount); + destinationTable_[type].resize(physCount); + } +}; + +Register RegisterAliasTable::getMapping(Register architectural) const { + // Asserts to ensure mapping isn't attempted for an out-of-bound index (i.e. + // mapping of WZR / XZR) + assert(architectural.type < mappingTable_.size() && + "Invalid register type. Cannot find RAT mapping."); + assert(architectural.type >= 0 && + "Invalid register type. 
Cannot find RAT mapping."); + + auto tag = mappingTable_[architectural.type][architectural.tag]; + return {architectural.type, tag}; +} + +bool RegisterAliasTable::canAllocate(uint8_t type, + unsigned int quantity) const { + return (freeQueues_[type].size() >= quantity); +} + +bool RegisterAliasTable::canRename(uint8_t type) const { + // Renaming possible iff there are more physical than architectural registers + return destinationTable_[type].size() > mappingTable_[type].size(); +} + +unsigned int RegisterAliasTable::freeRegistersAvailable(uint8_t type) const { + return freeQueues_[type].size(); +} + +Register RegisterAliasTable::allocate(Register architectural) { + std::queue& freeQueue = freeQueues_[architectural.type]; + assert(freeQueue.size() > 0 && + "Attempted to allocate free register when none were available"); + + auto tag = freeQueue.front(); + freeQueue.pop(); + + // Keep the old physical register in the history table + historyTable_[architectural.type][tag] = + mappingTable_[architectural.type][architectural.tag]; + + // Update the mapping table with the new tag, and mark the architectural + // register it replaces in the destination table + mappingTable_[architectural.type][architectural.tag] = tag; + destinationTable_[architectural.type][tag] = architectural.tag; + + return {architectural.type, tag}; +} + +void RegisterAliasTable::commit(Register physical) { + // Find the register previously mapped to the same architectural register and + // free it + auto oldTag = historyTable_[physical.type][physical.tag]; + freeQueues_[physical.type].push(oldTag); +} +void RegisterAliasTable::rewind(Register physical) { + // Find which architectural tag this referred to + auto destinationTag = destinationTable_[physical.type][physical.tag]; + // Rewind the mapping table to the old physical tag + mappingTable_[physical.type][destinationTag] = + historyTable_[physical.type][physical.tag]; + // Add the rewound physical tag back to the free queue + freeQueues_[physical.type].push(physical.tag); +} +void RegisterAliasTable::free(Register physical) { + freeQueues_[physical.type].push(physical.tag); +} + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/lib/pipeline_hi/ReorderBuffer.cc b/src/lib/pipeline_hi/ReorderBuffer.cc new file mode 100644 index 0000000000..c653ffd5ea --- /dev/null +++ b/src/lib/pipeline_hi/ReorderBuffer.cc @@ -0,0 +1,206 @@ +#include "simeng/pipeline_hi/ReorderBuffer.hh" + +#include +#include +#include + +namespace simeng { +namespace pipeline_hi { + +ReorderBuffer::ReorderBuffer( + unsigned int maxSize, RegisterAliasTable& rat, LoadStoreQueue& lsq, + std::function&)> raiseException, + std::function sendLoopBoundary, + BranchPredictor& predictor, uint16_t loopBufSize, + uint16_t loopDetectionThreshold) + : rat_(rat), + lsq_(lsq), + maxSize_(maxSize), + raiseException_(raiseException), + sendLoopBoundary_(sendLoopBoundary), + predictor_(predictor), + loopBufSize_(loopBufSize), + loopDetectionThreshold_(loopDetectionThreshold) {} + +void ReorderBuffer::reserve(const std::shared_ptr& insn) { + assert(buffer_.size() < maxSize_ && + "Attempted to reserve entry in reorder buffer when already full"); + insn->setSequenceId(seqId_); + seqId_++; + insn->setInstructionId(insnId_); + if (insn->isLastMicroOp()) insnId_++; + + buffer_.push_back(insn); +} + +void ReorderBuffer::commitMicroOps(uint64_t insnId) { + if (buffer_.size()) { + size_t index = 0; + int firstOp = -1; + bool validForCommit = false; + + // Find first instance of uop belonging to macro-op 
instruction + for (; index < buffer_.size(); index++) { + if (buffer_[index]->getInstructionId() == insnId) { + firstOp = index; + break; + } + } + + if (firstOp > -1) { + // If found, see if all uops are committable + for (; index < buffer_.size(); index++) { + if (buffer_[index]->getInstructionId() != insnId) break; + if (!buffer_[index]->isWaitingCommit()) { + return; + } else if (buffer_[index]->isLastMicroOp()) { + // all microOps must be in ROB for the commit to be valid + validForCommit = true; + } + } + if (!validForCommit) return; + + // No early return thus all uops are committable + for (; firstOp < buffer_.size(); firstOp++) { + if (buffer_[firstOp]->getInstructionId() != insnId) break; + buffer_[firstOp]->setCommitReady(); + } + } + } + return; +} + +unsigned int ReorderBuffer::commit(unsigned int maxCommitSize) { + shouldFlush_ = false; + size_t maxCommits = + std::min(static_cast(maxCommitSize), buffer_.size()); + + unsigned int n; + for (n = 0; n < maxCommits; n++) { + auto& uop = buffer_[0]; + if (!uop->canCommit()) { + break; + } + + if (uop->isLastMicroOp()) instructionsCommitted_++; + + if (uop->exceptionEncountered()) { + raiseException_(uop); + buffer_.pop_front(); + return n + 1; + } + + const auto& destinations = uop->getDestinationRegisters(); + for (int i = 0; i < destinations.size(); i++) { + rat_.commit(destinations[i]); + } + + // If it's a memory op, commit the entry at the head of the respective queue + if (uop->isLoad()) { + lsq_.commitLoad(uop); + } + if (uop->isStoreAddress()) { + bool violationFound = lsq_.commitStore(uop); + if (violationFound) { + loadViolations_++; + // Memory order violation found; aborting commits and flushing + auto load = lsq_.getViolatingLoad(); + shouldFlush_ = true; + flushAfter_ = load->getInstructionId() - 1; + pc_ = load->getInstructionAddress(); + + buffer_.pop_front(); + return n + 1; + } + } + + // Increment or swap out branch counter for loop detection + if (uop->isBranch() && !loopDetected_) { + bool increment = true; + if (branchCounter_.first.address != uop->getInstructionAddress()) { + // Mismatch on instruction address, reset + increment = false; + } else if (branchCounter_.first.outcome != uop->getBranchPrediction()) { + // Mismatch on branch outcome, reset + increment = false; + } else if ((instructionsCommitted_ - branchCounter_.first.commitNumber) > + loopBufSize_) { + // Loop too big to fit in loop buffer, reset + increment = false; + } + + if (increment) { + // Reset commitNumber value + branchCounter_.first.commitNumber = instructionsCommitted_; + // Increment counter + branchCounter_.second++; + + if (branchCounter_.second > loopDetectionThreshold_) { + // If the same branch with the same outcome is sequentially retired + // more times than the loopDetectionThreshold_ value, identify as a + // loop boundary + loopDetected_ = true; + sendLoopBoundary_(uop->getInstructionAddress()); + } + } else { + // Swap out latest branch + branchCounter_ = {{uop->getInstructionAddress(), + uop->getBranchPrediction(), instructionsCommitted_}, + 0}; + } + } + buffer_.pop_front(); + } + + return n; +} + +void ReorderBuffer::flush(uint64_t afterSeqId) { + // Iterate backwards from the tail of the queue to find and remove ops newer + // than `afterSeqId` + while (!buffer_.empty()) { + auto& uop = buffer_.back(); + if (uop->getInstructionId() <= afterSeqId) { + break; + } + + // To rewind destination registers in correct history order, rewinding of + // register renaming is done backwards + auto destinations = 
uop->getDestinationRegisters(); + for (int i = destinations.size() - 1; i >= 0; i--) { + const auto& reg = destinations[i]; + rat_.rewind(reg); + } + uop->setFlushed(); + // If the instruction is a branch, supply address to branch flushing logic + if (uop->isBranch()) { + predictor_.flush(uop->getInstructionAddress()); + } + buffer_.pop_back(); + } + + // Reset branch counter and loop detection + branchCounter_ = {{0, {false, 0}, 0}, 0}; + loopDetected_ = false; +} + +unsigned int ReorderBuffer::size() const { return buffer_.size(); } + +unsigned int ReorderBuffer::getFreeSpace() const { + return maxSize_ - buffer_.size(); +} + +bool ReorderBuffer::shouldFlush() const { return shouldFlush_; } +uint64_t ReorderBuffer::getFlushAddress() const { return pc_; } +uint64_t ReorderBuffer::getFlushSeqId() const { return flushAfter_; } + +uint64_t ReorderBuffer::getInstructionsCommittedCount() const { + return instructionsCommitted_; +} + +uint64_t ReorderBuffer::getViolatingLoadsCount() const { + return loadViolations_; +} + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/lib/pipeline_hi/StaticPredictor.cc b/src/lib/pipeline_hi/StaticPredictor.cc new file mode 100644 index 0000000000..6619989942 --- /dev/null +++ b/src/lib/pipeline_hi/StaticPredictor.cc @@ -0,0 +1,120 @@ +#include "simeng/pipeline_hi/StaticPredictor.hh" + +#include + +namespace simeng { +namespace pipeline_hi { +//TODO: temp for get rid of yaml, delete it later +StaticPredictor::StaticPredictor(uint8_t sType) + : staticType_(sType) {} + +StaticPredictor::StaticPredictor(YAML::Node config) + : staticType_(config["Branch-Predictor"]["Static-Type"].as()), + rasSize_(config["Branch-Predictor"]["RAS-entries"].as()){} + +StaticPredictor::~StaticPredictor() { + ras_.clear(); + rasHistory_.clear(); +} + +BranchPrediction StaticPredictor::predict(uint64_t address, BranchType type, + uint64_t knownOffset, + uint8_t byteLength) { + int64_t offset = knownOffset; + uint64_t predict_target = (knownOffset) ? 
knownOffset + address : 0; + BranchPrediction prediction = {false, 0}; + + assert(byteLength > 1 && "byteLength <= 1"); + + if (type == BranchType::Unconditional) { + prediction = { true, predict_target}; + } else if (type == BranchType::Return) { + if (ras_.size() > 0) { + predict_target = ras_.back(); + // Record top of RAS used for target prediction + rasHistory_[address] = ras_.back(); + ras_.pop_back(); + } + prediction = {true, predict_target}; + } else if (type == BranchType::SubroutineCall) { //JAL and JALR + if (ras_.size() >= rasSize_) { + ras_.pop_front(); + } + ras_.push_back(address + byteLength); + // Record that this address is a branch-and-link instruction + rasHistory_[address] = 0; + prediction = {true, predict_target}; + } else if (type == BranchType::Conditional) { + switch (staticType_) { + case 0: //always-taken + prediction = {true, predict_target}; + break; + + case 1: //always-not-taken; + prediction = {false, 0}; + break; + + case 2: //Backward Taken, Forward Not Taken + { + if (offset >= 0) { + //not taken + prediction = {false, address+byteLength}; + } else { + prediction = {true, predict_target}; + } + break; + } + + case 3: //Forward Taken, Backward Not Taken + { + if (offset <= 0) { + //not taken + prediction = {false, address+byteLength}; + } else { + prediction = {true, predict_target}; + } + break; + } + + default: + assert(staticType_ < 4 && "Non-supported type for static predictor"); + break; + } + } + + return prediction; +} + +void StaticPredictor::update(uint64_t address, bool taken, + uint64_t targetAddress, BranchType type) {} + +void StaticPredictor::flush(uint64_t address) { + // If address interacted with RAS, rewind entry + auto it = rasHistory_.find(address); + if (it != rasHistory_.end()) { + uint64_t target = it->second; + if (target != 0) { + // If history entry belongs to a return instruction, push target back onto + // stack + if (ras_.size() >= rasSize_) { + ras_.pop_front(); + } + ras_.push_back(target); + } else { + // If history entry belongs to a branch-and-link instruction, pop target + // off of stack + if (ras_.size()) { + ras_.pop_back(); + } + } + rasHistory_.erase(it); + } +} +BranchPrediction StaticPredictor::predict(uint64_t address, BranchType type, + uint64_t knownTarget) { + printf("StaticPredictor::predict(), This is overloaded and deprecated! 
\n"); + return predict(address, type, knownTarget, 4); +} + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/lib/pipeline_hi/WritebackUnit.cc b/src/lib/pipeline_hi/WritebackUnit.cc new file mode 100644 index 0000000000..b0dfd97161 --- /dev/null +++ b/src/lib/pipeline_hi/WritebackUnit.cc @@ -0,0 +1,74 @@ +#include "simeng/pipeline_hi/WritebackUnit.hh" + +#include + +namespace simeng { +namespace pipeline_hi { + +WritebackUnit::WritebackUnit( + std::vector>>& completionSlots, + RegisterFileSet& registerFileSet, + std::function flagMicroOpCommits, + std::function&)> removeDep, + std::function&)> removeInstrOrderQ) + : completionSlots_(completionSlots), + registerFileSet_(registerFileSet), + flagMicroOpCommits_(flagMicroOpCommits), + removeDep_(removeDep), + removeInstrOrderQ_(removeInstrOrderQ) {} + +void WritebackUnit::tick() { + for (size_t slot = 0; slot < completionSlots_.size(); slot++) { + auto& uop = completionSlots_[slot].getHeadSlots()[0]; + + if (uop == nullptr) { + continue; + } + + auto& results = uop->getResults(); + auto& destinations = uop->getDestinationRegisters(); + for (size_t i = 0; i < results.size(); i++) { + // Write results to register file + registerFileSet_.set(destinations[i], results[i]); + } + if (uop->isMicroOp()) { + uop->setWaitingCommit(); + flagMicroOpCommits_(uop->getInstructionId()); + if (uop->isLastMicroOp()) { + instructionsWritten_++; + committedInstsForTrace_.push_back(uop); + } + } else { + uop->setCommitReady(); + removeDep_(uop); + instructionsWritten_++; + committedInstsForTrace_.push_back(uop); + } + + completionSlots_[slot].getHeadSlots()[0] = nullptr; + } +} + +uint64_t WritebackUnit::getInstructionsWrittenCount() const { + return instructionsWritten_; +} + +std::vector> WritebackUnit::getInstsForTrace() { + std::shared_ptr instr; + std::deque>::iterator it = committedInstsForTrace_.begin(); + while(it != committedInstsForTrace_.end()) { + instr = *it; + if (removeInstrOrderQ_(instr)) { + committedInstsForTrace_.erase(it); + return {instr}; + } + it++; + } + return {}; //committedInstsForTrace_; +} +void WritebackUnit::traceFinished() { + //committedInstsForTrace_.clear(); +} + +} // namespace pipeline_hi +} // namespace simeng diff --git a/src/tools/simeng/main.cc b/src/tools/simeng/main.cc index fa9b58baa1..f5cfa53513 100644 --- a/src/tools/simeng/main.cc +++ b/src/tools/simeng/main.cc @@ -10,7 +10,7 @@ #include "simeng/version.hh" /** Tick the provided core model until it halts. 
*/ -int simulate(simeng::Core& core, simeng::MemoryInterface& dataMemory, +uint64_t simulate(simeng::Core& core, simeng::MemoryInterface& dataMemory, simeng::MemoryInterface& instructionMemory) { uint64_t iterations = 0; @@ -91,7 +91,7 @@ int main(int argc, char** argv) { // Run simulation std::cout << "[SimEng] Starting...\n" << std::endl; - int iterations = 0; + uint64_t iterations = 0; auto startTime = std::chrono::high_resolution_clock::now(); iterations = simulate(*core, *dataMemory, *instructionMemory); diff --git a/sst/SimEngCoreWrapper.cc b/sst/SimEngCoreWrapper.cc index 45c1bdde70..668439f52e 100644 --- a/sst/SimEngCoreWrapper.cc +++ b/sst/SimEngCoreWrapper.cc @@ -10,9 +10,85 @@ #include "Assemble.hh" +#include + using namespace SST::SSTSimEng; using namespace SST::Interfaces; +//For now just make sure that the code and data is loaded into memory +// at the correct addresses instead of sending the entire process image +void SimEngCoreWrapper::processMemoryImage() { + std::ifstream file(executablePath_, std::ios::binary); + if (!file.is_open()) { + return; + } + + char elfMagic[4] = {0x7f, 'E', 'L', 'F'}; + char fileMagic[4]; + file.read(fileMagic, 4); + if (std::memcmp(elfMagic, fileMagic, sizeof(elfMagic))) { + return; + } + + /** + * The fifth byte of the ELF Header identifies the architecture + * of the ELF binary i.e 32-bit or 64-bit. + */ + + // Check whether this is a 32-bit executable + char bitFormat; + file.read(&bitFormat, sizeof(bitFormat)); + if (bitFormat != ElfBitFormat::Format32) { + return; + } + struct Elf32Header { + uint32_t type; + uint32_t offset; + uint32_t virtualAddress; + uint32_t physicalAddress; + uint32_t fileSize; + uint32_t memorySize; + }; + uint32_t entryPoint32_; + std::vector headers32_; + + file.seekg(0x18); + file.read(reinterpret_cast(&entryPoint32_), sizeof(entryPoint32_)); + uint32_t headerOffset; + file.read(reinterpret_cast(&headerOffset), sizeof(headerOffset)); + file.seekg(0x2a); + uint16_t headerEntrySize; + file.read(reinterpret_cast(&headerEntrySize), sizeof(headerEntrySize)); + uint16_t headerEntries; + file.read(reinterpret_cast(&headerEntries), sizeof(headerEntries)); + headers32_.resize(headerEntries); + // Loop over all headers and extract them. 
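+  // For reference, each ELF32 program header entry is laid out as follows
+  // (offsets from the start of the entry):
+  //   0x00 p_type, 0x04 p_offset, 0x08 p_vaddr, 0x0c p_paddr,
+  //   0x10 p_filesz, 0x14 p_memsz, 0x18 p_flags, 0x1c p_align.
+  // Only the first six fields are read below; p_flags and p_align are skipped
+  // because each iteration seeks to headerOffset + (i * headerEntrySize).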
+ for (size_t i = 0; i < headerEntries; i++) { + file.seekg(headerOffset + (i * headerEntrySize)); + auto& header = headers32_[i]; + + const int fieldBytes = 4; + file.read(reinterpret_cast(&(header.type)), sizeof(header.type)); + file.read(reinterpret_cast(&(header.offset)), fieldBytes); + file.read(reinterpret_cast(&(header.virtualAddress)), fieldBytes); + file.read(reinterpret_cast(&(header.physicalAddress)), fieldBytes); + file.read(reinterpret_cast(&(header.fileSize)), fieldBytes); + file.read(reinterpret_cast(&(header.memorySize)), fieldBytes); + } + // Process headers; only observe LOAD sections for this basic implementation + for (const auto& header : headers32_) { + if (header.type == 1) { // LOAD + char* imagePointer; + imagePointer = (char*)calloc(header.memorySize, sizeof(char)); + file.seekg(header.offset); + file.read(imagePointer, header.fileSize); + dataMemory_->sendProcessImageToSST(imagePointer, header.memorySize, header.virtualAddress); + } + } + std::cout << "[SSTSimEng:SimEngCoreWrapper] Done exporting elf data into SST memory" << std::endl; + //assert(false && "Incomplete implementation"); +} + SimEngCoreWrapper::SimEngCoreWrapper(SST::ComponentId_t id, SST::Params& params) : SST::Component(id) { output_.init("[SSTSimEng:SimEngCoreWrapper] " + getName() + ":@p:@l ", 999, 0, @@ -95,7 +171,7 @@ void SimEngCoreWrapper::finish() { std::cout << "[SimEng] " << key << ": " << value << "\n"; } - std::cout << "\n[SimEng] Finished " << iterations_ << " ticks in " << duration + std::cout << "\n[SimEng] Finished " << std::dec << iterations_ << " ticks in " << duration << "ms (" << std::round(khz) << " kHz, " << std::setprecision(2) << mips << " MIPS)" << std::endl; } @@ -284,13 +360,13 @@ void SimEngCoreWrapper::fabricateSimEngCore() { : std::make_unique( a64fxConfigPath_, executablePath_, executableArgs_); } - if (coreInstance_->getSimulationMode() != + /*if (coreInstance_->getSimulationMode() != simeng::SimulationMode::OutOfOrder) { output_.verbose(CALL_INFO, 1, 0, "SimEng currently only supports Out-of-Order " "archetypes with SST."); std::exit(EXIT_FAILURE); - } + }*/ // Set the SST data memory SimEng should use coreInstance_->setL1DataMemory(dataMemory_); @@ -303,7 +379,7 @@ void SimEngCoreWrapper::fabricateSimEngCore() { // This check ensures that SST has enough memory to store the entire // processImage constructed by SimEng. - if (maxAddrMemory_ < coreInstance_->getProcessImageSize()) { + /*if (maxAddrMemory_ < coreInstance_->getProcessImageSize()) { output_.verbose( CALL_INFO, 1, 0, "Error: SST backend memory is less than processImage size. " @@ -312,7 +388,7 @@ void SimEngCoreWrapper::fabricateSimEngCore() { "\'addr_range_end\'. \n"); primaryComponentOKToEndSim(); std::exit(EXIT_FAILURE); - } + }*/ // If testing is enabled populate heap if heap values have been specified. 
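+  // (SIMENG_ENABLE_SST_TESTS is a compile-time option; heapStr_ appears to be
+  // supplied via the component's "heap" parameter and is parsed into 64-bit
+  // values by splitHeapStr() further down.)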
#ifdef SIMENG_ENABLE_SST_TESTS if (heapStr_ != "") { @@ -320,8 +396,10 @@ void SimEngCoreWrapper::fabricateSimEngCore() { } #endif // Send the process image data over to the SST memory - dataMemory_->sendProcessImageToSST(coreInstance_->getProcessImage().get(), - coreInstance_->getProcessImageSize()); + //dataMemory_->sendProcessImageToSST(coreInstance_->getProcessImage().get(), + // coreInstance_->getProcessImageSize()); + + processMemoryImage(); output_.verbose(CALL_INFO, 1, 0, "SimEng core setup successfully.\n"); // Print out build metadata @@ -356,4 +434,4 @@ std::vector SimEngCoreWrapper::splitHeapStr() { } out.push_back(static_cast(std::stoull(acc))); return out; -} \ No newline at end of file +} diff --git a/sst/SimEngMemInterface.cc b/sst/SimEngMemInterface.cc index 4e07801f21..678d985329 100644 --- a/sst/SimEngMemInterface.cc +++ b/sst/SimEngMemInterface.cc @@ -18,7 +18,7 @@ SimEngMemInterface::SimEngMemInterface(StandardMem* mem, uint64_t cl, this->debug_ = debug; }; -void SimEngMemInterface::sendProcessImageToSST(char* image, uint64_t size) { +void SimEngMemInterface::sendProcessImageToSST(char* image, uint64_t size, uint64_t startAddr) { std::vector data; data.reserve(size); @@ -26,7 +26,8 @@ void SimEngMemInterface::sendProcessImageToSST(char* image, uint64_t size) { data.push_back((uint8_t)image[i]); } - StandardMem::Request* req = new StandardMem::Write(0, data.size(), data); + StandardMem::Request* req = new StandardMem::Write(startAddr, data.size(), data); + std::cout << std::hex << "[SSTSimEng:SimEngMemInterface] Sending image section to SST Memory at address 0x" << startAddr << ", size 0x" << data.size() << std::endl; sstMem_->sendUntimedData(req); return; }; @@ -176,7 +177,7 @@ void SimEngMemInterface::requestRead(const MemoryAccessTarget& target, if (debug_) { std::cout << "[SSTSimEng:SSTDebug] MemRead" << "-read-request-" << requestId << "-cycle-" << tickCounter_ - << "-split-" << requests.size() << std::endl; + << "-split-" << requests.size() << "-addr-0x" << std::hex << addrStart << std::endl; } for (StandardMem::Request* req : requests) { sstMem_->send(req); @@ -192,7 +193,11 @@ void SimEngMemInterface::requestWrite(const MemoryAccessTarget& target, AggregateWriteRequest* aggrReq = new AggregateWriteRequest(target, data); std::vector requests = makeSSTRequests(aggrReq, addrStart, addrEnd, size); - + if (debug_) { + std::cout << "[SSTSimEng:SSTDebug] MemWrite" + << "-write-request-xx" << "-cycle-" << tickCounter_ + << "-split-" << requests.size() << "-addr-0x" << std::hex << addrStart << std::endl; + } for (StandardMem::Request* req : requests) { sstMem_->send(req); } diff --git a/sst/config/mcu_int_example_config.py b/sst/config/mcu_int_example_config.py new file mode 100644 index 0000000000..fdd3b9682b --- /dev/null +++ b/sst/config/mcu_int_example_config.py @@ -0,0 +1,74 @@ +import sst +import os + +DEBUG_L1 = 1 +DEBUG_MEM = 1 +DEBUG_LEVEL = 1 + +clw = "32" + +# Assume this is run from SimEng root dir +simeng_path = os.getcwd() +binary_file = simeng_path + "/share/dhrystone_rv32imc/memory.elf" # Apply the appropriate binary +config_file = simeng_path + "/configs/DEMO_RISCV32_mcu_sst.yaml" + +# Define the simulation components +cpu = sst.Component("core", "sstsimeng.simengcore") +cpu.addParams({ + "simeng_config_path": config_file, + "executable_path": binary_file, + "executable_args": "", + "clock" : "1GHz", + "max_addr_memory": 4*1024*1024*1024-1, + "cache_line_width": clw, + "source": "", + "assemble_with_source": False, + "heap": "", + "debug": False +}) + 
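+# Usage sketch (assumes SST is installed with the sstsimeng.simengcore element
+# registered; the binary and YAML paths above are placeholders to adjust):
+#   cd <SimEng root> && sst sst/config/mcu_int_example_config.py
+# Note: max_addr_memory above is chosen to match addr_range_end and the 4GiB
+# mem_size on the memory controller defined below.
+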
+iface = cpu.setSubComponent("memory", "memHierarchy.standardInterface") + +l1cache = sst.Component("l1cache.mesi", "memHierarchy.Cache") +l1cache.addParams({ + "access_latency_cycles" : "1", + "cache_frequency" : "1Ghz", + "replacement_policy" : "nmru", + "coherence_protocol" : "MESI", + "associativity" : "4", + "cache_line_size" : clw, + "debug" : DEBUG_L1, + "debug_level" : DEBUG_LEVEL, + "L1" : "1", + "cache_size" : "32KiB" +}) + +# Explicitly set the link subcomponents instead of having cache figure them out based on connected port names +l1toC = l1cache.setSubComponent("cpulink", "memHierarchy.MemLink") +l1toM = l1cache.setSubComponent("memlink", "memHierarchy.MemLink") + +# Memory controller +memctrl = sst.Component("memory", "memHierarchy.MemController") +memctrl.addParams({ + "clock" : "1GHz", + "request_width" : clw, + "debug" : DEBUG_MEM, + "debug_level" : DEBUG_LEVEL, + "addr_range_end" : 4*1024*1024*1024-1, +}) +Mtol1 = memctrl.setSubComponent("cpulink", "memHierarchy.MemLink") + +# Memory model +memory = memctrl.setSubComponent("backend", "memHierarchy.simpleMem") +memory.addParams({ + "access_time" : "10ns", + "mem_size" : "4GiB", + "request_width": clw +}) + +# Define the simulation links +link_cpu_cache_link = sst.Link("link_cpu_cache_link") +link_cpu_cache_link.connect( (iface, "port", "0ps"), (l1toC, "port", "0ps") ) +link_mem_bus_link = sst.Link("link_mem_bus_link") +link_mem_bus_link.connect( (l1toM, "port", "0ps"), (Mtol1, "port", "0ps") ) + diff --git a/sst/include/SimEngCoreWrapper.hh b/sst/include/SimEngCoreWrapper.hh index cb53c0f50a..fc841949de 100644 --- a/sst/include/SimEngCoreWrapper.hh +++ b/sst/include/SimEngCoreWrapper.hh @@ -141,6 +141,8 @@ class SimEngCoreWrapper : public SST::Component { /** Method used to assemble SimEng core. */ void fabricateSimEngCore(); + void processMemoryImage(); + /** Method to split the passed executable argument's string into a vector of * individual arguments. */ std::vector splitArgs(std::string argString); @@ -210,7 +212,7 @@ class SimEngCoreWrapper : public SST::Component { std::shared_ptr dataMemory_; /** Number of clock iterations. */ - int iterations_; + uint64_t iterations_; /** Start time of simulation. */ std::chrono::high_resolution_clock::time_point startTime_; diff --git a/sst/include/SimEngMemInterface.hh b/sst/include/SimEngMemInterface.hh index 79789a9f39..463d0dc9d5 100644 --- a/sst/include/SimEngMemInterface.hh +++ b/sst/include/SimEngMemInterface.hh @@ -33,7 +33,7 @@ class SimEngMemInterface : public MemoryInterface { bool debug); /** Send SimEng's processImage to SST memory backend during `init` lifecycle * phase of SST. 
*/ - void sendProcessImageToSST(char* image, uint64_t size); + void sendProcessImageToSST(char* image, uint64_t size, uint64_t startAddr=0); /** * Construct an AggregatedReadRequest and use it to generate From 70fdf36f6a1048fb4681892a1f1334d4c8f579c5 Mon Sep 17 00:00:00 2001 From: dANW34V3R Date: Wed, 6 Sep 2023 11:51:20 +0100 Subject: [PATCH 5/5] Clang format --- src/include/simeng/BranchPredictor.hh | 4 +- src/include/simeng/Elf.hh | 117 +++--- src/include/simeng/Instruction.hh | 8 +- src/include/simeng/RegisterValue.hh | 19 +- src/include/simeng/arch/Architecture.hh | 16 +- .../simeng/arch/aarch64/Architecture.hh | 2 +- src/include/simeng/arch/riscv/Architecture.hh | 26 +- .../simeng/arch/riscv/ExceptionHandler.hh | 2 +- src/include/simeng/arch/riscv/Instruction.hh | 17 +- .../simeng/arch/riscv/SystemRegister.hh | 348 +++++++++--------- src/include/simeng/kernel/LinuxProcess.hh | 2 +- src/include/simeng/models/emulation/Core.hh | 5 +- .../simeng/pipeline/PipelineBuffer1.hh | 19 +- .../simeng/pipeline_hi/LoadStoreQueue.hh | 12 +- .../simeng/pipeline_hi/StaticPredictor.hh | 3 +- src/lib/CoreInstance.cc | 10 +- src/lib/Elf.cc | 47 +-- src/lib/ModelConfig.cc | 10 +- src/lib/arch/aarch64/Architecture.cc | 10 +- src/lib/arch/riscv/Architecture.cc | 130 ++++--- src/lib/arch/riscv/InstructionMetadata.cc | 33 +- src/lib/arch/riscv/InstructionMetadata.hh | 6 +- src/lib/kernel/Linux.cc | 22 +- src/lib/models/emulation/Core.cc | 23 +- src/lib/pipeline_hi/FetchUnit.cc | 19 +- src/lib/pipeline_hi/RegDepMap.cc | 100 ++--- src/lib/pipeline_hi/WritebackUnit.cc | 9 +- sst/SimEngMemInterface.cc | 19 +- sst/include/SimEngMemInterface.hh | 3 +- 29 files changed, 538 insertions(+), 503 deletions(-) diff --git a/src/include/simeng/BranchPredictor.hh b/src/include/simeng/BranchPredictor.hh index 8d76f08753..dd34a067ae 100644 --- a/src/include/simeng/BranchPredictor.hh +++ b/src/include/simeng/BranchPredictor.hh @@ -48,8 +48,8 @@ class BranchPredictor { /** Overload predict() with more information in parameters */ virtual BranchPrediction predict(uint64_t address, BranchType type, - uint64_t knownTarget, uint8_t instByteLength) - = 0; + uint64_t knownTarget, + uint8_t instByteLength) = 0; /** Generate a branch prediction for the specified instruction address with a * branch type and possible known target. 
*/ diff --git a/src/include/simeng/Elf.hh b/src/include/simeng/Elf.hh index 485debea60..88e101c88b 100644 --- a/src/include/simeng/Elf.hh +++ b/src/include/simeng/Elf.hh @@ -1,8 +1,8 @@ #pragma once #include -#include #include +#include #include "simeng/span.hh" @@ -33,83 +33,76 @@ struct Elf32Header { typedef struct { unsigned char e_ident[16]; - uint16_t e_type; - uint16_t e_machine; - uint32_t e_version; - uint32_t e_entry; - uint32_t e_phoff; - uint32_t e_shoff; - uint32_t e_flags; - uint16_t e_ehsize; - uint16_t e_phentsize; - uint16_t e_phnum; - uint16_t e_shentsize; - uint16_t e_shnum; - uint16_t e_shstrndx; + uint16_t e_type; + uint16_t e_machine; + uint32_t e_version; + uint32_t e_entry; + uint32_t e_phoff; + uint32_t e_shoff; + uint32_t e_flags; + uint16_t e_ehsize; + uint16_t e_phentsize; + uint16_t e_phnum; + uint16_t e_shentsize; + uint16_t e_shnum; + uint16_t e_shstrndx; } Elf32_Ehdr; typedef struct { - uint32_t p_type; - uint32_t p_offset; - uint32_t p_vaddr; - uint32_t p_paddr; - uint32_t p_filesz; - uint32_t p_memsz; - uint32_t p_flags; - uint32_t p_align; + uint32_t p_type; + uint32_t p_offset; + uint32_t p_vaddr; + uint32_t p_paddr; + uint32_t p_filesz; + uint32_t p_memsz; + uint32_t p_flags; + uint32_t p_align; } Elf32_Phdr; typedef struct { - uint32_t sh_name; - uint32_t sh_type; - uint32_t sh_flags; - uint32_t sh_addr; - uint32_t sh_offset; - uint32_t sh_size; - uint32_t sh_link; - uint32_t sh_info; - uint32_t sh_addralign; - uint32_t sh_entsize; + uint32_t sh_name; + uint32_t sh_type; + uint32_t sh_flags; + uint32_t sh_addr; + uint32_t sh_offset; + uint32_t sh_size; + uint32_t sh_link; + uint32_t sh_info; + uint32_t sh_addralign; + uint32_t sh_entsize; } Elf32_Shdr; typedef struct { - uint32_t st_name; - uint32_t st_value; - uint32_t st_size; - unsigned char st_info; - unsigned char st_other; - uint16_t st_shndx; + uint32_t st_name; + uint32_t st_value; + uint32_t st_size; + unsigned char st_info; + unsigned char st_other; + uint16_t st_shndx; } Elf32_Sym; -enum ElfPhType { - PT_NULL, - PT_LOAD -}; +enum ElfPhType { PT_NULL, PT_LOAD }; -enum ElfShType { - SHT_NULL, - SHT_PROGBITS, - SHT_SYMTAB, - SHT_STRTAB -}; +enum ElfShType { SHT_NULL, SHT_PROGBITS, SHT_SYMTAB, SHT_STRTAB }; /** A processed Executable and Linkable Format (ELF) file. 
*/ class Elf { - public: - Elf(std::string path, char** imagePointer, std::unordered_map& symbols); - ~Elf(); - uint64_t getProcessImageSize() const; - bool isValid() const; - uint64_t getEntryPoint() const; + public: + Elf(std::string path, char** imagePointer, + std::unordered_map& symbols); + ~Elf(); + uint64_t getProcessImageSize() const; + bool isValid() const; + uint64_t getEntryPoint() const; - private: - uint64_t entryPoint_; - std::vector headers_; - uint32_t entryPoint32_; - std::vector headers32_; - bool isValid_ = false; - uint64_t processImageSize_; - bool mode32bit_; + private: + uint64_t entryPoint_; + std::vector headers_; + uint32_t entryPoint32_; + std::vector headers32_; + bool isValid_ = false; + uint64_t processImageSize_; + bool mode32bit_; }; } // namespace simeng diff --git a/src/include/simeng/Instruction.hh b/src/include/simeng/Instruction.hh index 9ffc4a8d27..8c681b1076 100644 --- a/src/include/simeng/Instruction.hh +++ b/src/include/simeng/Instruction.hh @@ -24,7 +24,7 @@ class Instruction { bool exceptionEncountered() const; /** Binds an interrupt to this instruction */ - virtual void raiseInterrupt(int16_t& interruptId) {} + virtual void raiseInterrupt(int16_t& interruptId) {} /** Retrieve the source registers this instruction reads. */ virtual const span getOperandRegisters() const = 0; @@ -102,7 +102,8 @@ class Instruction { /** Retrieve branch type. */ virtual BranchType getBranchType() const = 0; - /** Retrieve an offset of branch target from the instruction's metadata if known. */ + /** Retrieve an offset of branch target from the instruction's metadata if + * known. */ virtual uint64_t getKnownOffset() const = 0; /** Is this a store address operation (a subcategory of store operations which @@ -217,7 +218,8 @@ class Instruction { /** What type of branch this instruction is. */ BranchType branchType_ = BranchType::Unknown; - /** If the offset of branch target is known at the time of decode, store it. */ + /** If the offset of branch target is known at the time of decode, store it. + */ uint64_t knownOffset_ = 0; // Flushing diff --git a/src/include/simeng/RegisterValue.hh b/src/include/simeng/RegisterValue.hh index d85471eda3..ebee3fa730 100644 --- a/src/include/simeng/RegisterValue.hh +++ b/src/include/simeng/RegisterValue.hh @@ -26,12 +26,13 @@ class RegisterValue { * number of bytes (defaulting to the size of the template type). */ template , T>* = nullptr> - RegisterValue(T value, uint16_t bytes = sizeof(T), bool relaxFor32 = true) : bytes(bytes) { + RegisterValue(T value, uint16_t bytes = sizeof(T), bool relaxFor32 = true) + : bytes(bytes) { relaxedFor32bit_ = relaxFor32; std::memset(this->value, 0, MAX_LOCAL_BYTES); if (isLocal()) { T* view = reinterpret_cast(this->value); - if (sizeof(T) > bytes) { // e.g. when T is int64 and bytes is 4 + if (sizeof(T) > bytes) { // e.g. when T is int64 and bytes is 4 std::memcpy(this->value, &value, bytes); } else { view[0] = value; @@ -98,13 +99,15 @@ class RegisterValue { const T* getAsVector() const { static_assert(alignof(T) <= 8 && "Alignment over 8 bytes not guaranteed"); assert(bytes > 0 && "Attempted to access an uninitialised RegisterValue"); - assert((sizeof(T) <= bytes || (bytes == 4 && sizeof(T) == 8)) && "Attempted" + assert((sizeof(T) <= bytes || (bytes == 4 && sizeof(T) == 8)) && + "Attempted" " to access a RegisterValue as a datatype larger than the " - "data held" ); - if(!relaxedFor32bit_) { // maybe #ifdef if it makes slower? 
- assert(sizeof(T) <= bytes && - "Attempted to access a RegisterValue as a datatype larger than the " - "data held"); + "data held"); + if (!relaxedFor32bit_) { // maybe #ifdef if it makes slower? + assert( + sizeof(T) <= bytes && + "Attempted to access a RegisterValue as a datatype larger than the " + "data held"); } if (isLocal()) { return reinterpret_cast(value); diff --git a/src/include/simeng/arch/Architecture.hh b/src/include/simeng/arch/Architecture.hh index 29874c6d69..0b31130c3b 100644 --- a/src/include/simeng/arch/Architecture.hh +++ b/src/include/simeng/arch/Architecture.hh @@ -15,10 +15,7 @@ using MacroOp = std::vector>; namespace arch { /** Modes. Assume only has 32-bit and 64-bit. */ -enum arch_mode { - ARCH_32BIT=1, - ARCH_64BIT=0 -}; +enum arch_mode { ARCH_32BIT = 1, ARCH_64BIT = 0 }; /** The types of changes that can be made to values within the process state. */ enum class ChangeType { REPLACEMENT, INCREMENT, DECREMENT }; @@ -116,17 +113,16 @@ class Architecture { YAML::Node config) const = 0; /** Updates System registers of any system-based timers. */ - virtual int16_t updateSystemTimerRegisters(RegisterFileSet* regFile, - const uint64_t iterations) const = 0; + virtual int16_t updateSystemTimerRegisters( + RegisterFileSet* regFile, const uint64_t iterations) const = 0; /** Update trace file */ virtual void updateInstrTrace(const std::shared_ptr& instruction, - RegisterFileSet* regFile, uint64_t tick) const = 0; + RegisterFileSet* regFile, + uint64_t tick) const = 0; /** Return the mode (32-bit or 64-bit) */ - arch_mode is32BitMode() const { - return is32Bit_; - } + arch_mode is32BitMode() const { return is32Bit_; } protected: /** Mode, either 32-bit or 64-bit */ diff --git a/src/include/simeng/arch/aarch64/Architecture.hh b/src/include/simeng/arch/aarch64/Architecture.hh index 3c1ce27f59..3a0b8457cd 100644 --- a/src/include/simeng/arch/aarch64/Architecture.hh +++ b/src/include/simeng/arch/aarch64/Architecture.hh @@ -63,7 +63,7 @@ class Architecture : public arch::Architecture { /** Updates System registers of any system-based timers. */ int16_t updateSystemTimerRegisters(RegisterFileSet* regFile, - const uint64_t iterations) const override; + const uint64_t iterations) const override; /** Returns the physical register structure as defined within the config file */ diff --git a/src/include/simeng/arch/riscv/Architecture.hh b/src/include/simeng/arch/riscv/Architecture.hh index 3bdb6287e9..2833113fe5 100644 --- a/src/include/simeng/arch/riscv/Architecture.hh +++ b/src/include/simeng/arch/riscv/Architecture.hh @@ -1,19 +1,18 @@ #pragma once #include -#include #include #include +#include #include "simeng/arch/Architecture.hh" - #include "simeng/arch/riscv/Instruction.hh" #include "simeng/kernel/Linux.hh" using csh = size_t; -#include "simeng/arch/riscv/SystemRegister.hh" #include "simeng/arch/riscv/ExceptionHandler.hh" +#include "simeng/arch/riscv/SystemRegister.hh" namespace simeng { namespace arch { @@ -30,13 +29,14 @@ struct constantsPool { struct archConstants { uint8_t alignMask; uint8_t bytesLimit; /* Minimum bytes the decoder needs to process */ - uint8_t regWidth; /* Register width in bytes */ + uint8_t regWidth; /* Register width in bytes */ }; /* A basic RISC-V implementation of the `Architecture` interface. 
*/ class Architecture : public arch::Architecture { public: - Architecture(kernel::Linux& kernel, YAML::Node config, std::shared_ptr& dataMemory); + Architecture(kernel::Linux& kernel, YAML::Node config, + std::shared_ptr& dataMemory); ~Architecture(); /** Pre-decode instruction memory into a macro-op of `Instruction` * instances. Returns the number of bytes consumed to produce it (always 4), @@ -74,9 +74,10 @@ class Architecture : public arch::Architecture { /** Returns the minimum size of a valid instruction in bytes. */ uint8_t getMinInstructionSize() const override; - /** Updates System registers of any system-based timers. Return +ve id if interrupt occurs */ + /** Updates System registers of any system-based timers. Return +ve id if + * interrupt occurs */ int16_t updateSystemTimerRegisters(RegisterFileSet* regFile, - const uint64_t iterations) const override; + const uint64_t iterations) const override; /** Returns the physical register structure as defined within the config file */ @@ -115,15 +116,18 @@ class Architecture : public arch::Architecture { std::unordered_map systemRegisterMap_; /** Ordered map of memory mapped system regsiters banks **/ - std::map memoryMappedSystemRegisterBlocks; + std::map + memoryMappedSystemRegisterBlocks; - /* Memory Interface through which memory mapped system registers are accessed */ + /* Memory Interface through which memory mapped system registers are accessed + */ std::shared_ptr systemRegisterMemoryInterface; /* Optional Clint block which replicates that functionality in spike */ std::shared_ptr clint; - /* Optional Host Target Interface block which replicates that functionality in spike */ + /* Optional Host Target Interface block which replicates that functionality in + * spike */ std::shared_ptr htif; /** A map to hold the relationship between aarch64 instruction groups and @@ -141,7 +145,7 @@ class Architecture : public arch::Architecture { kernel::Linux& linux_; /** A pointer to the trace file */ - std::ofstream *traceFile_; + std::ofstream* traceFile_; /** Switch for updateInstrTrace() */ bool traceOn_ = false; diff --git a/src/include/simeng/arch/riscv/ExceptionHandler.hh b/src/include/simeng/arch/riscv/ExceptionHandler.hh index 36cfd5d187..501f52bc30 100644 --- a/src/include/simeng/arch/riscv/ExceptionHandler.hh +++ b/src/include/simeng/arch/riscv/ExceptionHandler.hh @@ -57,7 +57,7 @@ class ExceptionHandler : public simeng::arch::ExceptionHandler { */ bool readBufferThen(uint64_t ptr, uint64_t length, std::function then, bool firstCall = true); - + /** generate system register changes associated with taking an exception **/ void takeException(uint64_t causecode); diff --git a/src/include/simeng/arch/riscv/Instruction.hh b/src/include/simeng/arch/riscv/Instruction.hh index 60966ce044..55b72692d1 100644 --- a/src/include/simeng/arch/riscv/Instruction.hh +++ b/src/include/simeng/arch/riscv/Instruction.hh @@ -89,12 +89,11 @@ class Instruction : public simeng::Instruction { virtual InstructionException getException() const; /** Raise an interrupt. */ - void raiseInterrupt(int16_t& interruptId) - { - interruptId_ = interruptId; + void raiseInterrupt(int16_t& interruptId) { + interruptId_ = interruptId; exceptionEncountered_ = true; - exception_ = InstructionException::Interrupt; - interruptId = -1; + exception_ = InstructionException::Interrupt; + interruptId = -1; } /** Get Id of this interrupr */ @@ -152,7 +151,8 @@ class Instruction : public simeng::Instruction { /** Retrieve branch type. 
*/ BranchType getBranchType() const override; - /** Retrieve an offset of branch target from the instruction's metadata if known. */ + /** Retrieve an offset of branch target from the instruction's metadata if + * known. */ uint64_t getKnownOffset() const override; /** Is this a store address operation (a subcategory of store operations which @@ -207,7 +207,8 @@ class Instruction : public simeng::Instruction { static const uint8_t MAX_SOURCE_REGISTERS = 2; /** The maximum number of destination registers any supported RISC-V * instruction can have. */ - static const uint8_t MAX_DESTINATION_REGISTERS = 2; //CSRs can be another destination apart from std RD + static const uint8_t MAX_DESTINATION_REGISTERS = + 2; // CSRs can be another destination apart from std RD /** A reference to the ISA instance this instruction belongs to. */ const Architecture& architecture_; @@ -307,7 +308,7 @@ class Instruction : public simeng::Instruction { std::vector memoryData; /** Return integer register value, to support both 32-bit and 64-bit mode */ - int64_t getSignedInt(RegisterValue& value) const; + int64_t getSignedInt(RegisterValue& value) const; int16_t interruptId_; }; diff --git a/src/include/simeng/arch/riscv/SystemRegister.hh b/src/include/simeng/arch/riscv/SystemRegister.hh index 0556156ef6..5f0fe74459 100644 --- a/src/include/simeng/arch/riscv/SystemRegister.hh +++ b/src/include/simeng/arch/riscv/SystemRegister.hh @@ -1,12 +1,11 @@ #pragma once #include -#include #include #include +#include #include "simeng/arch/Architecture.hh" - #include "simeng/arch/riscv/Instruction.hh" #include "simeng/kernel/Linux.hh" @@ -17,213 +16,198 @@ namespace riscv { // Should probably move to Capstone enum riscv_sysreg { - SYSREG_MSTATUS = 0x300, - SYSREG_MIE = 0x304, - SYSREG_MTVEC = 0x305, - SYSREG_MSTATUSH = 0x310, - SYSREG_MSCRATCH = 0x340, - SYSREG_MEPC = 0x341, - SYSREG_MCAUSE = 0x342, - SYSREG_MHARTID = 0xF14, - SYSREG_MXCPTSC = 0xFC2, - SYSREG_CYCLE = 0xC00, - SYSREG_TIME = 0xC01, - SYSREG_INSTRRET = 0xC02 + SYSREG_MSTATUS = 0x300, + SYSREG_MIE = 0x304, + SYSREG_MTVEC = 0x305, + SYSREG_MSTATUSH = 0x310, + SYSREG_MSCRATCH = 0x340, + SYSREG_MEPC = 0x341, + SYSREG_MCAUSE = 0x342, + SYSREG_MHARTID = 0xF14, + SYSREG_MXCPTSC = 0xFC2, + SYSREG_CYCLE = 0xC00, + SYSREG_TIME = 0xC01, + SYSREG_INSTRRET = 0xC02 }; enum riscv_causecode_enum { - CAUSE_IADDRESS_MISALIGN = 0, - CAUSE_IACCESS_FAULT = 1, + CAUSE_IADDRESS_MISALIGN = 0, + CAUSE_IACCESS_FAULT = 1, CAUSE_ILLEGAL_INSTRUCTION = 2, - CAUSE_BREAKPOINT = 3, - CAUSE_LDADDRESS_MISALIGN = 4, - CAUSE_LDACCESS_FAULT = 5, - CAUSE_STADDRESS_MISALIGN = 6, - CAUSE_STACCESS_FAULT = 7, - CAUSE_ECALL_FROM_M = 11 + CAUSE_BREAKPOINT = 3, + CAUSE_LDADDRESS_MISALIGN = 4, + CAUSE_LDACCESS_FAULT = 5, + CAUSE_STADDRESS_MISALIGN = 6, + CAUSE_STACCESS_FAULT = 7, + CAUSE_ECALL_FROM_M = 11 }; -enum class InterruptId { - HALT = 1, - TIMER = 7 -}; +enum class InterruptId { HALT = 1, TIMER = 7 }; -enum riscv_sysreg_masks { - MSTATUS_MIE_MASK = 0x8, - MSTATUS_MPIE_MASK = 0x80 -}; +enum riscv_sysreg_masks { MSTATUS_MIE_MASK = 0x8, MSTATUS_MPIE_MASK = 0x80 }; typedef uint16_t riscv_causecode; class MemoryMappedSystemRegister { - public: - MemoryMappedSystemRegister(const RegisterValue& val) : state(val) {} - bool size() { return state.size(); } - virtual void put(const RegisterValue& val) { state = val; } - virtual const RegisterValue& get() { return state; } - private: - RegisterValue state; + public: + MemoryMappedSystemRegister(const RegisterValue& val) : state(val) {} + bool size() { return 
state.size(); } + virtual void put(const RegisterValue& val) { state = val; } + virtual const RegisterValue& get() { return state; } + + private: + RegisterValue state; }; class MemoryMappedSystemRegisterBlock { - public: - MemoryMappedSystemRegisterBlock(size_t sz) : size_(sz) {} - size_t size() { return size_; } - virtual bool put(uint16_t, const RegisterValue&); - virtual bool get(uint16_t, RegisterValue&); - virtual void tick() {} - protected: - /** Ordered map of memory mapped system regsiters **/ - std::map memoryMappedSystemRegisters; - size_t size_; + public: + MemoryMappedSystemRegisterBlock(size_t sz) : size_(sz) {} + size_t size() { return size_; } + virtual bool put(uint16_t, const RegisterValue&); + virtual bool get(uint16_t, RegisterValue&); + virtual void tick() {} + + protected: + /** Ordered map of memory mapped system regsiters **/ + std::map memoryMappedSystemRegisters; + size_t size_; }; class SystemRegisterMemoryInterface : public MemoryInterface { - public: - SystemRegisterMemoryInterface( - std::shared_ptr& dataMemory, - std::map& memoryMappedSystemRegisterBlocks - ) : - dataMemory_(dataMemory), - memoryMappedSystemRegisterBlocks_(memoryMappedSystemRegisterBlocks) - {} - - /** Request a read from the supplied target location. */ - virtual void requestRead(const MemoryAccessTarget& target, - uint64_t requestId = 0) - { - RegisterValue data(0,target.size); - if (getMemoryMappedSystemRegister(target.address, data)) - completedReads_.push_back({target, data, requestId}); - else - dataMemory_.get()->requestRead(target,requestId); - } - - /** Request a write of `data` to the target location. */ - virtual void requestWrite(const MemoryAccessTarget& target, - const RegisterValue& data) - { - if (!putMemoryMappedSystemRegister(target.address, data)) - dataMemory_.get()->requestWrite(target,data); - } - - /** Retrieve all completed read requests. */ - virtual const span getCompletedReads() const - { - if (completedReads_.empty()) - return dataMemory_.get()->getCompletedReads(); - else - return {const_cast(completedReads_.data()), completedReads_.size()}; - } - - /** Clear the completed reads. */ - virtual void clearCompletedReads() - { - if (completedReads_.empty()) - dataMemory_.get()->clearCompletedReads(); - else - completedReads_.clear(); - } - - /** Returns true if there are any oustanding memory requests in-flight. */ - virtual bool hasPendingRequests() const - { - return dataMemory_.get()->hasPendingRequests(); - } - - /** Tick the memory interface to allow it to process internal tasks. - * - * TODO: Move ticking out of the memory interface and into a central "memory - * system" covering a set of related interfaces. - */ - virtual void tick() - { - dataMemory_.get()->tick(); - } - - private : - /** Put/Get Memory Mapped Registers */ - bool putMemoryMappedSystemRegister(uint64_t address, const RegisterValue& value); - bool getMemoryMappedSystemRegister(uint64_t address, RegisterValue& value); - - std::shared_ptr dataMemory_; - - /** Address map of all system register blocks */ - std::map& memoryMappedSystemRegisterBlocks_; - - /** A vector containing all completed read requests. */ - std::vector completedReads_; + public: + SystemRegisterMemoryInterface( + std::shared_ptr& dataMemory, + std::map& + memoryMappedSystemRegisterBlocks) + : dataMemory_(dataMemory), + memoryMappedSystemRegisterBlocks_(memoryMappedSystemRegisterBlocks) {} + + /** Request a read from the supplied target location. 
*/ + virtual void requestRead(const MemoryAccessTarget& target, + uint64_t requestId = 0) { + RegisterValue data(0, target.size); + if (getMemoryMappedSystemRegister(target.address, data)) + completedReads_.push_back({target, data, requestId}); + else + dataMemory_.get()->requestRead(target, requestId); + } + + /** Request a write of `data` to the target location. */ + virtual void requestWrite(const MemoryAccessTarget& target, + const RegisterValue& data) { + if (!putMemoryMappedSystemRegister(target.address, data)) + dataMemory_.get()->requestWrite(target, data); + } + + /** Retrieve all completed read requests. */ + virtual const span getCompletedReads() const { + if (completedReads_.empty()) + return dataMemory_.get()->getCompletedReads(); + else + return {const_cast(completedReads_.data()), + completedReads_.size()}; + } + + /** Clear the completed reads. */ + virtual void clearCompletedReads() { + if (completedReads_.empty()) + dataMemory_.get()->clearCompletedReads(); + else + completedReads_.clear(); + } + + /** Returns true if there are any oustanding memory requests in-flight. */ + virtual bool hasPendingRequests() const { + return dataMemory_.get()->hasPendingRequests(); + } + + /** Tick the memory interface to allow it to process internal tasks. + * + * TODO: Move ticking out of the memory interface and into a central "memory + * system" covering a set of related interfaces. + */ + virtual void tick() { dataMemory_.get()->tick(); } + + private: + /** Put/Get Memory Mapped Registers */ + bool putMemoryMappedSystemRegister(uint64_t address, + const RegisterValue& value); + bool getMemoryMappedSystemRegister(uint64_t address, RegisterValue& value); + + std::shared_ptr dataMemory_; + + /** Address map of all system register blocks */ + std::map& + memoryMappedSystemRegisterBlocks_; + + /** A vector containing all completed read requests. 
*/ + std::vector completedReads_; }; class Architecture; class HostTargetInterface : public MemoryMappedSystemRegisterBlock { - public: - enum { - PAYLOAD_OFFSET = 0, - DEVICEID_OFFSET = 4 - }; - - HostTargetInterface(Architecture& architecture) - : - MemoryMappedSystemRegisterBlock(8), - architecture_(architecture), - isHalted_(false) - { - memoryMappedSystemRegisters[PAYLOAD_OFFSET] = new MemoryMappedSystemRegister(static_cast(0)); - memoryMappedSystemRegisters[DEVICEID_OFFSET] = new MemoryMappedSystemRegister(static_cast(0)); - } - - bool put(uint16_t offset, const RegisterValue&value); - - int16_t updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations) { - if (isHalted_) - return static_cast(InterruptId::HALT); - return -1; - } - - private : - Architecture& architecture_; - bool isHalted_; + public: + enum { PAYLOAD_OFFSET = 0, DEVICEID_OFFSET = 4 }; + + HostTargetInterface(Architecture& architecture) + : MemoryMappedSystemRegisterBlock(8), + architecture_(architecture), + isHalted_(false) { + memoryMappedSystemRegisters[PAYLOAD_OFFSET] = + new MemoryMappedSystemRegister(static_cast(0)); + memoryMappedSystemRegisters[DEVICEID_OFFSET] = + new MemoryMappedSystemRegister(static_cast(0)); + } + + bool put(uint16_t offset, const RegisterValue& value); + + int16_t updateSystemTimerRegisters(RegisterFileSet* regFile, + const uint64_t iterations) { + if (isHalted_) return static_cast(InterruptId::HALT); + return -1; + } + + private: + Architecture& architecture_; + bool isHalted_; }; class Clint : public MemoryMappedSystemRegisterBlock { - public: - enum { - CLINT_BASE = 0x02000000, - CLINT_SIZE = 0x0000c000, - MTIMECMP_OFFSET = 0x4000, - MTIME_OFFSET = 0xbff8 - }; - - Clint(Architecture& architecture) - : - MemoryMappedSystemRegisterBlock(CLINT_SIZE), - architecture_(architecture), - mtime_(static_cast(0)), - mtimecmp_(static_cast(0)), - mtime_freq(100), - mtime_count(0), - last_tick(0) - { - memoryMappedSystemRegisters[MTIME_OFFSET] = &mtime_; - memoryMappedSystemRegisters[MTIMECMP_OFFSET] = &mtimecmp_; - } - - int16_t updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations); - - private : - Architecture& architecture_; - - MemoryMappedSystemRegister mtime_; - MemoryMappedSystemRegister mtimecmp_; - - uint32_t mtime_freq; - uint32_t mtime_count; - uint64_t last_tick; + public: + enum { + CLINT_BASE = 0x02000000, + CLINT_SIZE = 0x0000c000, + MTIMECMP_OFFSET = 0x4000, + MTIME_OFFSET = 0xbff8 + }; + + Clint(Architecture& architecture) + : MemoryMappedSystemRegisterBlock(CLINT_SIZE), + architecture_(architecture), + mtime_(static_cast(0)), + mtimecmp_(static_cast(0)), + mtime_freq(100), + mtime_count(0), + last_tick(0) { + memoryMappedSystemRegisters[MTIME_OFFSET] = &mtime_; + memoryMappedSystemRegisters[MTIMECMP_OFFSET] = &mtimecmp_; + } + + int16_t updateSystemTimerRegisters(RegisterFileSet* regFile, + const uint64_t iterations); + + private: + Architecture& architecture_; + + MemoryMappedSystemRegister mtime_; + MemoryMappedSystemRegister mtimecmp_; + + uint32_t mtime_freq; + uint32_t mtime_count; + uint64_t last_tick; }; - } // namespace riscv } // namespace arch } // namespace simeng diff --git a/src/include/simeng/kernel/LinuxProcess.hh b/src/include/simeng/kernel/LinuxProcess.hh index d6b2c4a967..a4b4ce428d 100644 --- a/src/include/simeng/kernel/LinuxProcess.hh +++ b/src/include/simeng/kernel/LinuxProcess.hh @@ -116,7 +116,7 @@ class LinuxProcess { /** Shared pointer to processImage. 
*/ std::shared_ptr processImage_; - + std::unordered_map symbols_; }; diff --git a/src/include/simeng/models/emulation/Core.hh b/src/include/simeng/models/emulation/Core.hh index 1db10d2381..fb8767ec2f 100644 --- a/src/include/simeng/models/emulation/Core.hh +++ b/src/include/simeng/models/emulation/Core.hh @@ -11,7 +11,8 @@ #include "simeng/arch/Architecture.hh" #include "simeng/span.hh" -// TODO: This is architecture-specific, need to be refactored later. See comments in Core.cc +// TODO: This is architecture-specific, need to be refactored later. See +// comments in Core.cc #include "simeng/arch/riscv/Architecture.hh" namespace simeng { @@ -110,7 +111,7 @@ class Core : public simeng::Core { uint64_t branchesExecuted_ = 0; /** Set to interruptId when interrupt occurs, otherwise -1 */ - int16_t interruptId_; + int16_t interruptId_; }; } // namespace emulation diff --git a/src/include/simeng/pipeline/PipelineBuffer1.hh b/src/include/simeng/pipeline/PipelineBuffer1.hh index dd2ed70ce7..e677645fdf 100644 --- a/src/include/simeng/pipeline/PipelineBuffer1.hh +++ b/src/include/simeng/pipeline/PipelineBuffer1.hh @@ -15,13 +15,18 @@ class PipelineBuffer { /** Construct a pipeline buffer of width `width`, and fill all slots with * `initialValue`. */ PipelineBuffer(int width, const T& initialValue) - : width(width), buffer(width * defaultLength_, initialValue), - length_(defaultLength_), headIndex_(defaultLength_-1), + : width(width), + buffer(width * defaultLength_, initialValue), + length_(defaultLength_), + headIndex_(defaultLength_ - 1), tailIndex_(0) {} PipelineBuffer(int width, const T& initialValue, int length) - : width(width), buffer(width * length, initialValue), length_(length), - headIndex_(length_-1), tailIndex_(0) { + : width(width), + buffer(width * length, initialValue), + length_(length), + headIndex_(length_ - 1), + tailIndex_(0) { assert(length_ != 0 && "Pipeline buffer length cannot be 0"); } @@ -30,14 +35,14 @@ class PipelineBuffer { void tick() { if (isStalled_) return; - //length ==1 shortcut? condition check cost + // length ==1 shortcut? condition check cost - if (headIndex_) { // when headIndex != 0 + if (headIndex_) { // when headIndex != 0 headIndex_--; } else { headIndex_ = length_ - 1; } - if (tailIndex_) { // when tailIndex != 0 + if (tailIndex_) { // when tailIndex != 0 tailIndex_--; } else { tailIndex_ = length_ - 1; diff --git a/src/include/simeng/pipeline_hi/LoadStoreQueue.hh b/src/include/simeng/pipeline_hi/LoadStoreQueue.hh index 211b1ef72d..5f34b956d6 100644 --- a/src/include/simeng/pipeline_hi/LoadStoreQueue.hh +++ b/src/include/simeng/pipeline_hi/LoadStoreQueue.hh @@ -117,7 +117,7 @@ class LoadStoreQueue { bool isBusy() const; - float getAvgLdLat() const { return (totalLdLatency)/numLoads; }; + float getAvgLdLat() const { return (totalLdLatency) / numLoads; }; uint32_t getMaxLdLat() const { return maxLdLatency; }; uint32_t getMinLdLat() const { return minLdLatency; }; @@ -138,7 +138,7 @@ class LoadStoreQueue { /** Map of loads that have requested their data, keyed by sequence ID. */ std::unordered_map> requestedLoads_; - /** Map of loads that have requested their data, keyed by sequence ID. */ + /** Map of loads that have requested their data, keyed by sequence ID. */ std::unordered_map latencyLoads_; /** A function handler to call to forward the results of a completed load. */ @@ -214,15 +214,16 @@ class LoadStoreQueue { /** The number of loads and stores permitted per cycle. 
*/ std::array reqLimits_; - /** A map between LSQ cycles and load or store requests ready on that cycle. */ + /** A map between LSQ cycles and load or store requests ready on that cycle. + */ std::deque requestQueue_; /* Identifier for request to memory*/ uint8_t busReqId = 0; - //bool activeMisAlignedStore = false; + // bool activeMisAlignedStore = false; - //Stats + // Stats uint64_t numLoads = 0; double totalLdLatency = 0; uint32_t maxLdLatency = 0; @@ -230,6 +231,5 @@ class LoadStoreQueue { float averageAccessLdLatency = 0.0; }; - } // namespace pipeline_hi } // namespace simeng diff --git a/src/include/simeng/pipeline_hi/StaticPredictor.hh b/src/include/simeng/pipeline_hi/StaticPredictor.hh index d8923dc23c..83c7f0e83e 100644 --- a/src/include/simeng/pipeline_hi/StaticPredictor.hh +++ b/src/include/simeng/pipeline_hi/StaticPredictor.hh @@ -12,7 +12,8 @@ namespace pipeline_hi { */ class StaticPredictor : public BranchPredictor { public: - StaticPredictor(uint8_t sType); //TODO: temp constructor, get rid of yaml, delete it later + StaticPredictor(uint8_t sType); // TODO: temp constructor, get rid of yaml, + // delete it later StaticPredictor(YAML::Node config); ~StaticPredictor(); diff --git a/src/lib/CoreInstance.cc b/src/lib/CoreInstance.cc index e8f91d3450..f8c76c930b 100644 --- a/src/lib/CoreInstance.cc +++ b/src/lib/CoreInstance.cc @@ -90,8 +90,7 @@ void CoreInstance::setSimulationMode() { "outoforder") { mode_ = SimulationMode::OutOfOrder; modeString_ = "Out-of-Order"; - } else if (config_["Core"]["Simulation-Mode"].as() == - "mcu") { + } else if (config_["Core"]["Simulation-Mode"].as() == "mcu") { mode_ = SimulationMode::MCU; modeString_ = "MCU"; } @@ -239,8 +238,8 @@ void CoreInstance::createCore() { // Create the architecture, with knowledge of the kernel if (config_["Core"]["ISA"].as() == "rv64" || config_["Core"]["ISA"].as() == "rv32") { - arch_ = - std::make_unique(kernel_, config_,dataMemory_); + arch_ = std::make_unique( + kernel_, config_, dataMemory_); } else if (config_["Core"]["ISA"].as() == "AArch64") { arch_ = std::make_unique(kernel_, config_); @@ -249,7 +248,8 @@ void CoreInstance::createCore() { // Construct branch predictor object predictor_ = std::make_unique(config_); if (mode_ == SimulationMode::MCU) { - predictor_ = std::make_unique(2); //config_ + predictor_ = + std::make_unique(2); // config_ } // Extract port arrangement from config file diff --git a/src/lib/Elf.cc b/src/lib/Elf.cc index 901f370eec..3b7e71e28d 100644 --- a/src/lib/Elf.cc +++ b/src/lib/Elf.cc @@ -14,8 +14,8 @@ namespace simeng { * https://man7.org/linux/man-pages/man5/elf.5.html */ -Elf::Elf(std::string path, char** imagePointer, std::unordered_map& symbols) -{ +Elf::Elf(std::string path, char** imagePointer, + std::unordered_map& symbols) { std::ifstream file(path, std::ios::binary); if (!file.is_open()) { @@ -49,7 +49,8 @@ Elf::Elf(std::string path, char** imagePointer, std::unordered_map(&headerEntrySize), sizeof(headerEntrySize)); + file.read(reinterpret_cast(&headerEntrySize), + sizeof(headerEntrySize)); uint16_t headerEntries; file.read(reinterpret_cast(&headerEntries), sizeof(headerEntries)); @@ -170,8 +172,8 @@ Elf::Elf(std::string path, char** imagePointer, std::unordered_map(&eheader), sizeof(eheader)); entryPoint32_ = eheader.e_entry; - + processImageSize_ = 0; // Loop over pheaders and extract them. 
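+  // Size the process image so that every loadable segment fits at its virtual
+  // address: take the maximum p_vaddr + p_memsz over all PT_LOAD headers.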
  file.seekg(eheader.e_phoff);
  std::vector pheaders(eheader.e_phnum);
-  for (auto &ph : pheaders) {
-    file.read(reinterpret_cast(&ph), sizeof(ph));
-    if ((ph.p_type == PT_LOAD) && (ph.p_vaddr+ph.p_memsz > processImageSize_))
-      processImageSize_ = ph.p_vaddr+ph.p_memsz;
+  for (auto& ph : pheaders) {
+    file.read(reinterpret_cast(&ph), sizeof(ph));
+    if ((ph.p_type == PT_LOAD) &&
+        (ph.p_vaddr + ph.p_memsz > processImageSize_))
+      processImageSize_ = ph.p_vaddr + ph.p_memsz;
  }
  *imagePointer = (char*)malloc(processImageSize_ * sizeof(char));
  for (const auto& ph : pheaders) {
-    if (ph.p_type == PT_LOAD) {
+    if (ph.p_type == PT_LOAD) {
      file.seekg(ph.p_offset);
-      // Read `fileSize` bytes from `file` into the appropriate place in process memory
-      file.read(*imagePointer+ph.p_vaddr, ph.p_filesz);
+      // Read `fileSize` bytes from `file` into the appropriate place in
+      // process memory
+      file.read(*imagePointer + ph.p_vaddr, ph.p_filesz);
-      if (ph.p_memsz>ph.p_filesz)
+      if (ph.p_memsz > ph.p_filesz)
        // Need to pad the rest of the section memory with zeros
-        memset(*imagePointer+ph.p_vaddr+ph.p_filesz, 0, ph.p_memsz-ph.p_filesz);
+        memset(*imagePointer + ph.p_vaddr + ph.p_filesz, 0,
+               ph.p_memsz - ph.p_filesz);
    }
  }
@@ -214,13 +219,13 @@ Elf::Elf(std::string path, char** imagePointer, std::unordered_map sheaders(eheader.e_shnum);
  unsigned int sh_idx = 0;
-  for (auto &sh : sheaders) {
+  for (auto& sh : sheaders) {
    file.read(reinterpret_cast(&sh), sizeof(sh));
    // find section header for strings to use for symbol table.
-    if (sh.sh_type==SHT_SYMTAB)
+    if (sh.sh_type == SHT_SYMTAB)
      sh_symtab = &sh;
-    else if (sh.sh_type==SHT_STRTAB && sh_idx!=eheader.e_shstrndx)
+    else if (sh.sh_type == SHT_STRTAB && sh_idx != eheader.e_shstrndx)
      sh_strtab = &sh;
    sh_idx++;
  };
@@ -232,9 +237,9 @@ Elf::Elf(std::string path, char** imagePointer, std::unordered_mapsh_offset);
-  unsigned num_symbols = sh_symtab->sh_size/sh_symtab->sh_entsize;
+  unsigned num_symbols = sh_symtab->sh_size / sh_symtab->sh_entsize;
  Elf32_Sym sym;
-  while(num_symbols--) {
+  while (num_symbols--) {
    file.read(reinterpret_cast(&sym), sizeof(sym));
    if (strtab[sym.st_name]) {
      std::string name(&strtab[sym.st_name]);
diff --git a/src/lib/ModelConfig.cc b/src/lib/ModelConfig.cc
index 342476347c..1d00ce4bab 100644
--- a/src/lib/ModelConfig.cc
+++ b/src/lib/ModelConfig.cc
@@ -67,10 +67,12 @@ void ModelConfig::validate() {
                            "Streaming-Vector-Length"};
  validISA = nodeChecker(
      configFile_[root][subFields[0]], subFields[0],
-      std::vector({"AArch64", "rv64", "rv32"}), ExpectedValue::String);
+      std::vector({"AArch64", "rv64", "rv32"}),
+      ExpectedValue::String);
  nodeChecker(configFile_[root][subFields[1]], subFields[1],
-              {"emulation", "inorderpipelined", "mcu", "outoforder"},
-              ExpectedValue::String);
+  nodeChecker(
+      configFile_[root][subFields[1]], subFields[1],
+      {"emulation", "inorderpipelined", "mcu", "outoforder"},
+      ExpectedValue::String);
  nodeChecker(configFile_[root][subFields[2]], subFields[2],
              std::make_pair(0.f, 10.f), ExpectedValue::Float);
  nodeChecker(configFile_[root][subFields[3]], subFields[3],
diff --git a/src/lib/arch/aarch64/Architecture.cc b/src/lib/arch/aarch64/Architecture.cc
index 5ad11c70d1..a61d9542ea 100644
--- a/src/lib/arch/aarch64/Architecture.cc
+++ b/src/lib/arch/aarch64/Architecture.cc
@@ -287,8 +287,8 @@ uint64_t Architecture::getVectorLength() const { return VL_; }
 uint64_t Architecture::getStreamingVectorLength() const { return SVL_; }
-int16_t Architecture::updateSystemTimerRegisters(RegisterFileSet* regFile,
-                                                 const uint64_t iterations) const {
+int16_t Architecture::updateSystemTimerRegisters(
+    RegisterFileSet* regFile, const uint64_t iterations) const {
  // Update the Processor Cycle Counter to total cycles completed.
  regFile->set(PCCreg_, iterations);
  // Update Virtual Counter Timer at correct frequency.
@@ -329,9 +329,9 @@ void Architecture::setSVCRval(const uint64_t newVal) const {
  SVCRval_ = newVal;
 }
-void Architecture::updateInstrTrace(const std::shared_ptr& instruction,
-    simeng::RegisterFileSet* regFile, uint64_t tick) const {
-  }
+void Architecture::updateInstrTrace(
+    const std::shared_ptr& instruction,
+    simeng::RegisterFileSet* regFile, uint64_t tick) const {}
 }  // namespace aarch64
 }  // namespace arch
diff --git a/src/lib/arch/riscv/Architecture.cc b/src/lib/arch/riscv/Architecture.cc
index 84afcc0996..e9fcc7cd38 100644
--- a/src/lib/arch/riscv/Architecture.cc
+++ b/src/lib/arch/riscv/Architecture.cc
@@ -15,10 +15,9 @@ namespace riscv {
 std::unordered_map Architecture::decodeCache;
 std::forward_list Architecture::metadataCache;
-Architecture::Architecture(kernel::Linux& kernel, YAML::Node config, std::shared_ptr& dataMemory)
-:
-  linux_(kernel)
-{
+Architecture::Architecture(kernel::Linux& kernel, YAML::Node config,
+                           std::shared_ptr& dataMemory)
+    : linux_(kernel) {
  is32Bit_ = ARCH_64BIT;
  if (config["Core"]["ISA"].as() == "rv32") {
    is32Bit_ = ARCH_32BIT;
@@ -27,8 +26,9 @@ Architecture::Architecture(kernel::Linux& kernel, YAML::Node config, std::shared
  cs_mode csMode = CS_MODE_RISCV64;
  constantsPool constantsPool;
-  if(is32Bit_) {
-    csMode = CS_MODE_RISCV32GC; // TODO Note: currently using local (1-line)modified capstone
+  if (is32Bit_) {
+    csMode = CS_MODE_RISCV32GC;  // TODO Note: currently using a local
+                                 // (1-line) modified Capstone
    constants_.alignMask = constantsPool.alignMaskCompressed;
    constants_.regWidth = constantsPool.byteLength32;
    constants_.bytesLimit = constantsPool.bytesLimitCompressed;
@@ -63,11 +63,12 @@ Architecture::Architecture(kernel::Linux& kernel, YAML::Node config, std::shared
  // Memory Mapped System Register Blocks
-  // if elf file includes the label tohost then assume that this binary supports HTIF protocol (used by spike) and include an HTI block
+  // If the ELF file includes the label tohost then assume that this binary
+  // supports the HTIF protocol (used by Spike) and include an HTIF block
  uint64_t htifAddress;
-  if (linux_.lookupSymbolValue("tohost",htifAddress))
-  {
-    std::cout << "[SimEng] HTIF detected at: " << std::hex << htifAddress << std::endl;
+  if (linux_.lookupSymbolValue("tohost", htifAddress)) {
+    std::cout << "[SimEng] HTIF detected at: " << std::hex << htifAddress
+              << std::endl;
    htif = std::make_shared(*this);
    memoryMappedSystemRegisterBlocks[htifAddress] = htif.get();
  }
@@ -76,9 +77,10 @@ Architecture::Architecture(kernel::Linux& kernel, YAML::Node config, std::shared
  clint = std::make_shared(*this);
  memoryMappedSystemRegisterBlocks[Clint::CLINT_BASE] = clint.get();
-  if (!memoryMappedSystemRegisterBlocks.empty())
-  {
-    systemRegisterMemoryInterface = std::make_shared(dataMemory, memoryMappedSystemRegisterBlocks);
+  if (!memoryMappedSystemRegisterBlocks.empty()) {
+    systemRegisterMemoryInterface =
+        std::make_shared(
+            dataMemory, memoryMappedSystemRegisterBlocks);
    dataMemory = systemRegisterMemoryInterface;
  }
@@ -173,7 +175,8 @@ Architecture::Architecture(kernel::Linux& kernel, YAML::Node config, std::shared
      }
    }
  }
-  if (config["Core"]["Trace"].IsDefined() && config["Core"]["Trace"].as()) {
+  if (config["Core"]["Trace"].IsDefined() &&
+      config["Core"]["Trace"].as()) {
    traceFile_ = new std::ofstream();
    traceFile_->open("./trace.log");
    traceOn_ = true;
@@ -184,7 +187,7 @@ Architecture::~Architecture() {
  decodeCache.clear();
  metadataCache.clear();
  groupExecutionInfo_.clear();
-  if(traceOn_) {
+  if (traceOn_) {
    traceFile_->close();
  }
 }
@@ -192,7 +195,6 @@ Architecture::~Architecture() {
 uint8_t Architecture::predecode(const void* ptr, uint8_t bytesAvailable, uint64_t instructionAddress, MacroOp& output) const {
-
  // Check that instruction address is 4-byte aligned as required by RISC-V
  // 2-byte when Compressed ISA is supported
  if (instructionAddress & constants_.alignMask) {
@@ -305,9 +307,9 @@ int32_t Architecture::getSystemRegisterTag(uint16_t reg) const {
 /** Returns a System Register index from a system register tag. reverse lookup slow but only used in printing so will be fine */
 uint16_t Architecture::getSystemRegisterIdFromTag(int32_t tag) const {
-  for (auto it = systemRegisterMap_.begin();it != systemRegisterMap_.end();it++)
-    if (it->second == tag)
-      return it->first;
+  for (auto it = systemRegisterMap_.begin(); it != systemRegisterMap_.end();
+       it++)
+    if (it->second == tag) return it->first;
  assert(0 && "Tag not found in systemRegisterMap");
 }
@@ -318,11 +320,10 @@ ProcessStateChange Architecture::getInitialState() const {
  changes.modifiedRegisters.push_back({RegisterType::GENERAL, 2});
  uint64_t stackPointer;
  // TODO: check if this conditional expression is needed
-  if(is32Bit_) {
+  if (is32Bit_) {
    stackPointer = (uint32_t)linux_.getInitialStackPointer();
    changes.modifiedRegisterValues.push_back((uint32_t)stackPointer);
-  } else
-  {
+  } else {
    stackPointer = linux_.getInitialStackPointer();
    changes.modifiedRegisterValues.push_back(stackPointer);
  }
@@ -335,8 +336,10 @@ uint8_t Architecture::getMinInstructionSize() const { return 2; }
 std::vector Architecture::getConfigPhysicalRegisterStructure(YAML::Node config) const {
-  return {{constants_.regWidth, config["Register-Set"]["GeneralPurpose-Count"].as()},
-          {constants_.regWidth, config["Register-Set"]["FloatingPoint-Count"].as()},
+  return {{constants_.regWidth,
+           config["Register-Set"]["GeneralPurpose-Count"].as()},
+          {constants_.regWidth,
+           config["Register-Set"]["FloatingPoint-Count"].as()},
          {constants_.regWidth, getNumSystemRegisters()}};
 }
@@ -350,15 +353,13 @@ uint16_t Architecture::getNumSystemRegisters() const {
  return static_cast(systemRegisterMap_.size());
 }
-int16_t Architecture::updateSystemTimerRegisters(RegisterFileSet* regFile,
-                                                 const uint64_t iterations) const {
+int16_t Architecture::updateSystemTimerRegisters(
+    RegisterFileSet* regFile, const uint64_t iterations) const {
  int16_t interruptId = -1;
-  if (htif)
-  {
+  if (htif) {
    interruptId = htif->updateSystemTimerRegisters(regFile, iterations);
-    if (interruptId>=0)
-      return interruptId;
+    if (interruptId >= 0) return interruptId;
  }
  if (clint)
@@ -367,23 +368,27 @@ int16_t Architecture::updateSystemTimerRegisters(RegisterFileSet* regFile,
  return interruptId;
 }
-void Architecture::updateInstrTrace(const std::shared_ptr& instruction,
-    RegisterFileSet* regFile, uint64_t tick) const {
-  if(traceOn_) {
+void Architecture::updateInstrTrace(
+    const std::shared_ptr& instruction,
+    RegisterFileSet* regFile, uint64_t tick) const {
+  if (traceOn_) {
    Instruction instr_ = *static_cast(instruction.get());
    auto& metadata = instr_.getMetadata();
    std::stringstream s;
    s << "0x" << std::hex << instr_.getInstructionAddress() << " ";
    if (tick < 100000000)
-      s << "t(" << std::setfill('0') << std::setw(8) << std::dec << (uint32_t)tick << ") ";
+      s << "t(" << std::setfill('0') << std::setw(8) << std::dec
+        << (uint32_t)tick << ") ";
    else
-      s << "t(" << std::setfill('0') << std::setw(16) << std::dec << (uint32_t)tick << ") ";
+      s << "t(" << std::setfill('0') << std::setw(16) << std::dec
+        << (uint32_t)tick << ") ";
    s << "(";
-    if(metadata.len == IL_16B) {
+    if (metadata.len == IL_16B) {
      s << "0000";
    }
-    for(int8_t i=metadata.lenBytes; i>0; i--) {
-      s << std::hex << std::setfill('0') << std::setw(2) << static_cast(metadata.encoding[i-1]);
+    for (int8_t i = metadata.lenBytes; i > 0; i--) {
+      s << std::hex << std::setfill('0') << std::setw(2)
+        << static_cast(metadata.encoding[i - 1]);
    }
    s << ") ";
    s << metadata.mnemonic << " " << metadata.operandStr;
@@ -391,21 +396,25 @@ void Architecture::updateInstrTrace(const std::shared_ptr&
    auto destinations = instr_.getDestinationRegisters();
    int8_t num_src = (int8_t)sources.size();
    int8_t num_dest = (int8_t)destinations.size();
-    if((num_src + num_dest) >0) {
+    if ((num_src + num_dest) > 0) {
      s << " ";
      if (num_dest > 0) {
        s << "(d: ";
-        for(int8_t i=0;iget(reg).get();
-          if(i < (num_dest-1)) {
+          s << std::hex << std::setfill('0') << std::setw(8)
+            << regFile->get(reg).get();
+          if (i < (num_dest - 1)) {
            s << " ";
          }
        }
@@ -413,17 +422,21 @@ void Architecture::updateInstrTrace(const std::shared_ptr&
      }
      if (num_src > 0) {
        s << "(s: ";
-        for(int8_t i=0;iget(reg).get();
-          if(i < (num_src-1)) {
+          s << std::hex << std::setfill('0') << std::setw(8)
+            << regFile->get(reg).get();
+          if (i < (num_src - 1)) {
            s << " ";
          }
        }
@@ -432,7 +445,8 @@ void Architecture::updateInstrTrace(const std::shared_ptr&
    }
    s << std::endl;
    *traceFile_ << s.str();
-    traceFile_->flush(); //Helps with debugging sometimes as all the state of previous committed instr is written to file.
+    traceFile_->flush();  // Helps with debugging sometimes as all the state of
+                          // previous committed instr is written to file.
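+    // Each trace line therefore reads "0x<pc> t(<tick>) (<raw encoding>)
+    // <mnemonic> <operands>" followed by the destination (d:) and source (s:)
+    // register values read from the register file at trace time.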
  }
 }
 archConstants Architecture::getConstants() const { return constants_; }
diff --git a/src/lib/arch/riscv/InstructionMetadata.cc b/src/lib/arch/riscv/InstructionMetadata.cc
index d293bc7fdb..b929836b5c 100644
--- a/src/lib/arch/riscv/InstructionMetadata.cc
+++ b/src/lib/arch/riscv/InstructionMetadata.cc
@@ -38,7 +38,8 @@ InstructionMetadata::InstructionMetadata(const uint8_t* invalidEncoding,
      opcode(Opcode::RISCV_INSTRUCTION_LIST_END),
      implicitSourceCount(0),
      implicitDestinationCount(0),
-      operandCount(0), len(IL_INVALID) {
+      operandCount(0),
+      len(IL_INVALID) {
  assert(bytes <= sizeof(encoding));
  std::memcpy(encoding, invalidEncoding, bytes);
  mnemonic[0] = '\0';
@@ -260,17 +261,17 @@ void InstructionMetadata::alterPseudoInstructions(const cs_insn& insn) {
    case Opcode::RISCV_CSRRWI:
    case Opcode::RISCV_CSRRSI:
    case Opcode::RISCV_CSRRCI: {
-      //Extract CSR info
+      // Extract CSR info
      csr = ((uint32_t)encoding[3] << 4) | ((uint32_t)encoding[2] >> 4);
-      //If there are less than 2 operands provided add necessary x0 operand
-      if(operandCount == 1) {
-        if((strcmp(mnemonic, "rdinstret") == 0) ||
-           (strcmp(mnemonic, "rdcycle") == 0) ||
-           (strcmp(mnemonic, "rdtime") == 0) ||
-           (strcmp(mnemonic, "csrr") == 0)) { //csrrs rd,csr,x0
+      // If fewer than 2 operands are provided, add the necessary x0 operand
+      if (operandCount == 1) {
+        if ((strcmp(mnemonic, "rdinstret") == 0) ||
+            (strcmp(mnemonic, "rdcycle") == 0) ||
+            (strcmp(mnemonic, "rdtime") == 0) ||
+            (strcmp(mnemonic, "csrr") == 0)) {  // csrrs rd,csr,x0
          operands[1].type = RISCV_OP_REG;
          operands[1].reg = 1;
-        } else { //csrrxx x0,csr,rs/imm
+        } else {  // csrrxx x0,csr,rs/imm
          operands[1] = operands[0];
          operands[0].type = RISCV_OP_REG;
          operands[0].reg = 1;
@@ -305,13 +306,17 @@ void InstructionMetadata::includeZeroRegisterPosZero() {
  operandCount = 3;
 }
-
 void InstructionMetadata::setLength(uint8_t size) {
  lenBytes = size;
-  switch(size) {
-    case 2: len = IL_16B; break;
-    case 4: len = IL_32B; break;
-    default: len = IL_INVALID;
+  switch (size) {
+    case 2:
+      len = IL_16B;
+      break;
+    case 4:
+      len = IL_32B;
+      break;
+    default:
+      len = IL_INVALID;
  }
 }
diff --git a/src/lib/arch/riscv/InstructionMetadata.hh b/src/lib/arch/riscv/InstructionMetadata.hh
index 4ce164a346..796afc96c2 100644
--- a/src/lib/arch/riscv/InstructionMetadata.hh
+++ b/src/lib/arch/riscv/InstructionMetadata.hh
@@ -14,11 +14,7 @@ namespace Opcode {
 #include "RISCVGenInstrInfo.inc"
 }  // namespace Opcode
-enum INSTR_LENGTH {
-  IL_16B,
-  IL_32B,
-  IL_INVALID
-};
+enum INSTR_LENGTH { IL_16B, IL_32B, IL_INVALID };
 /** A simplified RISC-V-only version of the Capstone instruction structure.
 */
 struct InstructionMetadata {
diff --git a/src/lib/kernel/Linux.cc b/src/lib/kernel/Linux.cc
index bc060bbae4..424395c0f8 100644
--- a/src/lib/kernel/Linux.cc
+++ b/src/lib/kernel/Linux.cc
@@ -23,14 +23,15 @@ namespace kernel {
 void Linux::createProcess(const LinuxProcess& process) {
  assert(process.isValid() && "Attempted to use an invalid process");
  assert(processStates_.size() == 0 && "Multiple processes not yet supported");
-  processStates_.push_back({.pid = 0,  // TODO: create unique PIDs
-                            .path = process.getPath(),
-                            .startBrk = process.getHeapStart(),
-                            .currentBrk = process.getHeapStart(),
-                            .initialStackPointer = process.getStackPointer(),
-                            .mmapRegion = process.getMmapStart(),
-                            .pageSize = process.getPageSize(),
-                            });
+  processStates_.push_back({
+      .pid = 0,  // TODO: create unique PIDs
+      .path = process.getPath(),
+      .startBrk = process.getHeapStart(),
+      .currentBrk = process.getHeapStart(),
+      .initialStackPointer = process.getStackPointer(),
+      .mmapRegion = process.getMmapStart(),
+      .pageSize = process.getPageSize(),
+  });
  processStates_.back().fileDescriptorTable.push_back(STDIN_FILENO);
  processStates_.back().fileDescriptorTable.push_back(STDOUT_FILENO);
  processStates_.back().fileDescriptorTable.push_back(STDERR_FILENO);
@@ -652,9 +653,8 @@ int64_t Linux::writev(int64_t fd, const void* iovdata, int iovcnt) {
 }
 /** Lookup symbol value from table in elf file. */
-bool Linux::lookupSymbolValue(const std::string symbol, uint64_t& value)
-{
-  processStates_[0].process->lookupSymbolValue(symbol,value);
+bool Linux::lookupSymbolValue(const std::string symbol, uint64_t& value) {
+  return processStates_[0].process->lookupSymbolValue(symbol, value);
 }
 }  // namespace kernel
diff --git a/src/lib/models/emulation/Core.cc b/src/lib/models/emulation/Core.cc
index d9268da25f..a779ef7521 100644
--- a/src/lib/models/emulation/Core.cc
+++ b/src/lib/models/emulation/Core.cc
@@ -150,15 +150,15 @@ void Core::tick() {
 }
 void Core::execute(std::shared_ptr& uop) {
-
-  if (interruptId_>=0)
+  if (interruptId_ >= 0)
    uop->raiseInterrupt(interruptId_);
  else
    uop->execute();
  if (uop->exceptionEncountered()) {
    instructionsExecuted_++;
-    isa_.updateInstrTrace(uop, &registerFileSet_, ticks_); // Handle ECALL into trace here
+    isa_.updateInstrTrace(uop, &registerFileSet_,
+                          ticks_);  // Handle ECALL into trace here
    handleException(uop);
    return;
  }
@@ -192,14 +192,19 @@ void Core::execute(std::shared_ptr& uop) {
  if (uop->isLastMicroOp()) {
    instructionsExecuted_++;
-    // TODO: This is architecture-specific. It's here for the reference and should(will) be refactored later
-    uint16_t sysreg_instrret = isa_.getSystemRegisterTag(arch::riscv::riscv_sysreg::SYSREG_INSTRRET);
-    uint16_t sysreg_cycle = isa_.getSystemRegisterTag(arch::riscv::riscv_sysreg::SYSREG_CYCLE);
+    // TODO: This is architecture-specific. It's here for reference and
+    // should (and will) be refactored later
+    uint16_t sysreg_instrret =
+        isa_.getSystemRegisterTag(arch::riscv::riscv_sysreg::SYSREG_INSTRRET);
+    uint16_t sysreg_cycle =
+        isa_.getSystemRegisterTag(arch::riscv::riscv_sysreg::SYSREG_CYCLE);
    // NOTE: 64-bit system registers are not implemented yet
-    //TODO: Maybe make use of byteLength and remove is32BitMode() function?
+    // TODO: Maybe make use of byteLength and remove is32BitMode() function?
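+    // In rv32 mode the INSTRET and CYCLE counters below are written as 4-byte
+    // register values, matching the 32-bit CSR width used by this patch.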
    if (isa_.is32BitMode()) {
-      registerFileSet_.set(Register{0x2, sysreg_instrret}, RegisterValue(instructionsExecuted_, 4));
-      registerFileSet_.set(Register{0x2, sysreg_cycle}, RegisterValue(ticks_, 4));
+      registerFileSet_.set(Register{0x2, sysreg_instrret},
+                           RegisterValue(instructionsExecuted_, 4));
+      registerFileSet_.set(Register{0x2, sysreg_cycle},
+                           RegisterValue(ticks_, 4));
    }
    isa_.updateInstrTrace(uop, &registerFileSet_, ticks_);
  }
diff --git a/src/lib/pipeline_hi/FetchUnit.cc b/src/lib/pipeline_hi/FetchUnit.cc
index 4de190efca..56f6526e7b 100644
--- a/src/lib/pipeline_hi/FetchUnit.cc
+++ b/src/lib/pipeline_hi/FetchUnit.cc
@@ -109,12 +109,14 @@ void FetchUnit::tick() {
  // Check we have enough data to begin decoding
  if (bufferedBytes_ == isa_.getMinInstructionSize()) {
-    //Check if those bytes points to a instruction with minimum size or more data is required. If more data is required return
-    // TODO: this is not generic solution, just trying to make it work
+    // Check if those bytes point to an instruction of the minimum size or
+    // whether more data is required. If more data is required, return.
+    // TODO: this is not a generic solution, just trying to make it work
    uint16_t rawBits;
    memcpy(&rawBits, buffer + bufferOffset, 2);
-    if((rawBits & 0x3) == 0x3) {
-      //std::cout << std::hex << "Only 2 bytes left in fetch buffer and not compresses instr type, current PC: 0x" << pc_ << std::endl;
+    if ((rawBits & 0x3) == 0x3) {
+      // std::cout << std::hex << "Only 2 bytes left in fetch buffer and not
+      // compressed instr type, current PC: 0x" << pc_ << std::endl;
      return;
    }
  }
@@ -151,7 +153,8 @@ void FetchUnit::tick() {
    //     {encoding, bytesRead, pc_, macroOp[0]->getBranchPrediction()});
    // if (pc_ == loopBoundaryAddress_) {
-    //   // loopBoundaryAddress_ has been fetched whilst filling the loop buffer.
+    //   // loopBoundaryAddress_ has been fetched whilst filling the loop
+    //   buffer.
    //   // Stop filling as loop body has been recorded and begin to supply
    //   // decode unit with instructions from the loop buffer
    //   loopBufferState_ = LoopBufferState::SUPPLYING;
@@ -177,8 +180,10 @@ void FetchUnit::tick() {
      // Predicted as taken; set PC to predicted target address
      pc_ = prediction.target;
    }
-//    std::cout << std::hex << "PC: 0x" << pc_ << ", PBL: 0x" << programByteLength_ << std::endl;
-    if (pc_ == 0 && (macroOp[0]->getBranchType() == BranchType::SubroutineCall)) {
+    // std::cout << std::hex << "PC: 0x" << pc_ << ", PBL: 0x" <<
+    // programByteLength_ << std::endl;
+    if (pc_ == 0 &&
+        (macroOp[0]->getBranchType() == BranchType::SubroutineCall)) {
      waitSCEval_ = true;
      break;
    }
diff --git a/src/lib/pipeline_hi/RegDepMap.cc b/src/lib/pipeline_hi/RegDepMap.cc
index 4ab004bfdb..45a5de7f60 100644
--- a/src/lib/pipeline_hi/RegDepMap.cc
+++ b/src/lib/pipeline_hi/RegDepMap.cc
@@ -6,7 +6,9 @@
 #ifdef RDMDEBUG
 #define DEBUG(x) std::cout << "Core: " << std::hex << x << std::endl;
 #else
-#define DEBUG(x) do { } while (false);
+#define DEBUG(x) \
+  do {           \
+  } while (false);
 #endif
 namespace simeng {
@@ -14,48 +16,50 @@ namespace pipeline_hi {
 const Register l_ZERO_REGISTER = {0, 0};
-RegDepMap::RegDepMap(const std::vector registerFileStructures,
-                     const RegisterFileSet& registerFileSet) :
-    registerFileStructures_(registerFileStructures),
-    registerFileSet_(registerFileSet) {
-  regMap_.resize(registerFileStructures_.size());//Just for Integer Register File for now
-  for (size_t type=0; type registerFileStructures,
+                     const RegisterFileSet& registerFileSet)
+    : registerFileStructures_(registerFileStructures),
+      registerFileSet_(registerFileSet) {
+  regMap_.resize(registerFileStructures_
+                     .size());  // Just for Integer Register File for now
+  for (size_t type = 0; type < registerFileStructures_.size(); type++) {
    regMap_[type].resize(registerFileStructures_.at(type).quantity);
  }
 }
-RegDepMap::~RegDepMap()
-{
+RegDepMap::~RegDepMap() {
  for (unsigned i = 0; i < regMap_.size(); i++) {
-    for (unsigned j = 0; j < regMap_[i].size(); j++)
-      regMap_[i][j].clear();
+    for (unsigned j = 0; j < regMap_[i].size(); j++) regMap_[i][j].clear();
    regMap_[i].clear();
  }
  regMap_.clear();
 }
-void RegDepMap::insert(InstrPtr instr)
-{
-  //TODO: IRF X0 is not a dependency!
+void RegDepMap::insert(InstrPtr instr) {
+  // TODO: IRF X0 is not a dependency!
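+  // Every destination register of the incoming instruction gains an entry in
+  // the dependency map; x0 is skipped because it is hard-wired to zero and
+  // can never carry a dependency.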
  auto& destinationRegisters = instr->getDestinationRegisters();
-  for(const auto& reg: destinationRegisters) {
-    if(reg != l_ZERO_REGISTER) { //Not X0
+  for (const auto& reg : destinationRegisters) {
+    if (reg != l_ZERO_REGISTER) {  // Not X0
      outstandingDep_++;
-      DEBUG("Adding Depencency: addr, 0x" << instr->getInstructionAddress() << std::dec << ", dest: " << reg << ", outstanding: " << outstandingDep_);
+      DEBUG("Adding Dependency: addr, 0x"
+            << instr->getInstructionAddress() << std::dec << ", dest: " << reg
+            << ", outstanding: " << outstandingDep_);
      regMap_[reg.type][reg.tag].push_back(instr);
    }
  }
 }
-void RegDepMap::remove(InstrPtr instr)
-{
+void RegDepMap::remove(InstrPtr instr) {
  auto& destinationRegisters = instr->getDestinationRegisters();
-  for(const auto& reg: destinationRegisters) {
+  for (const auto& reg : destinationRegisters) {
    auto it = regMap_[reg.type][reg.tag].begin();
    while (it != regMap_[reg.type][reg.tag].end()) {
-      if(*it == instr) {
+      if (*it == instr) {
        outstandingDep_--;
-        DEBUG("Removing Depencency: addr, 0x" << instr->getInstructionAddress() << std::dec << ", dest: " << reg << ", outstanding: " << outstandingDep_);
+        DEBUG("Removing Dependency: addr, 0x"
+              << instr->getInstructionAddress() << std::dec << ", dest: " << reg
+              << ", outstanding: " << outstandingDep_);
        it = regMap_[reg.type][reg.tag].erase(it);
        break;
      } else {
@@ -65,8 +69,7 @@ void RegDepMap::remove(InstrPtr instr)
  }
 }
-bool RegDepMap::canRead(InstrPtr instr)
-{
+bool RegDepMap::canRead(InstrPtr instr) {
  bool dependency = false;
  auto& sourceRegisters = instr->getOperandRegisters();
  for (uint16_t i = 0; i < sourceRegisters.size(); i++) {
@@ -74,13 +77,19 @@ bool RegDepMap::canRead(InstrPtr instr)
    if (!instr->isOperandReady(i)) {
      // The operand hasn't already been supplied
-      if (regMap_[srcReg.type][srcReg.tag].size() == 0) {//pick up value from register file
-        instr->supplyOperand(i, registerFileSet_.get(srcReg));
+      if (regMap_[srcReg.type][srcReg.tag].size() ==
+          0) {  // pick up value from register file
+        instr->supplyOperand(i, registerFileSet_.get(srcReg));
      } else if (regMap_[srcReg.type][srcReg.tag].back()->hasExecuted() &&
-                 !(regMap_[srcReg.type][srcReg.tag].back()->isMul() || regMap_[srcReg.type][srcReg.tag].back()->isDiv() ||
-                   (regMap_[srcReg.type][srcReg.tag].back()->isLoad() && !instr->isStoreData()))) {//pick up value from last executed instruction
-        const auto& destRegisters = regMap_[srcReg.type][srcReg.tag].back()->getDestinationRegisters();
-        const auto& destValues = regMap_[srcReg.type][srcReg.tag].back()->getResults();
+                 !(regMap_[srcReg.type][srcReg.tag].back()->isMul() ||
+                   regMap_[srcReg.type][srcReg.tag].back()->isDiv() ||
+                   (regMap_[srcReg.type][srcReg.tag].back()->isLoad() &&
+                    !instr->isStoreData()))) {  // pick up value from last
+                                                // executed instruction
+        const auto& destRegisters =
+            regMap_[srcReg.type][srcReg.tag].back()->getDestinationRegisters();
+        const auto& destValues =
+            regMap_[srcReg.type][srcReg.tag].back()->getResults();
        for (size_t j = 0; j < destRegisters.size(); j++) {
          const auto& destReg = destRegisters[j];
          if (destReg == srcReg) {
@@ -97,33 +106,30 @@ bool RegDepMap::canRead(InstrPtr instr)
  return !dependency;
 }
-bool RegDepMap::canWrite(InstrPtr instr)
-{
+bool RegDepMap::canWrite(InstrPtr instr) {
  bool dependency = false;
  auto& destRegisters = instr->getDestinationRegisters();
-  for(uint16_t i = 0; i < destRegisters.size(); i++) {
-    const auto& destReg = destRegisters[i];
-    if (regMap_[destReg.type][destReg.tag].size() > 0 &&
-        !regMap_[destReg.type][destReg.tag].back()->hasExecuted()) {
-      dependency = true;
-      break;
-    }
+  for (uint16_t i = 0; i < destRegisters.size(); i++) {
+    const auto& destReg = destRegisters[i];
+    if (regMap_[destReg.type][destReg.tag].size() > 0 &&
+        !regMap_[destReg.type][destReg.tag].back()->hasExecuted()) {
+      dependency = true;
+      break;
+    }
  }
  return !dependency || (instr->isLoad());
 }
-//Clean up the options logic to ensure all of them work well together
-bool RegDepMap::canForward(InstrPtr instr)
-{
-  return true;
-}
+// Clean up the options logic to ensure all of them work well together
+bool RegDepMap::canForward(InstrPtr instr) { return true; }
 void RegDepMap::purgeFlushed() {
  for (auto& registerType : regMap_) {
    for (auto& dependencyList : registerType) {
      auto it = dependencyList.begin();
      while (it != dependencyList.end()) {
-        DEBUG("Purge entry present at addr: 0x" << (*it)->getInstructionAddress());
+        DEBUG("Purge entry present at addr: 0x"
+              << (*it)->getInstructionAddress());
        if ((*it)->isFlushed()) {
          outstandingDep_--;
          it = dependencyList.erase(it);
@@ -135,9 +141,7 @@ void RegDepMap::purgeFlushed() {
  }
 }
-void RegDepMap::dump()
-{
-}
+void RegDepMap::dump() {}
 }  // namespace pipeline_hi
 }  // namespace simeng
diff --git a/src/lib/pipeline_hi/WritebackUnit.cc b/src/lib/pipeline_hi/WritebackUnit.cc
index b0dfd97161..dce0dd5e6a 100644
--- a/src/lib/pipeline_hi/WritebackUnit.cc
+++ b/src/lib/pipeline_hi/WritebackUnit.cc
@@ -55,8 +55,9 @@ uint64_t WritebackUnit::getInstructionsWrittenCount() const {
 std::vector> WritebackUnit::getInstsForTrace() {
  std::shared_ptr instr;
-  std::deque>::iterator it = committedInstsForTrace_.begin();
-  while(it != committedInstsForTrace_.end()) {
+  std::deque>::iterator it =
+      committedInstsForTrace_.begin();
+  while (it != committedInstsForTrace_.end()) {
    instr = *it;
    if (removeInstrOrderQ_(instr)) {
      committedInstsForTrace_.erase(it);
@@ -64,10 +65,10 @@ std::vector> WritebackUnit::getInstsForTrace() {
    }
    it++;
  }
-  return {}; //committedInstsForTrace_;
+  return {};  // committedInstsForTrace_;
 }
 void WritebackUnit::traceFinished() {
-  //committedInstsForTrace_.clear();
+  // committedInstsForTrace_.clear();
 }
 }  // namespace pipeline_hi
diff --git a/sst/SimEngMemInterface.cc b/sst/SimEngMemInterface.cc
index 678d985329..d01d7d216e 100644
--- a/sst/SimEngMemInterface.cc
+++ b/sst/SimEngMemInterface.cc
@@ -18,7 +18,8 @@ SimEngMemInterface::SimEngMemInterface(StandardMem* mem, uint64_t cl,
  this->debug_ = debug;
 };
-void SimEngMemInterface::sendProcessImageToSST(char* image, uint64_t size, uint64_t startAddr) {
+void SimEngMemInterface::sendProcessImageToSST(char* image, uint64_t size,
+                                               uint64_t startAddr) {
  std::vector data;
  data.reserve(size);
@@ -26,8 +27,12 @@ void SimEngMemInterface::sendProcessImageToSST(char* image, uint64_t size, uint6
    data.push_back((uint8_t)image[i]);
  }
-  StandardMem::Request* req = new StandardMem::Write(startAddr, data.size(), data);
-  std::cout << std::hex << "[SSTSimEng:SimEngMemInterface] Sending image section to SST Memory at address 0x" << startAddr << ", size 0x" << data.size() << std::endl;
+  StandardMem::Request* req =
+      new StandardMem::Write(startAddr, data.size(), data);
+  std::cout << std::hex
+            << "[SSTSimEng:SimEngMemInterface] Sending image section to SST "
+               "Memory at address 0x"
+            << startAddr << ", size 0x" << data.size() << std::endl;
  sstMem_->sendUntimedData(req);
  return;
 };
@@ -177,7 +182,8 @@ void SimEngMemInterface::requestRead(const MemoryAccessTarget& target,
  if (debug_) {
    std::cout << "[SSTSimEng:SSTDebug] MemRead"
              << "-read-request-" << requestId << "-cycle-" << tickCounter_
-              << "-split-" << requests.size() << "-addr-0x" << std::hex << addrStart << std::endl;
+              << "-split-" << requests.size() << "-addr-0x" << std::hex
+              << addrStart << std::endl;
  }
  for (StandardMem::Request* req : requests) {
    sstMem_->send(req);
@@ -195,8 +201,9 @@ void SimEngMemInterface::requestWrite(const MemoryAccessTarget& target,
  makeSSTRequests(aggrReq, addrStart, addrEnd, size);
  if (debug_) {
    std::cout << "[SSTSimEng:SSTDebug] MemWrite"
              << "-write-request-xx" << "-cycle-" << tickCounter_
-              << "-split-" << requests.size() << "-addr-0x" << std::hex << addrStart << std::endl;
+              << "-write-request-xx"
+              << "-cycle-" << tickCounter_ << "-split-" << requests.size()
+              << "-addr-0x" << std::hex << addrStart << std::endl;
  }
  for (StandardMem::Request* req : requests) {
    sstMem_->send(req);
diff --git a/sst/include/SimEngMemInterface.hh b/sst/include/SimEngMemInterface.hh
index 463d0dc9d5..5a55ec5e10 100644
--- a/sst/include/SimEngMemInterface.hh
+++ b/sst/include/SimEngMemInterface.hh
@@ -33,7 +33,8 @@ class SimEngMemInterface : public MemoryInterface {
                     bool debug);
  /** Send SimEng's processImage to SST memory backend during `init` lifecycle
   * phase of SST. */
-  void sendProcessImageToSST(char* image, uint64_t size, uint64_t startAddr=0);
+  void sendProcessImageToSST(char* image, uint64_t size,
+                             uint64_t startAddr = 0);
  /**
   * Construct an AggregatedReadRequest and use it to generate