diff --git a/.circleci/config.yml b/.circleci/config.yml index fb75f0311f..27cca27cd1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -49,4 +49,5 @@ workflows: - docs-build filters: branches: - only: main \ No newline at end of file + only: + - main \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/simeng-bug-or-issue-report.md b/.github/ISSUE_TEMPLATE/simeng-bug-or-issue-report.md new file mode 100644 index 0000000000..7518e5967a --- /dev/null +++ b/.github/ISSUE_TEMPLATE/simeng-bug-or-issue-report.md @@ -0,0 +1,66 @@
+---
+name: SimEng Bug or Issue Report
+about: Report a bug or unexpected SimEng behaviour
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Check List**
+ - [ ] The binary I am trying to run has been compiled statically for either RV64 or AArch64.
+ - [ ] The compiled binary is a Linux ELF file.
+ - [ ] I have provided both a config file and a binary to SimEng as runtime arguments.
+
+**Binary File Information**
+Please run `file` on the binary used and paste the output below (e.g. `file myBinary.elf`).
+```bash
+```
+
+**System Description**
+Please provide the following as a list:
+ - The Operating System of the system you are running SimEng on
+ - The compiler used to compile SimEng and its version
+ - The compiler used to compile the static binary and its version
+ - The ISA or specific processor that the binary was compiled for
+   - For example, if `-march=armv8.4-a+sve` was used, then present `armv8.4-a+sve`
+   - If `-mcpu=neoverse-v1` or similar was used, then present `neoverse-v1`
+ - The processor of the system you are running SimEng on
+ - The main memory capacity of the system you are running SimEng on
+
+**SimEng Version**
+Provide the SimEng repository branch, commit hash, and version tag (if relevant) that the issue is present on.
+
+**SimEng CMake Options Used**
+Provide a bullet list of all CMake options used, e.g. `-DCMAKE_BUILD_TYPE=Release`.
+
+**Binary Compilation Instructions**
+Provide a bullet list of how the binary in question was compiled, including all compiler flags used.
+
+**SimEng Command Line Expression**
+Provide the command line used to run SimEng, e.g. `./simeng /path/to/configs/a64fx.yaml /path/to/myBinary.elf`
+
+**SimEng Metadata Output**
+If your simulation begins to execute the binary, please provide the metadata that SimEng prints at the start of execution.
+E.g.
+```bash
+./simeng configs/a64fx.yaml myStaticBinary.elf
+[SimEng] Build metadata:
+[SimEng] Version: 0.9.6
+[SimEng] Compile Time - Date: 14:01:44 - Jun 19 2024
+[SimEng] Build type: Debug
+[SimEng] Compile options: $<$:-fno-rtti>;-Wall;-pedantic;-Werror
+[SimEng] Test suite: ON
+
+[SimEng] Running in Out-of-Order mode
+[SimEng] Workload: /home/SimEng/myStaticBinary.elf
+[SimEng] Config file: /home/SimEng/configs/a64fx.yaml
+[SimEng] ISA: AArch64
+[SimEng] Auto-generated Special File directory: True
+[SimEng] Special File directory used: /home/SimEng/build/specialFiles/
+[SimEng] Number of Cores: 1
+[SimEng] Starting...
+```
+
+**Problem Description**
+Explain what you think should happen, and what actually happens.
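As a worked illustration of the checklist and compilation-instruction sections above, here is a minimal sketch of a static AArch64 build and the matching SimEng invocation. The cross-compiler name (`aarch64-linux-gnu-gcc`), source file, and paths are illustrative placeholders and not part of the template itself.

```bash
# Hypothetical example: statically compile a small C program for AArch64.
# Any toolchain that produces a static Linux ELF works; this toolchain name is an assumption.
aarch64-linux-gnu-gcc -static -march=armv8.4-a+sve -O2 -o myBinary.elf myBinary.c

# Confirm it is a static Linux ELF, as the checklist requires.
file myBinary.elf

# Run it under SimEng with a supplied config file (paths are placeholders).
./simeng /path/to/configs/a64fx.yaml /path/to/myBinary.elf
```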
diff --git a/.github/ISSUE_TEMPLATE/simeng-instruction-NYI-or-Alias.md b/.github/ISSUE_TEMPLATE/simeng-instruction-NYI-or-Alias.md new file mode 100644 index 0000000000..5f0388673c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/simeng-instruction-NYI-or-Alias.md @@ -0,0 +1,51 @@
+---
+name: SimEng Instruction Not-Yet-Implemented, Alias, or Missing System Register
+about: Report an instruction which does not yet have execution logic implemented (NYI), an instruction whose alias needs resolving, or an instruction which targets a system register that is not yet supported.
+title: '[NYI|ALIAS|SYSREG]'
+labels: ''
+assignees: ''
+
+---
+
+**Check List**
+ - [ ] I have selected the appropriate title prefix from the available options within `[NYI|ALIAS|SYSREG]`, and added a suitable title after this.
+ - [ ] The binary I am trying to run has been compiled statically for either RV64 or AArch64.
+ - [ ] The compiled binary is a Linux ELF file.
+
+**Binary Compilation Instructions**
+Provide a bullet list of how the binary in question was compiled, including all compiler flags used.
+
+**SimEng CMake Options Used**
+Provide a bullet list of all CMake options used, e.g. `-DCMAKE_BUILD_TYPE=Release`.
+
+**SimEng Command Line Expression**
+Provide the command line used to run SimEng, e.g. `./simeng /path/to/configs/a64fx.yaml /path/to/myBinary.elf`
+
+**Console Printout**
+Please provide a copy of SimEng's console printout prior to its end-of-execution statistics. Please do not include any binary-specific output.
+E.g.
+```bash
+./simeng configs/a64fx.yaml myStaticBinary.elf
+[SimEng] Build metadata:
+[SimEng] Version: 0.9.6
+[SimEng] Compile Time - Date: 14:01:44 - Jun 19 2024
+[SimEng] Build type: Debug
+[SimEng] Compile options: $<$:-fno-rtti>;-Wall;-pedantic;-Werror
+[SimEng] Test suite: ON
+
+[SimEng] Running in Out-of-Order mode
+[SimEng] Workload: /home/SimEng/myStaticBinary.elf
+[SimEng] Config file: /home/SimEng/configs/a64fx.yaml
+[SimEng] ISA: AArch64
+[SimEng] Auto-generated Special File directory: True
+[SimEng] Special File directory used: /home/SimEng/build/specialFiles/
+[SimEng] Number of Cores: 1
+[SimEng] Starting...
+
+
+[SimEng:ExceptionHandler] Encountered execution not-yet-implemented exception
+[SimEng:ExceptionHandler] Generated by instruction:
+[SimEng:ExceptionHandler] 0x0000000000402dc8: 1d 00 80 d2 mov x29, #0
+[SimEng:ExceptionHandler] opcode ID: 3680
+[SimEng:Core] Halting due to fatal exception
+```
diff --git a/.github/actions/select_setup/action.yml b/.github/actions/select_setup/action.yml new file mode 100644 index 0000000000..4b8f6c0579 --- /dev/null +++ b/.github/actions/select_setup/action.yml @@ -0,0 +1,113 @@
+name: Setup compiler and OS combo
+description: installs dependencies and correct compiler/OS versions to build and test simeng
+
+##############################################################################
+# Calls the correct setup action based on parameters passed into this action.
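+#
+# For reference, a sketch of how a workflow job is expected to invoke this composite
+# action (the literal values are illustrative; the real call in LINUX_BUILD_TEST.yml
+# passes matrix.OS, matrix.COMPILER and the workflow's SIMENG-MODE input):
+#
+#   - uses: ./.github/actions/select_setup
+#     with:
+#       OS: 'ubuntu:20.04'
+#       COMPILER: 'gcc-10'
+#       MODE: 'Release'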
+############################################################################## + +inputs: + OS: + required: true + COMPILER: + required: true + MODE: + required: true + +runs: + using: 'composite' + steps: + ########################################## + # GCC jobs + ########################################## + + # ubuntu + - if: ${{ contains( inputs.COMPILER, 'gcc') && contains( inputs.OS, 'ubuntu') }} + name: Install ${{ inputs.COMPILER }} + Build SimEng + uses: ./.github/actions/setup_gcc_ubuntu + with: + OS: ${{ inputs.OS }} + gcc-version: ${{ inputs.COMPILER }} + MODE: ${{ inputs.mode }} + + # rocky linux + - if: ${{ contains( inputs.COMPILER, 'gcc') && contains( inputs.OS, 'rocky') }} + name: Install ${{ inputs.COMPILER }} + Build SimEng + uses: ./.github/actions/setup_gcc_rocky + with: + OS: ${{ inputs.OS }} + gcc-version: ${{ inputs.COMPILER }} + MODE: ${{ inputs.mode }} + + # red hat + - if: ${{ contains( inputs.COMPILER, 'gcc') && contains( inputs.OS, 'redhat') }} + name: Install ${{ inputs.COMPILER }} + Build SimEng + uses: ./.github/actions/setup_gcc_redhat + with: + OS: ${{ inputs.OS }} + gcc-version: ${{ inputs.COMPILER }} + MODE: ${{ inputs.mode }} + + # debian + - if: ${{ contains( inputs.COMPILER, 'gcc') && contains( inputs.OS, 'debian') }} + name: Install ${{ inputs.COMPILER }} + Build SimEng + uses: ./.github/actions/setup_gcc_debian + with: + OS: ${{ inputs.OS }} + gcc-version: ${{ inputs.COMPILER }} + MODE: ${{ inputs.mode }} + + # macos + - if: ${{ contains( inputs.COMPILER, 'gcc') && contains( inputs.OS, 'macos') }} + name: Install ${{ inputs.COMPILER }} + Build SimEng + uses: ./.github/actions/setup_gcc_macos + with: + OS: ${{ inputs.OS }} + gcc-version: ${{ inputs.COMPILER }} + MODE: ${{ inputs.mode }} + + ########################################## + # APPLE CLANG + ########################################## + + - if: ${{ contains( inputs.COMPILER, 'clang') && contains( inputs.OS, 'macos') }} + name: Install ${{ inputs.COMPILER }} + Build SimEng + uses: ./.github/actions/setup_clang_macos + with: + OS: ${{ inputs.OS }} + MODE: ${{ inputs.mode }} + + ########################################## + # ARM CLANG + ########################################## + + # ubuntu + - if: ${{ contains( inputs.COMPILER, 'armclang') && contains( inputs.OS, 'ubuntu') }} + name: Install ${{ inputs.COMPILER }} + Build SimEng + uses: ./.github/actions/setup_armclang_ubuntu + with: + OS: ${{ inputs.OS }} + MODE: ${{ inputs.mode }} + + # debian + - if: ${{ contains( inputs.COMPILER, 'armclang') && contains( inputs.OS, 'debian') }} + name: Install ${{ inputs.COMPILER }} + Build SimEng + uses: ./.github/actions/setup_armclang_debian + with: + OS: ${{ inputs.OS }} + MODE: ${{ inputs.mode }} + + # redhat + - if: ${{ contains( inputs.COMPILER, 'armclang') && contains( inputs.OS, 'redhat') }} + name: Install ${{ inputs.COMPILER }} + Build SimEng + uses: ./.github/actions/setup_armclang_redhat + with: + OS: ${{ inputs.OS }} + MODE: ${{ inputs.mode }} + + # rocky + - if: ${{ contains( inputs.COMPILER, 'armclang') && contains( inputs.OS, 'rocky') }} + name: Install ${{ inputs.COMPILER }} + Build SimEng + uses: ./.github/actions/setup_armclang_rocky + with: + OS: ${{ inputs.OS }} + MODE: ${{ inputs.mode }} \ No newline at end of file diff --git a/.github/actions/setup_armclang_debian/action.yml b/.github/actions/setup_armclang_debian/action.yml new file mode 100644 index 0000000000..fc54a98284 --- /dev/null +++ b/.github/actions/setup_armclang_debian/action.yml @@ -0,0 +1,95 @@ +name: setup armclang 
+description: installs dependencies and correct armclang version to build and test simeng + +inputs: + OS: + description: docker image name + required: true + MODE: + description: simeng-mode e.g. Release or Debug + required: true + +runs: + using: 'composite' + steps: + ####################################### + # Install dependencies required (cmake, etc). + ####################################### + - name: Install dependencies + shell: bash + run: | + export DEBIAN_FRONTEND=noninteractive + + # Update package lists + apt-get update + + # Install essential packages + apt-get install -y \ + python3-launchpadlib \ + software-properties-common \ + build-essential \ + sudo \ + wget \ + zlib1g-dev \ + python3 \ + build-essential \ + libssl-dev \ + ninja-build \ + tree \ + git + + # add pyparsing for benchmarking + apt-get install -y python3-pip + pip3 install pyparsing + + apt-get update + apt-get upgrade -y + + ####################################### + # Install Cmake + ####################################### + - name: Install Cmake via apt + shell: bash + run: | + wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null && \ + apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main' && \ + apt update && apt install cmake -y + apt upgrade -y + + ####################################### + # Install ArmClang + ####################################### + - name: Install armclang + shell: bash + run: | + apt-get update + apt-get upgrade -y + apt-get install environment-modules + source /etc/profile.d/modules.sh + + wget https://developer.arm.com/-/cdn-downloads/permalink/Arm-Compiler-for-Linux/Version_24.04/arm-compiler-for-linux_24.04_Ubuntu-22.04_aarch64.tar + tar -xf arm-compiler-for-linux_24.04_Ubuntu-22.04_aarch64.tar + + ./arm-compiler-for-linux_24.04_Ubuntu-20.04/arm-compiler-for-linux_24.04_Ubuntu-20.04.sh --install-to ./armclang_compiler + export MODULEPATH=$MODULEPATH:$(pwd)/armclang_compiler/modulefiles + + module avail + module load acfl/24.04 + armclang -v + + echo "C_COMPILER=$(which armclang)" >> $GITHUB_ENV + echo "CPP_COMPILER=$(which armclang++)" >> $GITHUB_ENV + + ####################################### + # Build SimEng + ####################################### + - name: Build SimEng + shell: bash + run: | + cmake -B build -S . -DCMAKE_BUILD_TYPE=${{ inputs.MODE }} -DSIMENG_ENABLE_TESTS=ON -DSIMENG_OPTIMIZE=ON -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} -DCMAKE_CXX_COMPILER=${{ env.CPP_COMPILER }} + + cmake --build build -j $(nproc) + + cmake --build build --target install + + diff --git a/.github/actions/setup_armclang_redhat/action.yml b/.github/actions/setup_armclang_redhat/action.yml new file mode 100644 index 0000000000..592e375d01 --- /dev/null +++ b/.github/actions/setup_armclang_redhat/action.yml @@ -0,0 +1,91 @@ +name: setup armclang +description: installs dependencies and correct armclang version to build and test simeng + +inputs: + OS: + description: docker image name + required: true + MODE: + description: simeng-mode e.g. Release or Debug + required: true + +runs: + using: 'composite' + steps: + ####################################### + # Install dependencies required (cmake, etc). 
+ ####################################### + - name: Install dependencies + shell: bash + run: | + dnf -y update && \ + dnf -y install \ + gcc gcc-c++ make \ + wget \ + python3 \ + git \ + diffutils \ + openssl-devel \ + bzip2 \ + automake \ + autoconf \ + cmake \ + file \ + zlib-devel + + if [[ ${{ inputs.OS }} == 'redhat/ubi8:latest' ]]; then + dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm + elif [[ ${{ inputs.OS }} == 'redhat/ubi9:latest' ]]; then + dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm + fi + + # add pyparsing for benchmarking + dnf install -y python3-pip + pip3 install pyparsing + + dnf update -y + dnf upgrade -y + dnf clean all + + ####################################### + # Install Cmake + ####################################### + - name: Install Cmake via DNF + shell: bash + run: | + dnf install -y cmake + + ###################################### + # Install ArmClang + ###################################### + - name: Install armclang + shell: bash + run: | + + dnf install -y environment-modules + source /etc/profile.d/modules.sh + + wget https://developer.arm.com/-/cdn-downloads/permalink/Arm-Compiler-for-Linux/Version_24.04/arm-compiler-for-linux_24.04_Ubuntu-22.04_aarch64.tar + tar -xf arm-compiler-for-linux_24.04_Ubuntu-22.04_aarch64.tar + + ./arm-compiler-for-linux_24.04_Ubuntu-20.04/arm-compiler-for-linux_24.04_Ubuntu-20.04.sh --install-to ./armclang_compiler + export MODULEPATH=$MODULEPATH:$(pwd)/armclang_compiler/modulefiles + + module avail + module load acfl/24.04 + armclang -v + + echo "C_COMPILER=$(which armclang)" >> $GITHUB_ENV + echo "CPP_COMPILER=$(which armclang++)" >> $GITHUB_ENV + + ####################################### + # Build SimEng + ####################################### + - name: Build SimEng + shell: bash + run: | + cmake -B build -S . -DCMAKE_BUILD_TYPE=${{ inputs.MODE }} -DSIMENG_ENABLE_TESTS=ON -DSIMENG_OPTIMIZE=ON -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} -DCMAKE_CXX_COMPILER=${{ env.CPP_COMPILER }} + + cmake --build build -j $(nproc) + + cmake --build build --target install diff --git a/.github/actions/setup_armclang_rocky/action.yml b/.github/actions/setup_armclang_rocky/action.yml new file mode 100644 index 0000000000..93f337f02a --- /dev/null +++ b/.github/actions/setup_armclang_rocky/action.yml @@ -0,0 +1,86 @@ +name: setup armclang +description: installs dependencies and correct armclang version to build and test simeng + +inputs: + OS: + description: docker image name + required: true + MODE: + description: simeng-mode e.g. Release or Debug + required: true + +runs: + using: 'composite' + steps: + ####################################### + # Install dependencies required (cmake, etc). 
+ ####################################### + - name: Install dependencies + shell: bash + run: | + dnf -y update && \ + dnf install -y epel-release \ + gcc gcc-c++ make \ + git \ + wget \ + openssl-devel \ + automake \ + autoconf \ + bzip2 \ + file \ + sudo \ + tree \ + zlib-devel + + # add pyparsing for benchmarking + dnf install -y python3-pip + pip3 install pyparsing + + dnf group install -y "Development Tools" + dnf update -y + dnf upgrade -y + + ####################################### + # Install Cmake + ####################################### + - name: Install Cmake via DNF + shell: bash + run: | + dnf install -y cmake + + ###################################### + # Install ArmClang + ###################################### + - name: Install armclang + shell: bash + run: | + + dnf install -y environment-modules + source /etc/profile.d/modules.sh + + wget https://developer.arm.com/-/cdn-downloads/permalink/Arm-Compiler-for-Linux/Version_24.04/arm-compiler-for-linux_24.04_Ubuntu-22.04_aarch64.tar + tar -xf arm-compiler-for-linux_24.04_Ubuntu-22.04_aarch64.tar + + ./arm-compiler-for-linux_24.04_Ubuntu-20.04/arm-compiler-for-linux_24.04_Ubuntu-20.04.sh --install-to ./armclang_compiler + export MODULEPATH=$MODULEPATH:$(pwd)/armclang_compiler/modulefiles + + module avail + module load acfl/24.04 + armclang -v + + echo "C_COMPILER=$(which armclang)" >> $GITHUB_ENV + echo "CPP_COMPILER=$(which armclang++)" >> $GITHUB_ENV + + ####################################### + # Build SimEng + ####################################### + - name: Build SimEng + shell: bash + run: | + cmake -B build -S . -DCMAKE_BUILD_TYPE=${{ inputs.MODE }} -DSIMENG_ENABLE_TESTS=ON -DSIMENG_OPTIMIZE=ON -DCMAKE_C_COMPILER=${{ env.GCC_DIR }} \ + -DCMAKE_CXX_COMPILER=${{ env.CPP_DIR }} + + cmake --build build -j $(nproc) + + cmake --build build --target install + diff --git a/.github/actions/setup_armclang_ubuntu/action.yml b/.github/actions/setup_armclang_ubuntu/action.yml new file mode 100644 index 0000000000..dada710f68 --- /dev/null +++ b/.github/actions/setup_armclang_ubuntu/action.yml @@ -0,0 +1,105 @@ +name: setup armclang +description: installs dependencies and correct armclang version to build and test simeng + +inputs: + OS: + description: docker image name + required: true + MODE: + description: simeng-mode e.g. 
Release or Debug + required: true + +runs: + using: 'composite' + steps: + ####################################### + # Install dependencies + ####################################### + - name: Install dependencies + run: | + export DEBIAN_FRONTEND=noninteractive + + # Update package lists + apt-get update + + # Install essential packages + apt-get install -y \ + software-properties-common \ + sudo \ + wget \ + zlib1g-dev \ + python3 \ + build-essential \ + libssl-dev \ + ninja-build \ + tree \ + git + + # add pyparsing for benchmarking + apt-get install -y python3-pip + pip3 install pyparsing + + # Add additional repositories + add-apt-repository universe + add-apt-repository ppa:ubuntu-toolchain-r/test + + # Update package lists again after adding repositories + apt-get update + + # Upgrade all installed packages + apt-get upgrade -y + + ####################################### + # Install Cmake + ####################################### + - name: Install cmake via apt + shell: bash + run: | + if [[ ${{ inputs.OS == 'ubuntu:18.04' }} ]]; then + NAME=bionic + else + NAME=focal + fi + + wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null && \ + apt-add-repository "deb https://apt.kitware.com/ubuntu/ $NAME main" && \ + apt update && apt install cmake -y + apt upgrade -y + + ####################################### + # Install ArmClang + ####################################### + - name: Install armclang + shell: bash + run: | + apt-get update + apt-get upgrade -y + apt-get install environment-modules + source /etc/profile.d/modules.sh + + wget https://developer.arm.com/-/cdn-downloads/permalink/Arm-Compiler-for-Linux/Version_24.04/arm-compiler-for-linux_24.04_Ubuntu-22.04_aarch64.tar + tar -xf arm-compiler-for-linux_24.04_Ubuntu-22.04_aarch64.tar + + ./arm-compiler-for-linux_24.04_Ubuntu-20.04/arm-compiler-for-linux_24.04_Ubuntu-20.04.sh --install-to ./armclang_compiler + export MODULEPATH=$MODULEPATH:$(pwd)/armclang_compiler/modulefiles + + module avail + module load acfl/24.04 + armclang -v + echo "C_COMPILER=$(which armclang)" >> $GITHUB_ENV + echo "CPP_COMPILER=$(which armclang++)" >> $GITHUB_ENV + + ####################################### + # Build SimEng + ####################################### + + - name: Build SimEng + shell: bash + run: | + cmake -B build -S . -DCMAKE_BUILD_TYPE=${{ inputs.MODE }} -DSIMENG_ENABLE_TESTS=ON -DSIMENG_OPTIMIZE=ON -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} -DCMAKE_CXX_COMPILER=${{ env.CPP_COMPILER }} + + cmake --build build -j $(nproc) + + cmake --build build --target install + + \ No newline at end of file diff --git a/.github/actions/setup_clang_macos/action.yml b/.github/actions/setup_clang_macos/action.yml new file mode 100644 index 0000000000..067c6c3565 --- /dev/null +++ b/.github/actions/setup_clang_macos/action.yml @@ -0,0 +1,44 @@ +name: setup clang for apple +description: build and test simeng using clang compiler on apple + +inputs: + OS: + description: docker image name + required: true + MODE: + description: simeng-mode e.g. 
Release or Debug + required: true +runs: + using: 'composite' + steps: + ####################################### + # Install dependencies required + ####################################### + - name: Install dependencies + shell: bash + run: | + pip3 install pyparsing + brew install make + + ####################################### + # Clang is already installed with xcode + ####################################### + + - name: set clang and clang++ env variables + shell: bash + run: | + echo "C_COMPILER=$(which clang)" >> $GITHUB_ENV + echo "CPP_COMPILER=$(which clang++)" >> $GITHUB_ENV + + # ####################################### + # # Build SimEng + # ####################################### + - name: Build SimEng + shell: bash + run: | + cmake -B build -S . -DCMAKE_BUILD_TYPE=${{ inputs.MODE }} -DSIMENG_ENABLE_TESTS=ON -DSIMENG_OPTIMIZE=ON -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} \ + -DCMAKE_CXX_COMPILER=${{ env.CPP_COMPILER }} + + cmake --build build -j $(sysctl -n hw.ncpu) + + cmake --build build --target install diff --git a/.github/actions/setup_gcc_debian/action.yml b/.github/actions/setup_gcc_debian/action.yml new file mode 100644 index 0000000000..90e4798d95 --- /dev/null +++ b/.github/actions/setup_gcc_debian/action.yml @@ -0,0 +1,128 @@ +name: setup gcc +description: installs dependencies and correct gcc version to build and test simeng + +inputs: + OS: + description: docker image name + required: true + gcc-version: + description: gcc version + required: true + MODE: + description: Release or Debug mode + required: true + +runs: + using: 'composite' + steps: + ####################################### + # Install dependencies required (cmake, etc). + ####################################### + - name: Install dependencies + shell: bash + run: | + export DEBIAN_FRONTEND=noninteractive + + # Update package lists + apt-get update + + # Install essential packages + apt-get install -y \ + python3-launchpadlib \ + software-properties-common \ + build-essential \ + sudo \ + wget \ + zlib1g-dev \ + python3 \ + build-essential \ + libssl-dev \ + ninja-build \ + tree \ + git + + # add pyparsing for benchmarking + apt-get install -y python3-pip + pip3 install pyparsing + + apt-get update + apt-get upgrade -y + + ####################################### + # Install Cmake + ####################################### + - name: Install Cmake via apt + shell: bash + run: | + wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null && \ + apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main' && \ + apt update && apt install cmake -y + apt upgrade -y + + ####################################### + # Restore gcc from cache. + ####################################### + - name: restore gcc + uses: actions/cache/restore@v4 + id: gcc-restore-v4 + with: + path: /usr/local/${{ inputs.gcc-version }}.5.0 + key: ${{ inputs.gcc-version }}-${{ inputs.OS }} + + ####################################### + # If restoring gcc set env vars for info step in OS_BUILD_TEST.yml. + ####################################### + - if: ${{ steps.gcc-restore-v4.outputs.cache-hit == 'true' }} + name: set env vars if restoring from cache + shell: bash + run: | + echo "C_COMPILER=/usr/local/${{ inputs.gcc-version }}.5.0/bin/gcc" >> $GITHUB_ENV + echo "CPP_COMPILER=/usr/local/${{ inputs.gcc-version }}.5.0/bin/g++" >> $GITHUB_ENV + + ####################################### + # Install gcc from source. 
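+      # For example, gcc-10 resolves to GCC_VER=gcc-10.5.0: the gcc-10.5.0.tar.gz release tarball
+      # is downloaded, configured with --disable-multilib, and installed to /usr/local/gcc-10.5.0
+      # so the save-cache step below can reuse the build on later runs.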
+ ####################################### + - if: ${{ steps.gcc-restore-v4.outputs.cache-hit != 'true' }} + name: install gcc + shell: bash + run: | + GCC_VER="${{ inputs.gcc-version }}.5.0" + + wget https://ftp.gnu.org/gnu/gcc/$GCC_VER/$GCC_VER.tar.gz > /dev/null 2>&1 + tar zxf $GCC_VER.tar.gz + cd $GCC_VER + ./contrib/download_prerequisites + cd .. + mkdir gcc-build + cd gcc-build + ../$GCC_VER/configure --enable-languages=c,c++ --disable-multilib --prefix=/usr/local/$GCC_VER + make -j$(nproc) + make install + + echo "C_COMPILER=/usr/local/$GCC_VER/bin/gcc" >> $GITHUB_ENV + echo "CPP_COMPILER=/usr/local/$GCC_VER/bin/g++" >> $GITHUB_ENV + + ####################################### + # Save gcc to cache if earlier miss occurred. + ####################################### + - if: ${{ steps.gcc-restore-v4.outputs.cache-hit != 'true' }} + name: save gcc + uses: actions/cache/save@v4 + id: gcc-save-v4 + with: + path: /usr/local/${{ inputs.gcc-version }}.5.0 + key: ${{ inputs.gcc-version }}-${{ inputs.OS }} + + ####################################### + # Build SimEng + ####################################### + - name: Build SimEng + shell: bash + run: | + cmake -B build -S . -DCMAKE_BUILD_TYPE=${{ inputs.MODE }} -DSIMENG_ENABLE_TESTS=ON -DSIMENG_OPTIMIZE=ON -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} -DCMAKE_CXX_COMPILER=${{ env.CPP_COMPILER }} + + cmake --build build -j $(nproc) + + cmake --build build --target install + + diff --git a/.github/actions/setup_gcc_macos/action.yml b/.github/actions/setup_gcc_macos/action.yml new file mode 100644 index 0000000000..9992548c26 --- /dev/null +++ b/.github/actions/setup_gcc_macos/action.yml @@ -0,0 +1,50 @@ +name: setup gcc +description: installs dependencies and correct gcc version to build and test simeng + +inputs: + OS: + description: docker image name + required: true + gcc-version: + description: gcc version + required: true + MODE: + description: Release or Debug mode + required: true + +runs: + using: 'composite' + steps: + ####################################### + # Install dependencies required (cmake, etc). + ####################################### + + - name: install dependencies + shell: bash + run: | + pip3 install pyparsing + brew install make + + ####################################### + # Install gcc from source if not in cache. + ####################################### + - name: install gcc + shell: bash + run: | + brew install gcc@$( echo ${{ inputs.gcc-version }} | cut -d '-' -f 2) + + echo "C_COMPILER=/usr/local/bin/${{ inputs.gcc-version }}" >> $GITHUB_ENV + echo "CPP_COMPILER=/usr/local/bin/g++-$( echo ${{ inputs.gcc-version }} | cut -d '-' -f 2)" >> $GITHUB_ENV + + ####################################### + # Build SimEng + ####################################### + - name: Build SimEng + shell: bash + run: | + cmake -B build -S . 
-DCMAKE_BUILD_TYPE=${{ inputs.MODE }} -DSIMENG_ENABLE_TESTS=ON -DSIMENG_OPTIMIZE=ON -DCMAKE_C_C_COMPILER=${{ env.C_COMPILER }} \ + -DCMAKE_CXX_C_COMPILER=${{ env.CPP_COMPILER }} + + cmake --build build -j $(sysctl -n hw.ncpu) + + cmake --build build --target install diff --git a/.github/actions/setup_gcc_redhat/action.yml b/.github/actions/setup_gcc_redhat/action.yml new file mode 100644 index 0000000000..a0d332d1f9 --- /dev/null +++ b/.github/actions/setup_gcc_redhat/action.yml @@ -0,0 +1,126 @@ +name: setup gcc +description: installs dependencies and correct gcc version to build and test simeng + +inputs: + OS: + description: docker image name + required: true + gcc-version: + description: gcc version + required: true + MODE: + description: Release or Debug mode + required: true + +runs: + using: 'composite' + steps: + ####################################### + # Install dependencies required (cmake, etc). + ####################################### + - name: Install dependencies + shell: bash + run: | + dnf -y update && \ + dnf -y install \ + gcc gcc-c++ make \ + wget \ + python3 \ + git \ + diffutils \ + openssl-devel \ + bzip2 \ + automake \ + autoconf \ + cmake \ + file \ + zlib-devel + + if [[ ${{ inputs.OS }} == 'redhat/ubi8:latest' ]]; then + dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm + elif [[ ${{ inputs.OS }} == 'redhat/ubi9:latest' ]]; then + dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm + fi + + # add pyparsing for benchmarking + dnf install -y python3-pip + pip3 install pyparsing + + dnf update -y + dnf upgrade -y + dnf clean all + + ####################################### + # Install Cmake + ####################################### + - name: Install Cmake via DNF + shell: bash + run: | + dnf install -y cmake + + ####################################### + # Restore gcc 7 or 8 from cache. + ####################################### + - name: restore gcc + uses: actions/cache/restore@v4 + id: gcc-restore-v4 + with: + path: /usr/local/${{ inputs.gcc-version }}.5.0 + key: ${{ inputs.gcc-version }}-${{ inputs.OS }} + + ####################################### + # If restoring gcc set env vars for info step in OS_BUILD_TEST.yml. + ####################################### + - if: ${{ steps.gcc-restore-v4.outputs.cache-hit == 'true' }} + name: set env vars if restoring from cache + shell: bash + run: | + echo "C_COMPILER=/usr/local/${{ inputs.gcc-version }}.5.0/bin/gcc" >> $GITHUB_ENV + echo "CPP_COMPILER=/usr/local/${{ inputs.gcc-version }}.5.0/bin/g++" >> $GITHUB_ENV + + ####################################### + # As redhat 8 doesn't support installation of gcc via package manager unless you have a subscription, + # hence, we install all gcc versions from source. + ####################################### + - if: ${{ steps.gcc-restore-v4.outputs.cache-hit != 'true' }} + name: install gcc + shell: bash + run: | + GCC_VER="${{ inputs.gcc-version }}.5.0" + + wget https://ftp.gnu.org/gnu/gcc/$GCC_VER/$GCC_VER.tar.gz > /dev/null 2>&1 + tar xf $GCC_VER.tar.gz + cd $GCC_VER + ./contrib/download_prerequisites + cd .. + mkdir gcc-build + cd gcc-build + ../$GCC_VER/configure --enable-languages=c,c++ --disable-multilib --prefix=/usr/local/$GCC_VER + make -j$(nproc) + make install + + echo "C_COMPILER=/usr/local/$GCC_VER/bin/gcc" >> $GITHUB_ENV + echo "CPP_COMPILER=/usr/local/$GCC_VER/bin/g++" >> $GITHUB_ENV + + ####################################### + # Save gcc to cache if earlier miss occurred. 
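+      # The cache key pairs the requested gcc version with the OS image (e.g. gcc-8-redhat/ubi8:latest),
+      # so each container image keeps its own cached copy of the /usr/local/<gcc-version>.5.0 build.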
+ ####################################### + - if: ${{ steps.gcc-restore-v4.outputs.cache-hit != 'true' }} + name: save gcc + uses: actions/cache/save@v4 + id: gcc-save-v4 + with: + path: /usr/local/${{ inputs.gcc-version }}.5.0 + key: ${{ inputs.gcc-version }}-${{ inputs.OS }} + + ####################################### + # Build SimEng + ####################################### + - name: Build SimEng + shell: bash + run: | + cmake -B build -S . -DCMAKE_BUILD_TYPE=${{ inputs.MODE }} -DSIMENG_ENABLE_TESTS=ON -DSIMENG_OPTIMIZE=ON -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} -DCMAKE_CXX_COMPILER=${{ env.CPP_COMPILER }} + + cmake --build build -j $(nproc) + + cmake --build build --target install diff --git a/.github/actions/setup_gcc_rocky/action.yml b/.github/actions/setup_gcc_rocky/action.yml new file mode 100644 index 0000000000..6798910248 --- /dev/null +++ b/.github/actions/setup_gcc_rocky/action.yml @@ -0,0 +1,127 @@ +name: setup gcc +description: installs dependencies and correct gcc version to build and test simeng + +inputs: + OS: + description: docker image name + required: true + gcc-version: + description: gcc version + required: true + MODE: + description: Release or Debug mode + required: true + +runs: + using: 'composite' + steps: + ####################################### + # Install dependencies required (cmake, etc). + ####################################### + - name: Install dependencies + shell: bash + run: | + dnf -y update && \ + dnf install -y epel-release \ + gcc gcc-c++ make \ + git \ + wget \ + openssl-devel \ + automake \ + autoconf \ + bzip2 \ + file \ + sudo \ + tree \ + zlib-devel + + # add pyparsing for benchmarking + dnf install -y python3-pip + pip3 install pyparsing + + dnf group install -y "Development Tools" + dnf update -y + dnf upgrade -y + + ####################################### + # Install Cmake + ####################################### + - name: Install Cmake via DNF + shell: bash + run: | + dnf install -y cmake + + ####################################### + # Restore gcc 7 or 8 from cache. + ####################################### + - if: ${{ contains(fromJson('["gcc-7", "gcc-8"]'), inputs.gcc-version) }} + name: restore gcc + uses: actions/cache/restore@v4 + id: gcc-restore-v4 + with: + path: /usr/local/${{ inputs.gcc-version }}.5.0 + key: ${{ inputs.gcc-version }}-${{ inputs.OS }} + + ####################################### + # If restoring gcc set env vars for info step in OS_BUILD_TEST.yml. + ####################################### + - if: ${{ steps.gcc-restore-v4.outputs.cache-hit == 'true' }} + name: set env vars if restoring from cache + shell: bash + run: | + echo "C_COMPILER=/usr/local/${{ inputs.gcc-version }}.5.0/bin/gcc" >> $GITHUB_ENV + echo "CPP_COMPILER=/usr/local/${{ inputs.gcc-version }}.5.0/bin/g++" >> $GITHUB_ENV + + ####################################### + # As rocky 8 doesn't support installing older versions of gcc via package manager, + # gcc 7 and 8 are installed from source and gcc 9 and 10 are installed via dnf. 
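+      # For example, gcc-10 is installed as the gcc-toolset-10 package, with compilers under
+      # /opt/rh/gcc-toolset-10/root/usr/bin, while gcc-7 and gcc-8 fall through to the source build below.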
+ ####################################### + - if: ${{ steps.gcc-restore-v4.outputs.cache-hit != 'true' }} + name: install gcc + shell: bash + run: | + if [[ " gcc-9 gcc-10 " =~ (^|[[:space:]])${{inputs.gcc-version}}($|[[:space:]]) ]]; then + dnf install -y gcc-toolset-$( echo ${{ inputs.gcc-version }} | cut -d '-' -f 2) + echo "C_COMPILER=/opt/rh/gcc-toolset-$( echo ${{ inputs.gcc-version }} | cut -d '-' -f 2)/root/usr/bin/gcc" >> $GITHUB_ENV + echo "CPP_COMPILER=/opt/rh/gcc-toolset-$( echo ${{ inputs.gcc-version }} | cut -d '-' -f 2)/root/usr/bin/g++" >> $GITHUB_ENV + else + GCC_VER="${{ inputs.gcc-version }}.5.0" + + wget https://ftp.gnu.org/gnu/gcc/$GCC_VER/$GCC_VER.tar.gz > /dev/null 2>&1 + tar zxf $GCC_VER.tar.gz + cd $GCC_VER + ./contrib/download_prerequisites + cd .. + mkdir gcc-build + cd gcc-build + ../$GCC_VER/configure --enable-languages=c,c++ --disable-multilib --prefix=/usr/local/$GCC_VER + make -j$(nproc) + make install + + echo "C_COMPILER=/usr/local/$GCC_VER/bin/gcc" >> $GITHUB_ENV + echo "CPP_COMPILER=/usr/local/$GCC_VER/bin/g++" >> $GITHUB_ENV + fi + + ####################################### + # Save gcc to cache if earlier miss occured. + ####################################### + - if: ${{ contains(fromJson('["gcc-7", "gcc-8"]'), inputs.gcc-version) && steps.gcc-restore-v4.outputs.cache-hit != 'true' }} + name: save gcc + uses: actions/cache/save@v4 + id: gcc-save-v4 + with: + path: /usr/local/${{ inputs.gcc-version }}.5.0 + key: ${{ inputs.gcc-version }}-${{ inputs.OS }} + + ####################################### + # Build SimEng + ####################################### + - name: Build SimEng + shell: bash + run: | + cmake -B build -S . -DCMAKE_BUILD_TYPE=${{ inputs.MODE }} -DSIMENG_ENABLE_TESTS=ON -DSIMENG_OPTIMIZE=ON -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} \ + -DCMAKE_CXX_COMPILER=${{ env.CPP_COMPILER }} + + cmake --build build -j $(nproc) + + cmake --build build --target install diff --git a/.github/actions/setup_gcc_ubuntu/action.yml b/.github/actions/setup_gcc_ubuntu/action.yml new file mode 100644 index 0000000000..533a367a69 --- /dev/null +++ b/.github/actions/setup_gcc_ubuntu/action.yml @@ -0,0 +1,94 @@ +name: setup gcc +description: installs dependencies and correct gcc version to build and test simeng + +inputs: + OS: + description: docker image name + required: true + gcc-version: + description: gcc version + required: true + MODE: + description: Release or Debug mode + required: true + +runs: + using: 'composite' + steps: + ####################################### + # Install dependencies + ####################################### + - name: Install dependencies + shell: bash + run: | + export DEBIAN_FRONTEND=noninteractive + + # Update package lists + apt-get update + + # Install essential packages + apt-get install -y \ + software-properties-common \ + sudo \ + wget \ + zlib1g-dev \ + python3 \ + build-essential \ + libssl-dev \ + ninja-build \ + tree \ + git + + # add pyparsing for benchmarking + apt-get install -y python3-pip + pip3 install pyparsing + + # Add additional repositories + add-apt-repository universe + add-apt-repository ppa:ubuntu-toolchain-r/ppa + + # Update package lists again after adding repositories + apt-get update + + # Upgrade all installed packages + apt-get upgrade -y + + ####################################### + # Install Cmake + ####################################### + - name: Install cmake via apt + shell: bash + run: | + wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee 
/etc/apt/trusted.gpg.d/kitware.gpg >/dev/null && \ + apt-add-repository "deb https://apt.kitware.com/ubuntu/ focal main universe" && \ + apt update && apt install cmake -y + apt upgrade -y + + ####################################### + # Install GCC + ####################################### + - name: Install gcc + shell: bash + run: | + apt-get -y install ${{ inputs.gcc-version }} + apt-get -y install g++-$( echo ${{ inputs.gcc-version }} | cut -d '-' -f 2) + apt update && apt upgrade -y + + ####################################### + # Build SimEng + # Then store the C compiler and CPP compiler as environmental variables to + # be used by LINUX_BUILD_TEST + ####################################### + - name: Build SimEng + shell: bash + run: | + cmake -B build -S . -DCMAKE_BUILD_TYPE=${{ inputs.MODE }} -DSIMENG_ENABLE_TESTS=ON -DSIMENG_OPTIMIZE=ON -DCMAKE_C_COMPILER=/usr/bin/${{ inputs.gcc-version }} -DCMAKE_CXX_COMPILER=/usr/bin/g++-$( echo ${{ inputs.gcc-version }} | cut -d '-' -f 2) + + cmake --build build -j $(nproc) + + cmake --build build --target install + + echo "C_COMPILER=/usr/bin/${{ inputs.gcc-version }}" >> $GITHUB_ENV + echo "CPP_COMPILER=/usr/bin/g++-$( echo ${{ inputs.gcc-version }} | cut -d '-' -f 2)" >> $GITHUB_ENV + + \ No newline at end of file diff --git a/.github/actions/simeng_benchmarks/action.yml b/.github/actions/simeng_benchmarks/action.yml new file mode 100644 index 0000000000..d7814ca6f8 --- /dev/null +++ b/.github/actions/simeng_benchmarks/action.yml @@ -0,0 +1,43 @@ +name: simeng-benchmarks +description: runs simeng benchmarks + +inputs: + RUN_DIR: + description: directory from which the benchmark binary should be run + required: false + ARGS: + description: Any additional arguments needed to run this benchmark on SimEng + required: false + default: "" + BIN_PATH: + description: path to the binary for the benchmark + required: true + PASS_STRING: + description: string that is searched for in the benchmark's output to confirm whether or not it has passed + required: true + +runs: + using: 'composite' + steps: + - name: Run Benchmark + shell: bash + run: | + if [ ${{ inputs.RUN_DIR }} ] + then + cd ${{ inputs.RUN_DIR }} + fi + + simeng "$GITHUB_WORKSPACE/configs/a64fx.yaml" "${{ inputs.BIN_PATH }}" ${{ inputs.ARGS }} > $GITHUB_WORKSPACE/simeng.tmp + + if grep -q ${{ inputs.PASS_STRING }} "$GITHUB_WORKSPACE/simeng.tmp" + then + cat $GITHUB_WORKSPACE/simeng.tmp + echo "Passed" + else + cat $GITHUB_WORKSPACE/simeng.tmp + echo "Failed" + exit 1 + fi + + + diff --git a/.github/workflows/LINUX_BUILD_TEST.yml b/.github/workflows/LINUX_BUILD_TEST.yml new file mode 100644 index 0000000000..d3dcb86516 --- /dev/null +++ b/.github/workflows/LINUX_BUILD_TEST.yml @@ -0,0 +1,713 @@ +name: Linux + +on: + workflow_call: + inputs: + SIMENG-MODE: + required: true + type: string + RUNNER: + default: ubuntu-latest + required: true + type: string + +env: + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + LLVM-VERSION: 18 + BENCHMARK_BRANCH: 'make-file-build-system' # The branch inside the benchmark repo that has the script to run all benchmarks. 
+ PAT: ${{ secrets.SIMENGUOB_PAT }} + + +jobs: + Build_and_Run: + runs-on: ${{ inputs.RUNNER }} + + strategy: + fail-fast: false + matrix: + + COMPILER: ['gcc-7', 'gcc-8', 'gcc-9', 'gcc-10'] # todo 'armclang'] # compiler names + OS: ['ubuntu:20.04', 'rockylinux:8', 'redhat/ubi8:latest', 'redhat/ubi9:latest', 'debian:10', 'debian:11'] # Docker images + + ####################################### + # Removes unecessary jobs as jobs are generated in the order seen in the matrix. + # "Exclude" is to keep job ordering nice i.e. keeping ubuntu jobs next to each other in the list. + ####################################### + exclude: + + # need redhat 8 for gcc 7 and 8 | redhat 9 for gcc 9 and 10 + - OS: 'redhat/ubi8:latest' + COMPILER: gcc-9 + - OS: 'redhat/ubi8:latest' + COMPILER: gcc-10 + + - OS: 'redhat/ubi9:latest' + COMPILER: gcc-7 + - OS: 'redhat/ubi9:latest' + COMPILER: gcc-8 + + # need debian-10 (buster) for gcc 7 | + - OS: 'debian:10' + COMPILER: 'gcc-8' + - OS: 'debian:10' + COMPILER: 'gcc-9' + - OS: 'debian:10' + COMPILER: 'gcc-10' + + - OS: 'debian:11' + COMPILER: 'gcc-7' + + ####################################### + # Choose container and set name of workflow + ####################################### + + container: + image: ${{ matrix.OS }} + + name: "${{ matrix.OS }}, ${{ matrix.compiler }}" + + steps: + ####################################### + # Clones repo to workspace + ####################################### + - name: checkout v4 + uses: actions/checkout@v4 + + ####################################### + # Depending on OS and compiler, this step chooses the correct setup action to run. + # The action is located in .github/actions/select_setup + ####################################### + - name: setup compiler and OS env + build simeng + uses: ./.github/actions/select_setup + with: + OS: ${{ matrix.OS }} + COMPILER: ${{ matrix.COMPILER }} + MODE: ${{ inputs.SIMENG-MODE }} + + ####################################### + # Prints out info in isolated step for easy access. + ####################################### + - name: INFO + shell: bash + run: | + cat /etc/os-release + echo "_______________________________________" + uname -a + echo "_______________________________________" + cmake --version + echo "_______________________________________" + "${{ env.C_COMPILER }}" --version + which gcc + echo "_______________________________________" + "${{ env.CPP_COMPILER }}" --version + which g++ + echo "_______________________________________" + + ####################################### + # Run Integration Tests. + ####################################### + - name: Integration Tests + shell: bash + run: | + ./build/test/integration/integrationtests + + ####################################### + # Run Unit Tests. + ####################################### + - name: Unit Tests + shell: bash + run: | + ./build/test/unit/unittests + + ####################################### + # Run Regression AARCH64 Tests. + ####################################### + - name: Regression Test (aarch64) + if: always() + shell: bash + run: | + ./build/test/regression/aarch64/regression-aarch64 + + ####################################### + # Run Regression RISCV Tests. 
+ ####################################### + - name: Regression Test (riscv) + if: always() + shell: bash + run: | + ./build/test/regression/riscv/regression-riscv + + ####################################### + # Run Benchmarks + # Separate steps for each benchmark to ensure itemised outputs + # in the actions interface + ####################################### + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: Checking out benchmark repository (v4) + uses: actions/checkout@v4 + with: + repository: UoB-HPC/simeng-benchmarks + ref: makefile-build-system + token: ${{ env.PAT }} + path: simeng-benchmarks + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf serial gcc8.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/serial/cloverleaf_gcc8.3.0_armv8.4 + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf serial gcc9.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/serial/cloverleaf_gcc9.3.0_armv8.4 + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf serial gcc10.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/serial/cloverleaf_gcc10.3.0_armv8.4 + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf serial armclang20 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/serial/cloverleaf_armclang20_armv8.4 + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf openmp gcc8.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/openmp/cloverleaf_gcc8.3.0_armv8.4 + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf openmp gcc9.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/openmp/cloverleaf_gcc9.3.0_armv8.4 + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf openmp gcc10.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/openmp/cloverleaf_gcc10.3.0_armv8.4 + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf openmp armclang20 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/openmp/cloverleaf_armclang20_armv8.4 + PASS_STRING: "This test is considered PASSED" + + - if: ${{ 
inputs.SIMENG-MODE == 'Release' }} + name: miniBUDE openmp gcc8.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + ARGS: "-n 64 -i 1 --deck $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/miniBUDE/bm1" + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/miniBUDE/openmp/minibude_gcc8.3.0_armv8.4 + PASS_STRING: "Largest difference was 0.000%." + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: miniBUDE openmp gcc9.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + ARGS: "-n 64 -i 1 --deck $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/miniBUDE/bm1" + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/miniBUDE/openmp/minibude_gcc9.3.0_armv8.4 + PASS_STRING: "Largest difference was 0.000%." + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: miniBUDE openmp gcc10.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + ARGS: "-n 64 -i 1 --deck $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/miniBUDE/bm1" + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/miniBUDE/openmp/minibude_gcc10.3.0_armv8.4 + PASS_STRING: "Largest difference was 0.000%." + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: miniBUDE openmp armclang20 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + ARGS: "-n 64 -i 1 --deck $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/miniBUDE/bm1" + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/miniBUDE/openmp/minibude_armclang20_armv8.4 + PASS_STRING: "Largest difference was 0.000%." + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM serial gcc8.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream_gcc8.3.0_armv8.4 + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM serial gcc9.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream_gcc9.3.0_armv8.4 + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM serial gcc10.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream_gcc10.3.0_armv8.4 + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM serial armclang20 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream_armclang20_armv8.4 + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM openmp gcc8.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream-omp_gcc8.3.0_armv8.4 + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM openmp gcc9.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream-omp_gcc9.3.0_armv8.4 + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM openmp gcc10.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream-omp_gcc10.3.0_armv8.4 + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM openmp armclang20 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: 
$GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream-omp_armclang20_armv8.4 + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D serial gcc8.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf_gcc8.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D serial gcc9.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf_gcc9.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D serial gcc10.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf_gcc10.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D serial armclang20 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf_armclang20_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D openmp gcc8.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf-omp_gcc8.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D openmp gcc9.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf-omp_gcc9.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D openmp gcc10.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf-omp_gcc10.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D openmp armclang20 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf-omp_armclang20_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D serial gcc8.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf_gcc8.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D serial gcc9.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf_gcc9.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ 
inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D serial gcc10.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf_gcc10.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D serial armclang20 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf_armclang20_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D openmp gcc8.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf-omp_gcc8.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D openmp gcc9.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf-omp_gcc9.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D openmp gcc10.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf-omp_gcc10.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D openmp armclang20 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf-omp_armclang20_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf serial gcc8.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/serial/cloverleaf_gcc8.3.0_armv8.4+sve + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf serial gcc9.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/serial/cloverleaf_gcc9.3.0_armv8.4+sve + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf serial gcc10.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/serial/cloverleaf_gcc10.3.0_armv8.4+sve + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf serial armclang20 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/serial/cloverleaf_armclang20_armv8.4+sve + PASS_STRING: "This test is considered PASSED" + + - if: 
${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf openmp gcc8.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/openmp/cloverleaf_gcc8.3.0_armv8.4+sve + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf openmp gcc9.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/openmp/cloverleaf_gcc9.3.0_armv8.4+sve + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf openmp gcc10.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/openmp/cloverleaf_gcc10.3.0_armv8.4+sve + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf openmp armclang20 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/openmp/cloverleaf_armclang20_armv8.4+sve + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: miniBUDE openmp gcc8.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + ARGS: "-n 64 -i 1 --deck $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/miniBUDE/bm1" + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/miniBUDE/openmp/minibude_gcc8.3.0_armv8.4+sve + PASS_STRING: "Largest difference was 0.000%." + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: miniBUDE openmp gcc9.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + ARGS: "-n 64 -i 1 --deck $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/miniBUDE/bm1" + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/miniBUDE/openmp/minibude_gcc9.3.0_armv8.4+sve + PASS_STRING: "Largest difference was 0.000%." + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: miniBUDE openmp gcc10.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + ARGS: "-n 64 -i 1 --deck $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/miniBUDE/bm1" + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/miniBUDE/openmp/minibude_gcc10.3.0_armv8.4+sve + PASS_STRING: "Largest difference was 0.000%." + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: miniBUDE openmp armclang20 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + ARGS: "-n 64 -i 1 --deck $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/miniBUDE/bm1" + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/miniBUDE/openmp/minibude_armclang20_armv8.4+sve + PASS_STRING: "Largest difference was 0.000%." 
+ + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM serial gcc8.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream_gcc8.3.0_armv8.4+sve + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM serial gcc9.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream_gcc9.3.0_armv8.4+sve + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM serial gcc10.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream_gcc10.3.0_armv8.4+sve + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM serial armclang20 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream_armclang20_armv8.4+sve + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM openmp gcc8.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream-omp_gcc8.3.0_armv8.4+sve + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM openmp gcc9.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream-omp_gcc9.3.0_armv8.4+sve + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM openmp gcc10.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream-omp_gcc10.3.0_armv8.4+sve + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM openmp armclang20 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream-omp_armclang20_armv8.4+sve + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D serial gcc8.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf_gcc8.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D serial gcc9.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf_gcc9.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D serial gcc10.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf_gcc10.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D serial armclang20 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: 
$GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf_armclang20_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D openmp gcc8.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf-omp_gcc8.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D openmp gcc9.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf-omp_gcc9.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D openmp gcc10.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf-omp_gcc10.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D openmp armclang20 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf-omp_armclang20_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D serial gcc8.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf_gcc8.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D serial gcc9.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf_gcc9.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D serial gcc10.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf_gcc10.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D serial armclang20 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf_armclang20_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D openmp gcc8.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf-omp_gcc8.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D openmp gcc9.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: 
$GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf-omp_gcc9.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D openmp gcc10.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf-omp_gcc10.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D openmp armclang20 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf-omp_armclang20_armv8.4+sve + PASS_STRING: "This run PASSED" \ No newline at end of file diff --git a/.github/workflows/MACOS_BUILD_TEST.yml b/.github/workflows/MACOS_BUILD_TEST.yml new file mode 100644 index 0000000000..18a2e86c96 --- /dev/null +++ b/.github/workflows/MACOS_BUILD_TEST.yml @@ -0,0 +1,676 @@ +name: MacOS + +on: + workflow_call: + inputs: + SIMENG-MODE: + required: true + type: string + +env: + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + BENCHMARK_BRANCH: 'make-file-build-system' # The branch inside the benchmark repo that has the script to run all benchmarks. + PAT: ${{ secrets.SIMENGUOB_PAT }} + +jobs: + Build_and_Run: + runs-on: macos-13 + + strategy: + fail-fast: false + matrix: + COMPILER: ['gcc-10', 'apple_clang_15'] # NOTE: only gcc 10 works with provided macos runners on github actions the other versions are difficult to get and don't work + + + name: "macos-13, ${{ matrix.compiler }}" + + steps: + ####################################### + # Clones repo to workspace. + ####################################### + - name: checkout v4 + uses: actions/checkout@v4 + + ####################################### + # Depending on OS and compiler, this step chooses the correct setup action to run. + ####################################### + - name: setup compiler and OS env + build simeng + uses: ./.github/actions/select_setup + with: + OS: macos + COMPILER: ${{ matrix.COMPILER }} + MODE: ${{ inputs.simeng-mode }} + + ####################################### + # Prints out info in isolated step for easy access. + ####################################### + - name: INFO + shell: bash + run: | + echo "_______________________________________" + cmake --version + echo "_______________________________________" + "${{ env.C_COMPILER }}" --version + which ${{ env.C_COMPILER }} + echo "_______________________________________" + "${{ env.CPP_COMPILER }}" --version + which ${{ env.CPP_COMPILER }} + echo "_______________________________________" + + ####################################### + # Run Integration Tests. + ####################################### + - name: Integration Tests + shell: bash + run: | + ./build/test/integration/integrationtests + + ####################################### + # Run Unit Tests. + ####################################### + - name: Unit Tests + shell: bash + run: | + ./build/test/unit/unittests + + ####################################### + # Run Regression AARCH64 Tests. + ####################################### + - name: regression test (aarch64) + if: always() + shell: bash + run: | + ./build/test/regression/aarch64/regression-aarch64 + + ####################################### + # Run Regression RISCV Tests. 
+ ####################################### + - name: regression test (riscv) + if: always() + shell: bash + run: | + ./build/test/regression/riscv/regression-riscv + + ####################################### + # Run Benchmarks + # Separate steps for each benchmark to ensure itemised outputs + # in the actions interface + ####################################### + + - if: ${{ contains(fromJson('["ubuntu:18.04"]'), matrix.OS) && inputs.SIMENG-MODE == 'Release' }} + name: Checking out benchmark repository (v3) + uses: actions/checkout@v3 + with: + repository: UoB-HPC/simeng-benchmarks + token: ${{ env.PAT }} + ref: makefile-build-system + path: simeng-benchmarks + + - if: ${{ !contains(fromJson('["ubuntu:18.04"]'), matrix.OS) && inputs.SIMENG-MODE == 'Release' }} + name: Checking out benchmark repository (v4) + uses: actions/checkout@v4 + with: + repository: UoB-HPC/simeng-benchmarks + ref: makefile-build-system + token: ${{ env.PAT }} + path: simeng-benchmarks + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf serial gcc8.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/serial/cloverleaf_gcc8.3.0_armv8.4 + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf serial gcc9.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/serial/cloverleaf_gcc9.3.0_armv8.4 + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf serial gcc10.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/serial/cloverleaf_gcc10.3.0_armv8.4 + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf serial armclang20 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/serial/cloverleaf_armclang20_armv8.4 + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf openmp gcc8.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/openmp/cloverleaf_gcc8.3.0_armv8.4 + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf openmp gcc9.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/openmp/cloverleaf_gcc9.3.0_armv8.4 + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf openmp gcc10.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/openmp/cloverleaf_gcc10.3.0_armv8.4 + PASS_STRING: "This test is considered PASSED" + + - if: ${{ 
inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf openmp armclang20 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/openmp/cloverleaf_armclang20_armv8.4 + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: miniBUDE openmp gcc8.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + ARGS: "-n 64 -i 1 --deck $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/miniBUDE/bm1" + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/miniBUDE/openmp/minibude_gcc8.3.0_armv8.4 + PASS_STRING: "Largest difference was 0.000%." + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: miniBUDE openmp gcc9.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + ARGS: "-n 64 -i 1 --deck $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/miniBUDE/bm1" + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/miniBUDE/openmp/minibude_gcc9.3.0_armv8.4 + PASS_STRING: "Largest difference was 0.000%." + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: miniBUDE openmp gcc10.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + ARGS: "-n 64 -i 1 --deck $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/miniBUDE/bm1" + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/miniBUDE/openmp/minibude_gcc10.3.0_armv8.4 + PASS_STRING: "Largest difference was 0.000%." + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: miniBUDE openmp armclang20 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + ARGS: "-n 64 -i 1 --deck $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/miniBUDE/bm1" + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/miniBUDE/openmp/minibude_armclang20_armv8.4 + PASS_STRING: "Largest difference was 0.000%." 
+ + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM serial gcc8.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream_gcc8.3.0_armv8.4 + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM serial gcc9.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream_gcc9.3.0_armv8.4 + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM serial gcc10.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream_gcc10.3.0_armv8.4 + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM serial armclang20 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream_armclang20_armv8.4 + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM openmp gcc8.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream-omp_gcc8.3.0_armv8.4 + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM openmp gcc9.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream-omp_gcc9.3.0_armv8.4 + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM openmp gcc10.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream-omp_gcc10.3.0_armv8.4 + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM openmp armclang20 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream-omp_armclang20_armv8.4 + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D serial gcc8.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf_gcc8.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D serial gcc9.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf_gcc9.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D serial gcc10.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf_gcc10.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D serial armclang20 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf_armclang20_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + 
name: TeaLeaf 2D openmp gcc8.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf-omp_gcc8.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D openmp gcc9.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf-omp_gcc9.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D openmp gcc10.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf-omp_gcc10.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D openmp armclang20 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf-omp_armclang20_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D serial gcc8.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf_gcc8.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D serial gcc9.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf_gcc9.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D serial gcc10.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf_gcc10.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D serial armclang20 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf_armclang20_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D openmp gcc8.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf-omp_gcc8.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D openmp gcc9.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf-omp_gcc9.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D openmp gcc10.3.0 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: 
$GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf-omp_gcc10.3.0_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D openmp armclang20 armv8.4 + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf-omp_armclang20_armv8.4 + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf serial gcc8.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/serial/cloverleaf_gcc8.3.0_armv8.4+sve + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf serial gcc9.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/serial/cloverleaf_gcc9.3.0_armv8.4+sve + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf serial gcc10.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/serial/cloverleaf_gcc10.3.0_armv8.4+sve + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf serial armclang20 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/serial/cloverleaf_armclang20_armv8.4+sve + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf openmp gcc8.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/openmp/cloverleaf_gcc8.3.0_armv8.4+sve + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf openmp gcc9.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/openmp/cloverleaf_gcc9.3.0_armv8.4+sve + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf openmp gcc10.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/openmp/cloverleaf_gcc10.3.0_armv8.4+sve + PASS_STRING: "This test is considered PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: CloverLeaf openmp armclang20 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/CloverLeaf + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/CloverLeaf/openmp/cloverleaf_armclang20_armv8.4+sve + PASS_STRING: "This test is considered PASSED" + + - if: ${{ 
inputs.SIMENG-MODE == 'Release' }} + name: miniBUDE openmp gcc8.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + ARGS: "-n 64 -i 1 --deck $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/miniBUDE/bm1" + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/miniBUDE/openmp/minibude_gcc8.3.0_armv8.4+sve + PASS_STRING: "Largest difference was 0.000%." + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: miniBUDE openmp gcc9.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + ARGS: "-n 64 -i 1 --deck $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/miniBUDE/bm1" + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/miniBUDE/openmp/minibude_gcc9.3.0_armv8.4+sve + PASS_STRING: "Largest difference was 0.000%." + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: miniBUDE openmp gcc10.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + ARGS: "-n 64 -i 1 --deck $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/miniBUDE/bm1" + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/miniBUDE/openmp/minibude_gcc10.3.0_armv8.4+sve + PASS_STRING: "Largest difference was 0.000%." + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: miniBUDE openmp armclang20 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + ARGS: "-n 64 -i 1 --deck $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/miniBUDE/bm1" + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/miniBUDE/openmp/minibude_armclang20_armv8.4+sve + PASS_STRING: "Largest difference was 0.000%." + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM serial gcc8.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream_gcc8.3.0_armv8.4+sve + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM serial gcc9.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream_gcc9.3.0_armv8.4+sve + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM serial gcc10.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream_gcc10.3.0_armv8.4+sve + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM serial armclang20 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream_armclang20_armv8.4+sve + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM openmp gcc8.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream-omp_gcc8.3.0_armv8.4+sve + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM openmp gcc9.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream-omp_gcc9.3.0_armv8.4+sve + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM openmp gcc10.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream-omp_gcc10.3.0_armv8.4+sve + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: STREAM openmp armclang20 armv8.4+sve + 
uses: ./.github/actions/simeng_benchmarks + with: + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/STREAM/stream-omp_armclang20_armv8.4+sve + PASS_STRING: "Solution Validates" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D serial gcc8.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf_gcc8.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D serial gcc9.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf_gcc9.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D serial gcc10.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf_gcc10.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D serial armclang20 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf_armclang20_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D openmp gcc8.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf-omp_gcc8.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D openmp gcc9.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf-omp_gcc9.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D openmp gcc10.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf-omp_gcc10.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 2D openmp armclang20 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/2d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/2d/tealeaf-omp_armclang20_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D serial gcc8.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf_gcc8.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D serial gcc9.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + 
BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf_gcc9.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D serial gcc10.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf_gcc10.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D serial armclang20 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf_armclang20_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D openmp gcc8.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf-omp_gcc8.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D openmp gcc9.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf-omp_gcc9.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D openmp gcc10.3.0 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf-omp_gcc10.3.0_armv8.4+sve + PASS_STRING: "This run PASSED" + + - if: ${{ inputs.SIMENG-MODE == 'Release' }} + name: TeaLeaf 3D openmp armclang20 armv8.4+sve + uses: ./.github/actions/simeng_benchmarks + with: + RUN_DIR: $GITHUB_WORKSPACE/simeng-benchmarks/Data_Files/TeaLeaf/3d + BIN_PATH: $GITHUB_WORKSPACE/simeng-benchmarks/binaries/TeaLeaf/3d/tealeaf-omp_armclang20_armv8.4+sve + PASS_STRING: "This run PASSED" \ No newline at end of file diff --git a/.github/workflows/MAIN.yml b/.github/workflows/MAIN.yml new file mode 100644 index 0000000000..8099ed46ab --- /dev/null +++ b/.github/workflows/MAIN.yml @@ -0,0 +1,50 @@ +name: "Build" +# description: Build and test simeng on various OS with various compilers followed by benchmarking and a performance regression test followed by a clang format if all previous workflow succeed + +on: + workflow_dispatch: + + pull_request: + branches: + - dev + - main + +jobs: + + ################################### + # Debug Mode + ################################### + DEBUG_LINUX: + name: "Debug - build and test" + uses: ./.github/workflows/LINUX_BUILD_TEST.yml + with: + RUNNER: ubuntu-latest + SIMENG-MODE: Debug + secrets: inherit + + DEBUG_MACOS: + name: "Debug - build and test" + uses: ./.github/workflows/MACOS_BUILD_TEST.yml + with: + SIMENG-MODE: Debug + secrets: inherit + + ################################## + # Release Mode + ################################## + RELEASE_LINUX: + name: "Release - build, test and benchmarks" + uses: ./.github/workflows/LINUX_BUILD_TEST.yml + with: + RUNNER: ubuntu-latest + SIMENG-MODE: Release + secrets: inherit + + RELEASE_MACOS: + name: "Release - build, test and benchmarks" + uses: ./.github/workflows/MACOS_BUILD_TEST.yml + with: 
+ SIMENG-MODE: Release + secrets: inherit + + diff --git a/.gitignore b/.gitignore index c2978fa69b..3ed7dc90d6 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ obj build xcode-build +cmake-build-debug +cmake-build-release docs/sphinx/_build binaries install @@ -12,6 +14,7 @@ install-debug capstone.pc version.hh **/specialFiles/ +.cache CMakeCache.txt CPackConfig.cmake @@ -23,3 +26,4 @@ CMakeFiles/ probe.* trace.* +**/simeng-fileio-test.txt diff --git a/.jenkins/build_arm22.sh b/.jenkins/build_arm22.sh deleted file mode 100644 index 99ccee0130..0000000000 --- a/.jenkins/build_arm22.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -source .jenkins/build_test_run.sh - -## Download/clean and checkout pull request -checkout - -## Load compilers/libraries -echo "Compiler Armclang 22.0.2" -module use /software/arm64/modulefiles -module load tools/arm-compiler-sles -module load tools/cmake - -## Build, test, and run SimEng -build armclang armclang++ -test -run diff --git a/.jenkins/build_gcc10.sh b/.jenkins/build_gcc10.sh deleted file mode 100644 index 20ec670795..0000000000 --- a/.jenkins/build_gcc10.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -source .jenkins/build_test_run.sh - -## Download/clean and checkout pull request -checkout - -## Load compilers/libraries -echo "Compiler GCC 10" -module swap PrgEnv-cray PrgEnv-gnu -module swap gcc gcc/10.3.0 -module load tools/cmake - -## Build, test, and run SimEng -build gcc g++ -test -run diff --git a/.jenkins/build_gcc7.sh b/.jenkins/build_gcc7.sh deleted file mode 100644 index 512c928706..0000000000 --- a/.jenkins/build_gcc7.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -source .jenkins/build_test_run.sh - -## Download/clean and checkout pull request -checkout - -## Load compilers/libraries -echo "Compiler GCC 7" -module swap PrgEnv-cray PrgEnv-gnu -module swap gcc gcc/7.3.0 -module load tools/cmake - -## Build, test, and run SimEng -build gcc g++ -test -run diff --git a/.jenkins/build_gcc8.sh b/.jenkins/build_gcc8.sh deleted file mode 100644 index 729b182895..0000000000 --- a/.jenkins/build_gcc8.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -source .jenkins/build_test_run.sh - -## Download/clean and checkout pull request -checkout - -## Load compilers/libraries -echo "Compiler GCC 8" -module swap PrgEnv-cray PrgEnv-gnu -module swap gcc gcc/8.3.0 -module load tools/cmake - -## Build, test, and run SimEng -build gcc g++ -test -run diff --git a/.jenkins/build_gcc9.sh b/.jenkins/build_gcc9.sh deleted file mode 100644 index 46422cc872..0000000000 --- a/.jenkins/build_gcc9.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -source .jenkins/build_test_run.sh - -## Download/clean and checkout pull request -checkout - -## Load compilers/libraries -echo "Compiler GCC 9" -module swap PrgEnv-cray PrgEnv-gnu -module swap gcc gcc/9.3.0 -module load tools/cmake - -## Build, test, and run SimEng -build gcc g++ -test -run diff --git a/.jenkins/build_intel19.sh b/.jenkins/build_intel19.sh deleted file mode 100644 index 0b14c734f2..0000000000 --- a/.jenkins/build_intel19.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -source .jenkins/build_test_run.sh - -## Download/clean and checkout pull request -checkout - -## Setup environment -PATH=/cm/shared/apps/intel_parallel_studio_xe_2019_update4/compilers_and_libraries_2019.4.243/linux/bin/intel64/:$PATH -export CMAKE_C_COMPILER=icc -export CC=icc -export CMAKE_CXX_COMPILER=icpc -export CXX=icpc - -## Load compilers/libraries -echo "Compiler INTEL 19" -#module load 
intel-parallel-studio-xe/compilers/64/2019u4/19.0.4 intel-parallel-studio-xe/mpi/64/2019u4/4.243 -export PATH=/home/br-hwaugh/installations/cmake-3.18.5/bin/:$PATH - -## Build, test, and run SimEng -build -test -run diff --git a/.jenkins/build_test_run.sh b/.jenkins/build_test_run.sh deleted file mode 100644 index 05d78824f0..0000000000 --- a/.jenkins/build_test_run.sh +++ /dev/null @@ -1,77 +0,0 @@ -#!/bin/bash -# This script is not intended be to run direct but rather to be sourced from other scripts - -## Set up file structure -export SIMENG_TOP="$PWD" -export SIMENG_BUILD="$PWD"/build -export SIMENG_INSTALL="$PWD"/install - -debug () { - echo "MODULES" - module li - echo "CURRENT DIRECTORY" - echo "$PWD" - echo "GIT BRANCH" - git branch - - echo "SIMENG TOP $SIMENG_TOP" - echo "SIMENG BUILD $SIMENG_BUILD" - echo "SIMENG INSTALL $SIMENG_INSTALL" -} - -# If source available clean and checkout, otherwise download -checkout () { - cd "$SIMENG_TOP" || exit - rm -rf build install - mkdir build install -} - -# Build common function -build () { - cd "$SIMENG_TOP" || exit - rm -rf build/* install/* - - cmake -B build -S . -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX="$SIMENG_INSTALL" -DSIMENG_ENABLE_TESTS=ON -DSIMENG_USE_EXTERNAL_LLVM=ON -DLLVM_DIR=/home/br-simeng/llvm14.0.5/install-gcc7/lib/cmake/llvm/ -DCMAKE_C_COMPILER=$1 -DCMAKE_CXX_COMPILER=$2 - cmake --build build -j - cmake --build build --target install -} - -# Run tests -test () { - cd "$SIMENG_BUILD" || exit - ./test/unit/unittests --gtest_output=xml:unittests.xml || true - ./test/regression/aarch64/regression-aarch64 --gtest_output=xml:regressiontests.xml || true - ./test/regression/riscv/regression-riscv --gtest_output=xml:regressiontests.xml || true -} - -# Run default program with and without specified configuration -run () { - cd "$SIMENG_INSTALL" || exit - - ./bin/simeng > run - echo "Simulation without configuration file argument:" - cat run - echo "" - compare_outputs "$(grep "retired:" run | rev | cut -d ' ' -f1 | rev)" "3145731" "retired instructions" - compare_outputs "$(grep "cycles:" run | rev | cut -d ' ' -f1 | rev)" "3145736" "simulated cycles" - echo "" - - ./bin/simeng "$SIMENG_TOP"/configs/tx2.yaml > run - echo "Simulation with configuration file argument:" - cat run - echo "" - compare_outputs "$(grep "retired:" run | rev | cut -d ' ' -f1 | rev)" "3145732" "retired instructions" - compare_outputs "$(grep "cycles:" run | rev | cut -d ' ' -f1 | rev)" "1048588" "simulated cycles" - echo "" -} - -# Helper function for checking outputs -compare_outputs() { - if [[ $1 != $2 ]] - then - echo "ERROR: ${STAGE_NAME} run failed due to an incorrect number of $3." - echo -e "\tExpect \"$2\"" - echo -e "\tGot \"$1\"" - exit 1 - fi -} diff --git a/.jenkins/format.sh b/.jenkins/format.sh deleted file mode 100644 index 46e979bc43..0000000000 --- a/.jenkins/format.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -module swap PrgEnv-cray PrgEnv-gnu -module use /lustre/projects/bristol/modules-a64fx/modulefiles/ -module load llvm/11.0 - -git-clang-format --diff origin/main --extensions cc,hh > FORMATTING - -# Check whether any source files were modified -if grep 'no modified files to format' FORMATTING -then - exit 0 -fi - -# Check whether any formatting changes are necessary -if grep 'clang-format did not modify any files' FORMATTING -then - exit 0 -fi - -echo "" -echo "Code formatting issues detected (see below)." 
-echo "" -cat FORMATTING -exit 1 \ No newline at end of file diff --git a/.jenkins/run.sh b/.jenkins/run.sh deleted file mode 100644 index bb70cbe31b..0000000000 --- a/.jenkins/run.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -pwd - -source .jenkins/build_test_run.sh - -run diff --git a/CMakeLists.txt b/CMakeLists.txt index ccbc9074a0..b8f4379b98 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,7 @@ cmake_minimum_required(VERSION 3.13) include(FetchContent) +include(CheckCXXCompilerFlag) +include(ProcessorCount) set(FETCHCONTENT_QUIET OFF) # FetchContent_MakeAvailable was introduced in 3.14 @@ -13,21 +15,15 @@ macro(FetchContent_MakeAvailable_Args NAME ARGS) endif() endmacro() -# Need an additional macro for LLVM as a sub-directory needs to be targeted for llvm-14.0.5 -macro(FetchContent_MakeAvailable_SubDir_Args NAME SUBDIR ARGS) - FetchContent_GetProperties(${NAME}) - if(NOT ${NAME}_POPULATED) - FetchContent_Populate(${NAME}) - add_subdirectory(${${NAME}_SOURCE_DIR}/${SUBDIR}/ ${${NAME}_BINARY_DIR} ${ARGS}) - endif() -endmacro() - -# we don't use git for LLVM here as it clones the entire LLVM repo which takes too long and we only need a small part of it +if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") + cmake_policy(SET CMP0135 NEW) +endif() +# we don't use git for LLVM here as it clones the entire LLVM repo which takes too long and we only need a relatively small part of it FetchContent_Declare( - llvm - URL https://github.com/llvm/llvm-project/releases/download/llvmorg-14.0.5/llvm-14.0.5.src.tar.xz - URL_HASH MD5=6bd202e403d950c78985048ce499a518 + llvm + URL https://github.com/llvm/llvm-project/releases/download/llvmorg-18.1.8/llvm-project-18.1.8.src.tar.xz + URL_HASH MD5=81cd0be5ae6f1ad8961746116d426a96 ) FetchContent_Declare( @@ -38,27 +34,17 @@ FetchContent_Declare( GIT_PROGRESS TRUE ) -FetchContent_Declare( - yaml-cpp - GIT_REPOSITORY https://github.com/jbeder/yaml-cpp.git - # the latest tagged version (0.6.3) is fairly old and doesn't contain the needed CMake stuff for this to work properly - # hence we pick the latest working commit - GIT_TAG a6bbe0e50ac4074f0b9b44188c28cf00caf1a723 - GIT_PROGRESS TRUE -) - +# WARNING: When updating capstone version, we MUST make a new, discrete branch in the UoB-HPC Capstone fork to ensure +# current / previous SimEng versions continue to operate correctly. 
FetchContent_Declare( capstone-lib GIT_REPOSITORY https://github.com/UoB-HPC/capstone.git - GIT_TAG next + GIT_TAG next-update # Branch for SimEng version 0.9.7 GIT_PROGRESS TRUE - - # Old Git tag pre-Armv9.2 - # GIT_TAG e7be7d99e718ef9741026b80fc6f5e100fdf4f94 # trunk ) cmake_policy(SET CMP0048 NEW) -project(SimEng VERSION 0.9.4 LANGUAGES C CXX) +project(SimEng VERSION 0.9.6 LANGUAGES C CXX) # If no build type was defined, default to Release if(NOT CMAKE_BUILD_TYPE) @@ -79,8 +65,8 @@ set(CMAKE_MACOSX_RPATH 1) # Enable PIC for libraries set(CMAKE_POSITION_INDEPENDENT_CODE ON) -# Enable additional compiler warnings for all targets -add_compile_options(-Wall) +# Create variable to enable additional compiler warnings for SimEng targets only +set(SIMENG_COMPILE_OPTIONS -Wall -pedantic -Werror) #-Wextra # Disable RTTI for all targets add_compile_options($<$:-fno-rtti>) @@ -97,37 +83,47 @@ set(CAPSTONE_BUILD_SHARED OFF CACHE BOOL "Disable Capstone shared library") set(CAPSTONE_BUILD_CSTOOL OFF CACHE BOOL "Disable cstool build") set(CAPSTONE_INSTALL OFF CACHE BOOL "Disable install of capstone") -set(CAPSTONE_ARM_SUPPORT OFF CACHE BOOL "Disable A32 support") +set(CAPSTONE_ARM_SUPPORT OFF CACHE BOOL "Disable Arm32 support") set(CAPSTONE_MIPS_SUPPORT OFF CACHE BOOL "Disable MIPS support") -set(CAPSTONE_X86_SUPPORT OFF CACHE BOOL "Disable x86 support") set(CAPSTONE_PPC_SUPPORT OFF CACHE BOOL "Disable PowerPC support") +set(CAPSTONE_X86_SUPPORT OFF CACHE BOOL "Disable x86 support") set(CAPSTONE_SPARC_SUPPORT OFF CACHE BOOL "Disable Sparc support") -set(CAPSTONE_SYSZ_SUPPORT OFF CACHE BOOL "Disable SystemZ support") +set(CAPSTONE_SYSTEMZ_SUPPORT OFF CACHE BOOL "Disable SystemZ support") set(CAPSTONE_XCORE_SUPPORT OFF CACHE BOOL "Disable XCore support") set(CAPSTONE_M68K_SUPPORT OFF CACHE BOOL "Disable M68K support") -set(CAPSTONE_TMS320C64X_SUPPORT OFF CACHE BOOL "Disable TMS320C64x") +set(CAPSTONE_TMS320C64X_SUPPORT OFF CACHE BOOL "Disable TMS320C64x support") set(CAPSTONE_M680X_SUPPORT OFF CACHE BOOL "Disable M680x support") set(CAPSTONE_EVM_SUPPORT OFF CACHE BOOL "Disable EVM support") -set(CAPSTONE_MOS65XX_SUPPORT OFF CACHE BOOL "Disable MSO65XX support") set(CAPSTONE_WASM_SUPPORT OFF CACHE BOOL "Disable WASM support") +set(CAPSTONE_MOS65XX_SUPPORT OFF CACHE BOOL "Disable MSO65XX support") set(CAPSTONE_BPF_SUPPORT OFF CACHE BOOL "Disable BPF support") +set(CAPSTONE_SH_SUPPORT OFF CACHE BOOL "Disable SH support") +set(CAPSTONE_TRICORE_SUPPORT OFF CACHE BOOL "Disable TriCore support") +set(CAPSTONE_ALPHA_SUPPORT OFF CACHE BOOL "Disable Alpha support") +set(CAPSTONE_HPPA_SUPPORT OFF CACHE BOOL "Disable HPPA support") +set(CAPSTONE_LOONGARCH_SUPPORT OFF CACHE BOOL "Disable LoongArch support") + FetchContent_MakeAvailable_Args(capstone-lib EXCLUDE_FROM_ALL) include_directories("${capstone_BINARY_DIR}/include" "${capstone_SOURCE_DIR}/include") -## Setup yaml-cpp ## -set(YAML_CPP_BUILD_TESTS OFF) -set(YAML_CPP_INSTALL OFF) - -FetchContent_MakeAvailable_Args(yaml-cpp EXCLUDE_FROM_ALL) +# Enable the single RYML header to be packaged into the SimEng shared library +set(RYML_SHARED ON) option(SIMENG_ENABLE_TESTS "Whether to enable testing for SimEng" OFF) option(SIMENG_USE_EXTERNAL_LLVM "Use an external LLVM rather than building it as a submodule" OFF) option(SIMENG_SANITIZE "Enable compiler sanitizers" OFF) -option(SIMENG_OPTIMIZE "Enable Extra Compiler Optimizatoins" OFF) +option(SIMENG_OPTIMIZE "Enable Extra Compiler Optimizations" OFF) option(SIMENG_ENABLE_SST "Compile SimEng SST Wrapper" OFF) 
option(SIMENG_ENABLE_SST_TESTS "Enable testing for SST" OFF) +# Set CXX flag for Apple Mac so that `binary_function` and `unary_function` types that are used in SST can be recognised. +# They were deprecated in C++11 and removed in C++17, and Apple Clang v15 no longer supports these types without the following flag +# TODO: Remove once SST integration has updated to SST version 13 or later - the use of unary and binary functions are removed in later versions. +if(APPLE) + add_compile_definitions(_LIBCPP_ENABLE_CXX17_REMOVED_UNARY_BINARY_FUNCTION) +endif() + if (SIMENG_OPTIMIZE) # Turn on link time optimization for all targets. set(CMAKE_INTERPROCEDURAL_OPTIMIZATION ON) @@ -153,50 +149,95 @@ if(SIMENG_ENABLE_TESTS) find_package(LLVM REQUIRED CONFIG NO_CMAKE_BUILDS_PATH) # Check LLVM version - if ((${LLVM_PACKAGE_VERSION} VERSION_LESS "8.0") OR (${LLVM_PACKAGE_VERSION} VERSION_GREATER_EQUAL "14.1")) - message(FATAL_ERROR "LLVM version must be >= 8.0 and <= 14.0") + if ((${LLVM_PACKAGE_VERSION} VERSION_LESS "8.0") OR (${LLVM_PACKAGE_VERSION} VERSION_GREATER_EQUAL "18.2")) + message(FATAL_ERROR "LLVM version must be >= 8.0 and < 18.2") endif() # Print message containing if the full test suite will run if (${LLVM_PACKAGE_VERSION} VERSION_LESS "14.0") message(STATUS "LLVM version does not support AArch64 extensions SME or SVE2. These test suites will be skipped.") endif() + if (${LLVM_PACKAGE_VERSION} VERSION_LESS "18.0") + message(STATUS "LLVM version does not support AArch64 extensions SME2. These test suites will be skipped.") + endif() else() - - set(LLVM_TARGETS_TO_BUILD "AArch64;RISCV" CACHE INTERNAL "") - - set(LLVM_BUILD_RUNTIME OFF) - - set(LLVM_BUILD_TOOLS OFF) - set(LLVM_INCLUDE_TOOLS OFF) - - set(LLVM_BUILD_EXAMPLES OFF) - set(LLVM_INCLUDE_EXAMPLES OFF) - - set(LLVM_BUILD_TESTS OFF) - set(LLVM_INCLUDE_TESTS OFF) - - set(LLVM_BUILD_BENCHMARKS OFF) - set(LLVM_INCLUDE_BENCHMARKS OFF) - - set(LLVM_BUILD_DOCS OFF) - set(LLVM_INCLUDE_DOCS OFF) - - set(LLVM_INCLUDE_DOCS OFF) - set(LLVM_ENABLE_BINDINGS OFF) - set(LLVM_INSTALL_UTILS OFF) - - # XXX all LLVM specific cmake variables must be set BEFORE FetchContent_MakeAvailable otherwise they have no effect - FetchContent_MakeAvailable_SubDir_Args(llvm llvm-14.0.5.src EXCLUDE_FROM_ALL) - # make sure we get the headers too - include_directories("${llvm_BINARY_DIR}/include" "${llvm_SOURCE_DIR}/include") + # If external LLVM not provided, download LLVM and build only what we need. 
Then point SimEng to use this sub-build + FetchContent_GetProperties(llvm) + if(NOT llvm_POPULATED) + FetchContent_Populate(llvm) + + set(COMMAND_ECHO_OPTION "") + # COMMAND_ECHO supported only in CMake >= 3.15 + if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.15") + set(COMMAND_ECHO_OPTION COMMAND_ECHO STDOUT) + endif() + + execute_process( + COMMAND ${CMAKE_COMMAND} + -S ${llvm_SOURCE_DIR}/llvm + -B ${llvm_BINARY_DIR} + -DCMAKE_WARN_DEPRECATED=OFF + -DCMAKE_INSTALL_MESSAGE=LAZY + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_INSTALL_PREFIX=${llvm_BINARY_DIR}/dist + -DCMAKE_SKIP_RPATH=OFF # keep the rpath prefix to avoid libLLVM.so + -DCMAKE_VERBOSE_MAKEFILE=${CMAKE_VERBOSE_MAKEFILE} + -G${CMAKE_GENERATOR} + + -DLLVM_INCLUDE_BENCHMARKS=OFF + -DLLVM_INCLUDE_TESTS=OFF + -DLLVM_INCLUDE_DOCS=OFF + -DLLVM_INCLUDE_EXAMPLES=OFF + -DLLVM_BUILD_TESTS=OFF + -DLLVM_BUILD_DOCS=OFF + -DLLVM_BUILD_RUNTIME=OFF + -DLLVM_BUILD_TOOLS=OFF + -DLLVM_BUILD_EXAMPLES=OFF + -DLLVM_ENABLE_BINDINGS=OFF + -DLLVM_ENABLE_WARNINGS=OFF + "-DLLVM_TARGETS_TO_BUILD=AArch64\\;RISCV" + + ${COMMAND_ECHO_OPTION} + RESULT_VARIABLE SUCCESS) + + # TODO: replace with COMMAND_ERROR_IS_FATAL in the future (>= 3.19) + if (NOT SUCCESS EQUAL "0") + message(FATAL_ERROR "LLVM configure did not succeed") + else () + message(STATUS "LLVM configuration complete, starting build...") + endif () + + ProcessorCount(NPROC) + execute_process( + COMMAND ${CMAKE_COMMAND} + --build ${llvm_BINARY_DIR} + --target + # The full list of targets can be discovered via `ninja -t targets` inside the build + install-LLVMObject + install-LLVMAArch64AsmParser + install-LLVMRISCVAsmParser + # We also include the headers and CMake exports for a *complete* build + install-llvm-headers + install-cmake-exports + -j ${NPROC} + + ${COMMAND_ECHO_OPTION} + RESULT_VARIABLE SUCCESS) + + # TODO: replace with COMMAND_ERROR_IS_FATAL in the future (>= 3.19) + if (NOT SUCCESS EQUAL "0") + message(FATAL_ERROR "LLVM build did not succeed") + endif () + endif() find_package(LLVM REQUIRED CONFIG NO_DEFAULT_PATH PATHS "${llvm_BINARY_DIR}/lib/cmake/llvm") # NOTE: we don't do the usual version checks here because it needs vars exported in find_LLVM - # we just assume it's good beacuse it must be whitelisted in FetchContent_Declare + # we just assume it's good because it must be whitelisted in FetchContent_Declare endif() set(SIMENG_LLVM_VERSION ${LLVM_VERSION_MAJOR} CACHE INTERNAL "LLVM major version number used.") @@ -219,7 +260,7 @@ endif() # saves us from having to build all targets before running the tests add_custom_target(test-all COMMAND ${CMAKE_CTEST_COMMAND} - DEPENDS unittests regression-aarch64 regression-riscv + DEPENDS unittests regression-aarch64 regression-riscv integrationtests ) endif() @@ -240,10 +281,10 @@ if (SIMENG_ENABLE_SST) endif() else() message(WARNING "SST build was selected but SST install directory was not specified. 
- Please specify -DSST_INSTALL_DIR= for the SST build to proceed.") + Please specify -DSST_INSTALL_DIR= for the SST build to proceed.") endif() endif() # Install SimEng model configs in the build directory set(SIMENG_CONFIG_INSTALL_DIR "${CMAKE_BINARY_DIR}/simeng-configs") -install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/configs/ DESTINATION ${SIMENG_CONFIG_INSTALL_DIR}) \ No newline at end of file +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/configs/ DESTINATION ${SIMENG_CONFIG_INSTALL_DIR}) diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt index a2485e96e1..dc05816c2e 100644 --- a/CONTRIBUTORS.txt +++ b/CONTRIBUTORS.txt @@ -12,6 +12,8 @@ Current development team: Finn Wilkinson Rahat Muneeb Daniel Weaver + Alex Cockrean + Joseph Moore Additional Contributors: Ainsley Rutterford diff --git a/LICENSE_RYML.txt b/LICENSE_RYML.txt new file mode 100644 index 0000000000..052ee79961 --- /dev/null +++ b/LICENSE_RYML.txt @@ -0,0 +1,19 @@ +Copyright (c) 2018, Joao Paulo Magalhaes + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/LICENSE_YAML-CPP.txt b/LICENSE_YAML-CPP.txt deleted file mode 100644 index 991fdbbe7d..0000000000 --- a/LICENSE_YAML-CPP.txt +++ /dev/null @@ -1,19 +0,0 @@ -Copyright (c) 2008-2015 Jesse Beder. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. diff --git a/RELEASE-NOTES.txt b/RELEASE-NOTES.txt index f55e29d1fc..fdecc5c30b 100644 --- a/RELEASE-NOTES.txt +++ b/RELEASE-NOTES.txt @@ -1,4 +1,4 @@ -Welcome to the 0.9.5 open source release of the Simulation Engine! 
SimEng is a fast, easy to use and modify, cycle-level simulator for CPUs. Its initial focus is on simulating single Arm cores in server CPUs, and so the instruction set architecture (ISA) target is initially Arm9.2-a+SVE+SME. RISC-V has also seen some inital support, starting with rv64ima. Later versions of the Arm ISA, additional RISC-V ISA extensions and other ISAs, will be supported in future releases. +Welcome to the 0.9.6 open source release of the Simulation Engine! SimEng is a fast, easy to use and modify, cycle-level simulator for CPUs. Its initial focus is on simulating single Arm cores in server CPUs, and so the instruction set architecture (ISA) target is initially Arm9.2-a+SVE+SME. There is also RISC-V support currently covering the rv64imafdc set of extensions. Later versions of the Arm ISA, additional RISC-V ISA extensions and other ISAs, will be supported in future releases. This SimEng release should be considered beta software - you are the first users outside of the core developers, so there *will* be lots of bugs and missing features. The good news is that the code has been designed to be simple to understand and modify, so if you find a bug, such as a missing instruction or missing system call, please submit a PR, or report an issue and we’ll get to it as soon as we can. @@ -6,11 +6,11 @@ You can find all the user and developer documentation on the SimEng webpage: https://uob-hpc.github.io/SimEng/index.html -SimEng should be fast, much faster than other cycle-level CPU simulators. The fastest hardware we’ve run it on is an M1-Ultra Mac Studio, where we’ve measured SimEng running at up to ~1 MIPS for the Out of Order (OoO) core in our ThunderX2 and A64fx models. SVE (vector) codes are more complex to simulate and tend to be slower - on the same hardware we saw simulated SVE codes running at around 0.6 MIPS. +SimEng should be fast, much faster than other cycle-level CPU simulators. The fastest hardware we’ve run it on is an M1-Ultra Mac Studio, where we’ve measured SimEng running at up to 1.4 MIPS for the Out of Order (OoO) core in our ThunderX2 and A64fx models. SVE (vector) and SME (matrix) codes are more complex to simulate and tend to be slower - on the same hardware we saw simulated SVE codes running at around 1.1 MIPS and SME codes at around 0.9 MIPS. Things that should work in this release include: * Static AArch64, with SVE/SME, binaries compiled with either GCC 7.3 or later, or Arm Clang 20/LLVM 9 or later -* Static RISC-V binaries targetting rv64ima and compiled with GCC 12.2.0 +* Static RISC-V binaries targetting rv64imafdc and compiled with GCC 12.2.0 * Single thread OpenMP programs * Binaries compiled from Fortran or C programs @@ -25,7 +25,7 @@ Current limitations (to be addressed in a future release): * We don't currently support running MPI programs (we’ve had a statically linked OpenMPI single rank program run correctly, but this was painful to build) * We only support a subset of Linux's system calls, and these run under emulation. The subset of 43 calls is enough to run all the codes we've tried so far * Only partial support for micro-oping, mainly for Load and Store operations -* Only partial support for the ISA - we’ve focused on implementing the instructions we’ve seen generated by the compiler for all the test cases we’ve considered, about 1000 different AArch64 instructions (~15% of the entire ISA) and about 150 different RISC-V instructions so far (~33% of the RISC-V ISA supported by Capstone). 
You’re likely to come across “instruction not implemented” errors, just let us know when you see these +* Only partial support for the ISA - we’ve focused on implementing the instructions we’ve seen generated by the compiler for all the test cases we’ve considered, about 1000 different AArch64 instructions (~16% of the entire ISA) and about 210 different RISC-V instructions so far (~46% of the RISC-V ISA supported by Capstone). You’re likely to come across “instruction not implemented” errors, just let us know when you see these * Single core, single thread only for now * Internally, SimEng currently only supports a very simple memory model, assuming all load/stores will hit the L1 cache. However, we have a tested integration with the SST framework (http://sst-simulator.org) allowing for a full memory model to be simulated. @@ -33,7 +33,7 @@ Supported OSs (we’ve tested these, SimEng may also work on other platforms): * CentOS 7 * Ubuntu * Cray’s CLE 7, which is based on SLES 15 -* macOS Big Sur/Monterey +* macOS Monterey/Ventura/Sonoma Compilers supported for building SimEng itself: * GCC 7 or later diff --git a/SimEngDefaultProgram b/SimEngDefaultProgram new file mode 100755 index 0000000000..3e43fbe564 Binary files /dev/null and b/SimEngDefaultProgram differ diff --git a/configs/DEMO_RISCV.yaml b/configs/DEMO_RISCV.yaml index e5a11d3c54..0d64ca296e 100644 --- a/configs/DEMO_RISCV.yaml +++ b/configs/DEMO_RISCV.yaml @@ -1,17 +1,17 @@ ---- # This file is based off of the current tx2.yaml config and serves as an example configuration for RISC-V cores. # The following resources where utilised to create the config file and naming schemes: # https://en.wikichip.org/wiki/cavium/microarchitectures/vulcan Core: ISA: rv64 + Compressed: True Simulation-Mode: outoforder - Clock-Frequency: 2.5 - Fetch-Block-Size: 32 + Clock-Frequency-GHz: 2.5 + Timer-Frequency-MHz: 200 Fetch: Fetch-Block-Size: 32 - Loop-Buffer-Size: 0 - Loop-Detection-Threshold: 0 + Loop-Buffer-Size: 64 + Loop-Detection-Threshold: 4 Process-Image: Heap-Size: 1073741824 Stack-Size: 1048576 @@ -20,21 +20,19 @@ Register-Set: FloatingPoint-Count: 90 Pipeline-Widths: Commit: 4 - Dispatch-Rate: 4 FrontEnd: 4 LSQ-Completion: 2 Queue-Sizes: ROB: 180 Load: 64 Store: 36 +Port-Allocator: + Type: Balanced Branch-Predictor: + Type: "Perceptron" BTB-Tag-Bits: 11 - Saturating-Count-Bits: 2 - Global-History-Length: 10 + Global-History-Length: 19 RAS-entries: 1 - Fallback-Static-Predictor: "Always-Taken" - Branch-Predictor: - BTB-bitlength: 16 L1-Data-Memory: Interface-Type: Fixed L1-Instruction-Memory: @@ -50,30 +48,32 @@ LSQ-L1-Interface: Ports: 0: Portname: Port 0 - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - INT_MUL + - FLOAT 1: Portname: Port 1 - Instruction-Support: + Instruction-Group-Support: - INT + - FLOAT 2: Portname: Port 2 - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - INT_MUL - BRANCH 3: Portname: Port 4 - Instruction-Support: + Instruction-Group-Support: - LOAD 4: Portname: Port 5 - Instruction-Support: + Instruction-Group-Support: - LOAD 5: Portname: Port 3 - Instruction-Support: + Instruction-Group-Support: - STORE Reservation-Stations: 0: @@ -113,20 +113,40 @@ Latencies: Execution-Throughput: 1 2: Instruction-Groups: - - INT_DIV + - INT_DIV_OR_SQRT Execution-Latency: 39 Execution-Throughput: 39 + 3: + Instruction-Groups: + - FLOAT_SIMPLE_CMP + Execution-Latency: 5 + Execution-Throughput: 1 + 4: + Instruction-Groups: + - FLOAT_MUL + Execution-Latency: 6 + Execution-Throughput: 1 + 5: + 
Instruction-Groups: + - FLOAT_SIMPLE_CVT + Execution-Latency: 7 + Execution-Throughput: 1 + 6: + Instruction-Groups: + - FLOAT_DIV_OR_SQRT + Execution-Latency: 16 + Execution-Throughput: 16 # CPU-Info mainly used to generate a replica of the special (or system) file directory # structure CPU-Info: # Set Generate-Special-Dir to 'T' to generate the special files directory, or to 'F' to not. # (Not generating the special files directory may require the user to copy over files manually) - Generate-Special-Dir: true + Generate-Special-Dir: True # Core-Count MUST be 1 as multi-core is not supported at this time. (TX2 true value is 32) Core-Count: 1 # Socket-Count MUST be 1 as multi-socket simulations are not supported at this time. (TX2 true value is 2) Socket-Count: 1 - # SMT MUST be 1 as Simultanious-Multi-Threading is not supported at this time. (TX2 true value is 4) + # SMT MUST be 1 as Simultaneous-Multi-Threading is not supported at this time. (TX2 true value is 4) SMT: 1 # Below are the values needed to generate /proc/cpuinfo BogoMIPS: 400.00 diff --git a/configs/a64fx.yaml b/configs/a64fx.yaml index 1ace725d60..c3bd652dc1 100644 --- a/configs/a64fx.yaml +++ b/configs/a64fx.yaml @@ -1,14 +1,11 @@ ---- # The following resources where utilised to create the config file and naming schemes: # https://github.com/fujitsu/A64FX Core: ISA: AArch64 Simulation-Mode: outoforder - # Clock Frequency is in GHz. - Clock-Frequency: 1.8 - # Timer-Frequency is in MHz. - Timer-Frequency: 100 + Clock-Frequency-GHz: 1.8 + Timer-Frequency-MHz: 100 Micro-Operations: True Vector-Length: 512 Fetch: @@ -31,12 +28,13 @@ Queue-Sizes: ROB: 128 Load: 40 Store: 24 +Port-Allocator: + Type: Balanced Branch-Predictor: + Type: "Perceptron" BTB-Tag-Bits: 11 - Saturating-Count-Bits: 2 - Global-History-Length: 11 + Global-History-Length: 19 RAS-entries: 8 - Fallback-Static-Predictor: "Always-Taken" L1-Data-Memory: Interface-Type: Fixed L1-Instruction-Memory: @@ -52,34 +50,34 @@ LSQ-L1-Interface: Ports: 0: Portname: FLA - Instruction-Support: + Instruction-Group-Support: - FP - SVE 1: Portname: PR - Instruction-Support: + Instruction-Group-Support: - PREDICATE 2: Portname: EXA - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - INT_MUL - STORE_DATA 3: Portname: FLB - Instruction-Support: + Instruction-Group-Support: - FP_SIMPLE - FP_MUL - SVE_SIMPLE - SVE_MUL 4: Portname: EXB - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - INT_DIV_OR_SQRT 5: Portname: EAGA - Instruction-Support: + Instruction-Group-Support: - LOAD - STORE_ADDRESS - INT_SIMPLE_ARTH_NOSHIFT @@ -87,7 +85,7 @@ Ports: - INT_SIMPLE_CMP 6: Portname: EAGB - Instruction-Support: + Instruction-Group-Support: - LOAD - STORE_ADDRESS - INT_SIMPLE_ARTH_NOSHIFT @@ -95,7 +93,7 @@ Ports: - INT_SIMPLE_CMP 7: Portname: BR - Instruction-Support: + Instruction-Group-Support: - BRANCH Reservation-Stations: 0: @@ -113,17 +111,17 @@ Reservation-Stations: - EXB 2: Size: 10 - Dispatch-Rate: 2 + Dispatch-Rate: 1 Ports: - EAGA 3: Size: 10 - Dispatch-Rate: 2 + Dispatch-Rate: 1 Ports: - EAGB 4: Size: 19 - Dispatch-Rate: 2 + Dispatch-Rate: 1 Ports: - BR Execution-Units: @@ -245,6 +243,125 @@ Latencies: - STORE_ADDRESS_SVE Execution-Latency: 6 Execution-Throughput: 1 +# Indexed FMLA instructions split into 2 dependent µops. 
Latency increased to 15 to mimic such behaviour +# NOTE: Any changes to the capstone opcode list could invalidate the mapping between ARM instructions and the values below + 11: + Instruction-Opcodes: + - 1922 + - 1924 + - 1926 + - 2359 + - 2360 + - 2361 + - 2364 + - 2365 + - 2368 + - 2369 + - 2371 + - 2390 + - 2391 + - 2392 + - 2395 + - 2396 + - 2399 + - 2400 + - 2402 + - 2445 + - 2446 + - 2447 + - 2450 + - 2451 + - 2454 + - 2455 + - 2457 + - 2470 + - 2471 + - 2472 + - 2475 + - 2476 + - 2479 + - 2480 + - 2482 + - 3627 + - 3629 + - 3631 + - 3633 + - 3644 + - 3646 + - 3648 + - 3650 + - 3709 + - 3711 + - 3713 + - 3715 + - 4306 + - 4308 + - 4310 + - 4312 + - 4326 + - 4328 + - 4330 + - 4332 + - 4372 + - 4374 + - 4376 + - 4378 + - 4468 + - 4469 + - 4470 + - 4472 + - 4474 + - 4476 + - 4493 + - 4494 + - 4495 + - 4497 + - 4499 + - 4501 + - 4511 + - 4513 + - 4515 + - 4517 + - 4519 + - 4521 + - 4534 + - 4535 + - 4536 + - 4538 + - 4540 + - 4542 + - 4594 + - 4595 + - 4599 + - 4601 + - 4603 + - 4605 + - 4613 + - 4614 + - 4618 + - 4620 + - 4622 + - 4624 + - 4633 + - 4635 + - 4637 + - 4639 + - 4641 + - 4643 + - 5760 + - 5762 + - 5764 + - 5766 + - 5780 + - 5782 + - 5784 + - 5786 + - 5824 + - 5826 + - 5828 + - 5830 + Execution-Latency: 15 + Execution-Throughput: 1 # CPU-Info mainly used to generate a replica of the special (or system) file directory # structure CPU-Info: diff --git a/configs/a64fx_SME.yaml b/configs/a64fx_SME.yaml index f304234ee8..5e46a196eb 100644 --- a/configs/a64fx_SME.yaml +++ b/configs/a64fx_SME.yaml @@ -1,14 +1,11 @@ ---- # The following resources where utilised to create the config file and naming schemes: # https://github.com/fujitsu/A64FX Core: Simulation-Mode: outoforder ISA: AArch64 - # Clock Frequency is in GHz. - Clock-Frequency: 1.8 - # Timer-Frequency is in MHz. 
- Timer-Frequency: 100 + Clock-Frequency-GHz: 1.8 + Timer-Frequency-MHz: 100 Micro-Operations: True Vector-Length: 512 Streaming-Vector-Length: 512 @@ -24,7 +21,8 @@ Register-Set: FloatingPoint/SVE-Count: 128 Predicate-Count: 48 Conditional-Count: 128 - Matrix-Count: 2 + SME-Matrix-Count: 2 + SME-Lookup-Table-Count: 8 Pipeline-Widths: Commit: 4 FrontEnd: 4 @@ -33,12 +31,13 @@ Queue-Sizes: ROB: 128 Load: 40 Store: 24 +Port-Allocator: + Type: Balanced Branch-Predictor: + Type: "Perceptron" BTB-Tag-Bits: 11 - Saturating-Count-Bits: 2 - Global-History-Length: 11 + Global-History-Length: 19 RAS-entries: 8 - Fallback-Static-Predictor: "Always-Taken" L1-Data-Memory: Interface-Type: Fixed L1-Instruction-Memory: @@ -54,72 +53,56 @@ LSQ-L1-Interface: Ports: 0: Portname: FLA - Instruction-Support: + Instruction-Group-Support: - FP - SVE 1: Portname: PR - Instruction-Support: + Instruction-Group-Support: - PREDICATE 2: Portname: EXA - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - INT_MUL - STORE_DATA 3: Portname: FLB - Instruction-Support: + Instruction-Group-Support: - FP_SIMPLE - FP_MUL - SVE_SIMPLE - SVE_MUL 4: Portname: EXB - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - INT_DIV_OR_SQRT 5: Portname: EAGA - Instruction-Support: - - LOAD_INT - - LOAD_SCALAR - - LOAD_VECTOR - - LOAD_SVE - - STORE_INT - - STORE_SCALAR - - STORE_VECTOR - - STORE_SVE + Instruction-Group-Support: + - LOAD + - STORE_ADDRESS - INT_SIMPLE_ARTH_NOSHIFT - INT_SIMPLE_LOGICAL_NOSHIFT - INT_SIMPLE_CMP 6: Portname: EAGB - Instruction-Support: - - LOAD_INT - - LOAD_SCALAR - - LOAD_VECTOR - - LOAD_SVE - - STORE_INT - - STORE_SCALAR - - STORE_VECTOR - - STORE_SVE + Instruction-Group-Support: + - LOAD + - STORE_ADDRESS - INT_SIMPLE_ARTH_NOSHIFT - INT_SIMPLE_LOGICAL_NOSHIFT - INT_SIMPLE_CMP 7: Portname: BR - Instruction-Support: + Instruction-Group-Support: - BRANCH +# Define example SME unit 8: Portname: SME - Instruction-Support: + Instruction-Group-Support: - SME - 9: - Portname: SME_LD_STR - Instruction-Support: - - LOAD_SME - - STORE_SME Reservation-Stations: 0: Size: 20 @@ -136,29 +119,24 @@ Reservation-Stations: - EXB 2: Size: 10 - Dispatch-Rate: 2 + Dispatch-Rate: 1 Ports: - EAGA 3: Size: 10 - Dispatch-Rate: 2 + Dispatch-Rate: 1 Ports: - EAGB 4: Size: 19 - Dispatch-Rate: 2 + Dispatch-Rate: 1 Ports: - BR 5: Size: 20 - Dispatch-Rate: 2 + Dispatch-Rate: 1 Ports: - SME - 6: - Size: 10 - Dispatch-Rate: 2 - Ports: - - SME_LD_STR Execution-Units: 0: Pipelined: True @@ -214,12 +192,6 @@ Execution-Units: - INT_DIV_OR_SQRT - FP_DIV_OR_SQRT - SVE_DIV_OR_SQRT - 9: - Pipelined: True - Blocking-Groups: - - INT_DIV_OR_SQRT - - FP_DIV_OR_SQRT - - SVE_DIV_OR_SQRT Latencies: 0: Instruction-Groups: @@ -248,8 +220,10 @@ Latencies: - SCALAR_SIMPLE - VECTOR_SIMPLE_LOGICAL - SVE_SIMPLE_LOGICAL + - SME_SIMPLE_LOGICAL - VECTOR_SIMPLE_CMP - SVE_SIMPLE_CMP + - SME_SIMPLE_CMP Execution-Latency: 4 Execution-Throughput: 1 5: @@ -258,18 +232,20 @@ Latencies: Execution-Latency: 29 Execution-Throughput: 29 6: - Instruction-Groups: + Instruction-Groups: + - SCALAR_SIMPLE_CVT - VECTOR_SIMPLE - SVE_SIMPLE - - SCALAR_SIMPLE_CVT + - SME_SIMPLE - FP_MUL - SVE_MUL - - SME + - SME_MUL Execution-Latency: 9 Execution-Throughput: 1 7: Instruction-Groups: - SVE_DIV_OR_SQRT + - SME_DIV_OR_SQRT Execution-Latency: 98 Execution-Throughput: 98 8: @@ -288,34 +264,129 @@ Latencies: 10: Instruction-Groups: - LOAD_SVE - - STORE_ADDRESS_SVE - LOAD_SME + - STORE_ADDRESS_SVE - STORE_ADDRESS_SME Execution-Latency: 6 Execution-Throughput: 1 +# 
Indexed FMLA instructions split into 2 dependent µops. Latency increased to 15 to mimic such behaviour +# NOTE: Any changes to the capstone opcode list could invalidate the mapping between ARM instructions and the values below 11: - Instruction-Groups: - - SME_SIMPLE_LOGICAL - - SME_SIMPLE_CMP - # Same as SVE - Execution-Latency: 4 - Execution-Throughput: 1 - 12: - Instruction-Groups: - - SME_SIMPLE - - SME_DIV_OR_SQRT - - SME_MUL - # SME_MUL Used only by outer-product instructions - # Same as SVE. No SME DIV or SQRT so classification to this group should be impossible. - # Kept to catch edge cases. - Execution-Latency: 9 - Execution-Throughput: 1 - 13: - Instruction-Groups: - - LOAD_SME - - STORE_ADDRESS_SME - # Same as SVE LD/STR - Execution-Latency: 6 + Instruction-Opcodes: + - 1922 + - 1924 + - 1926 + - 2359 + - 2360 + - 2361 + - 2364 + - 2365 + - 2368 + - 2369 + - 2371 + - 2390 + - 2391 + - 2392 + - 2395 + - 2396 + - 2399 + - 2400 + - 2402 + - 2445 + - 2446 + - 2447 + - 2450 + - 2451 + - 2454 + - 2455 + - 2457 + - 2470 + - 2471 + - 2472 + - 2475 + - 2476 + - 2479 + - 2480 + - 2482 + - 3627 + - 3629 + - 3631 + - 3633 + - 3644 + - 3646 + - 3648 + - 3650 + - 3709 + - 3711 + - 3713 + - 3715 + - 4306 + - 4308 + - 4310 + - 4312 + - 4326 + - 4328 + - 4330 + - 4332 + - 4372 + - 4374 + - 4376 + - 4378 + - 4468 + - 4469 + - 4470 + - 4472 + - 4474 + - 4476 + - 4493 + - 4494 + - 4495 + - 4497 + - 4499 + - 4501 + - 4511 + - 4513 + - 4515 + - 4517 + - 4519 + - 4521 + - 4534 + - 4535 + - 4536 + - 4538 + - 4540 + - 4542 + - 4594 + - 4595 + - 4599 + - 4601 + - 4603 + - 4605 + - 4613 + - 4614 + - 4618 + - 4620 + - 4622 + - 4624 + - 4633 + - 4635 + - 4637 + - 4639 + - 4641 + - 4643 + - 5760 + - 5762 + - 5764 + - 5766 + - 5780 + - 5782 + - 5784 + - 5786 + - 5824 + - 5826 + - 5828 + - 5830 + Execution-Latency: 15 Execution-Throughput: 1 # CPU-Info mainly used to generate a replica of the special (or system) file directory # structure @@ -339,4 +410,4 @@ CPU-Info: CPU-Revision: 0 # Package-Count is used to generate # /sys/devices/system/cpu/cpu{0..Core-Count}/topology/{physical_package_id, core_id} - Package-Count: 1 \ No newline at end of file + Package-Count: 1 diff --git a/configs/m1_firestorm.yaml b/configs/m1_firestorm.yaml index 6bba7aad33..822856dac5 100644 --- a/configs/m1_firestorm.yaml +++ b/configs/m1_firestorm.yaml @@ -2,8 +2,8 @@ Core: ISA: AArch64 Simulation-Mode: outoforder - Clock-Frequency: 3.2 - Timer-Frequency: 100 + Clock-Frequency-GHz: 3.2 + Timer-Frequency-MHz: 100 Micro-Operations: True Fetch: Fetch-Block-Size: 64 @@ -24,12 +24,13 @@ Queue-Sizes: ROB: 630 Load: 130 Store: 60 +Port-Allocator: + Type: M1 Branch-Predictor: - BTB-Tag-Bits: 11 - Saturating-Count-Bits: 2 - Global-History-Length: 11 - RAS-entries: 8 - Fallback-Static-Predictor: "Always-Taken" + Type: "Perceptron" + BTB-Tag-Bits: 11 + Global-History-Length: 19 + RAS-entries: 8 L1-Data-Memory: Interface-Type: Fixed L1-Instruction-Memory: @@ -45,68 +46,68 @@ LSQ-L1-Interface: Ports: 0: Portname: INT1 - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - BRANCH 1: Portname: INT2 - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - BRANCH 2: Portname: INT3 - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE 3: Portname: INT4 - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE 4: Portname: INT5 - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - INT_MUL - INT_DIV_OR_SQRT 5: Portname: INT6 - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - INT_MUL 
6: Portname: LS1 - Instruction-Support: + Instruction-Group-Support: - STORE 7: Portname: LS2 - Instruction-Support: + Instruction-Group-Support: - LOAD - STORE 8: Portname: LS3 - Instruction-Support: + Instruction-Group-Support: - LOAD 9: Portname: LS4 - Instruction-Support: + Instruction-Group-Support: - LOAD 10: Portname: FP_SIMD1 - Instruction-Support: + Instruction-Group-Support: - FP - VECTOR 11: Portname: FP_SIMD2 - Instruction-Support: + Instruction-Group-Support: - FP - VECTOR 12: Portname: FP_SIMD3 - Instruction-Support: + Instruction-Group-Support: - FP - VECTOR 13: Portname: FP_SIMD4 - Instruction-Support: + Instruction-Group-Support: - FP - VECTOR Reservation-Stations: diff --git a/configs/sst-cores/a64fx-sst.yaml b/configs/sst-cores/a64fx-sst.yaml index 643398b6ed..366d3fe652 100644 --- a/configs/sst-cores/a64fx-sst.yaml +++ b/configs/sst-cores/a64fx-sst.yaml @@ -1,14 +1,11 @@ ---- # The following resources where utilised to create the config file and naming schemes: # https://github.com/fujitsu/A64FX Core: ISA: AArch64 Simulation-Mode: outoforder - # Clock Frequency is in GHz. - Clock-Frequency: 1.8 - # Timer-Frequency is in MHz. - Timer-Frequency: 100 + Clock-Frequency-GHz: 1.8 + Timer-Frequency-MHz: 100 Micro-Operations: True Vector-Length: 512 Fetch: @@ -31,12 +28,13 @@ Queue-Sizes: ROB: 128 Load: 40 Store: 24 +Port-Allocator: + Type: Balanced Branch-Predictor: + Type: "Perceptron" BTB-Tag-Bits: 11 - Saturating-Count-Bits: 2 - Global-History-Length: 11 + Global-History-Length: 19 RAS-entries: 8 - Fallback-Static-Predictor: "Always-Taken" L1-Data-Memory: Interface-Type: External L1-Instruction-Memory: @@ -52,34 +50,34 @@ LSQ-L1-Interface: Ports: 0: Portname: FLA - Instruction-Support: + Instruction-Group-Support: - FP - SVE 1: Portname: PR - Instruction-Support: + Instruction-Group-Support: - PREDICATE 2: Portname: EXA - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - INT_MUL - STORE_DATA 3: Portname: FLB - Instruction-Support: + Instruction-Group-Support: - FP_SIMPLE - FP_MUL - SVE_SIMPLE - SVE_MUL 4: Portname: EXB - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - INT_DIV_OR_SQRT 5: Portname: EAGA - Instruction-Support: + Instruction-Group-Support: - LOAD - STORE_ADDRESS - INT_SIMPLE_ARTH_NOSHIFT @@ -87,7 +85,7 @@ Ports: - INT_SIMPLE_CMP 6: Portname: EAGB - Instruction-Support: + Instruction-Group-Support: - LOAD - STORE_ADDRESS - INT_SIMPLE_ARTH_NOSHIFT @@ -95,7 +93,7 @@ Ports: - INT_SIMPLE_CMP 7: Portname: BR - Instruction-Support: + Instruction-Group-Support: - BRANCH Reservation-Stations: 0: @@ -255,7 +253,7 @@ CPU-Info: Core-Count: 1 # Socket-Count MUST be 1 as multi-socket simulations are not supported at this time. (A64FX true value is 1) Socket-Count: 1 - # SMT MUST be 1 as Simultanious-Multi-Threading is not supported at this time. (A64FX true value is 1) + # SMT MUST be 1 as Simultaneous-Multi-Threading is not supported at this time. 
(A64FX true value is 1) SMT: 1 # Below are the values needed to generate /proc/cpuinfo BogoMIPS: 200.00 diff --git a/configs/sst-cores/m1_firestorm-sst.yaml b/configs/sst-cores/m1_firestorm-sst.yaml index 53d3f9846f..ce0302ecc8 100644 --- a/configs/sst-cores/m1_firestorm-sst.yaml +++ b/configs/sst-cores/m1_firestorm-sst.yaml @@ -2,8 +2,8 @@ Core: ISA: AArch64 Simulation-Mode: outoforder - Clock-Frequency: 3.2 - Timer-Frequency: 100 + Clock-Frequency-GHz: 3.2 + Timer-Frequency-MHz: 100 Micro-Operations: True Fetch: Fetch-Block-Size: 64 @@ -24,12 +24,13 @@ Queue-Sizes: ROB: 630 Load: 130 Store: 60 +Port-Allocator: + Type: M1 Branch-Predictor: + Type: "Perceptron" BTB-Tag-Bits: 11 - Saturating-Count-Bits: 2 - Global-History-Length: 11 + Global-History-Length: 11 RAS-entries: 8 - Fallback-Static-Predictor: "Always-Taken" L1-Data-Memory: Interface-Type: External L1-Instruction-Memory: @@ -45,68 +46,68 @@ LSQ-L1-Interface: Ports: 0: Portname: INT1 - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - BRANCH 1: Portname: INT2 - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - BRANCH 2: Portname: INT3 - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE 3: Portname: INT4 - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE 4: Portname: INT5 - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - INT_MUL - INT_DIV_OR_SQRT 5: Portname: INT6 - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - INT_MUL 6: Portname: LS1 - Instruction-Support: + Instruction-Group-Support: - STORE 7: Portname: LS2 - Instruction-Support: + Instruction-Group-Support: - LOAD - STORE 8: Portname: LS3 - Instruction-Support: + Instruction-Group-Support: - LOAD 9: Portname: LS4 - Instruction-Support: + Instruction-Group-Support: - LOAD 10: Portname: FP_SIMD1 - Instruction-Support: + Instruction-Group-Support: - FP - VECTOR 11: Portname: FP_SIMD2 - Instruction-Support: + Instruction-Group-Support: - FP - VECTOR 12: Portname: FP_SIMD3 - Instruction-Support: + Instruction-Group-Support: - FP - VECTOR 13: Portname: FP_SIMD4 - Instruction-Support: + Instruction-Group-Support: - FP - VECTOR Reservation-Stations: diff --git a/configs/sst-cores/tx2-sst.yaml b/configs/sst-cores/tx2-sst.yaml index 9d31ee2c2e..174b30f732 100644 --- a/configs/sst-cores/tx2-sst.yaml +++ b/configs/sst-cores/tx2-sst.yaml @@ -1,14 +1,11 @@ ---- # The following resources where utilised to create the config file and naming schemes: # https://en.wikichip.org/wiki/cavium/microarchitectures/vulcan Core: ISA: AArch64 Simulation-Mode: outoforder - # Clock Frequency is in GHz. - Clock-Frequency: 2.5 - # Timer-Frequency is in MHz. 
- Timer-Frequency: 200 + Clock-Frequency-GHz: 2.5 + Timer-Frequency-MHz: 200 Micro-Operations: True Fetch: Fetch-Block-Size: 32 @@ -29,12 +26,13 @@ Queue-Sizes: ROB: 180 Load: 64 Store: 36 +Port-Allocator: + Type: Balanced Branch-Predictor: + Type: "Perceptron" BTB-Tag-Bits: 11 - Saturating-Count-Bits: 2 Global-History-Length: 10 RAS-entries: 5 - Fallback-Static-Predictor: "Always-Taken" L1-Data-Memory: Interface-Type: External L1-Instruction-Memory: @@ -50,34 +48,34 @@ LSQ-L1-Interface: Ports: 0: Portname: Port 0 - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - INT_MUL - FP 1: Portname: Port 1 - Instruction-Support: + Instruction-Group-Support: - INT - FP 2: Portname: Port 2 - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - INT_MUL - BRANCH 3: Portname: Port 4 - Instruction-Support: + Instruction-Group-Support: - LOAD - STORE_ADDRESS 4: Portname: Port 5 - Instruction-Support: + Instruction-Group-Support: - LOAD - STORE_ADDRESS 5: Portname: Port 3 - Instruction-Support: + Instruction-Group-Support: - STORE_DATA Reservation-Stations: 0: diff --git a/configs/tx2.yaml b/configs/tx2.yaml index 22b823bb5b..45a8bb498b 100644 --- a/configs/tx2.yaml +++ b/configs/tx2.yaml @@ -1,14 +1,11 @@ ---- # The following resources where utilised to create the config file and naming schemes: # https://en.wikichip.org/wiki/cavium/microarchitectures/vulcan Core: ISA: AArch64 Simulation-Mode: outoforder - # Clock Frequency is in GHz. - Clock-Frequency: 2.5 - # Timer-Frequency is in MHz. - Timer-Frequency: 200 + Clock-Frequency-GHz: 2.5 + Timer-Frequency-MHz: 200 Micro-Operations: True Fetch: Fetch-Block-Size: 32 @@ -29,12 +26,13 @@ Queue-Sizes: ROB: 180 Load: 64 Store: 36 +Port-Allocator: + Type: Balanced Branch-Predictor: + Type: "Perceptron" BTB-Tag-Bits: 11 - Saturating-Count-Bits: 2 - Global-History-Length: 10 + Global-History-Length: 19 RAS-entries: 5 - Fallback-Static-Predictor: "Always-Taken" L1-Data-Memory: Interface-Type: Fixed L1-Instruction-Memory: @@ -50,34 +48,34 @@ LSQ-L1-Interface: Ports: 0: Portname: Port 0 - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - INT_MUL - FP 1: Portname: Port 1 - Instruction-Support: + Instruction-Group-Support: - INT - FP 2: Portname: Port 2 - Instruction-Support: + Instruction-Group-Support: - INT_SIMPLE - INT_MUL - BRANCH 3: Portname: Port 4 - Instruction-Support: + Instruction-Group-Support: - LOAD - STORE_ADDRESS 4: Portname: Port 5 - Instruction-Support: + Instruction-Group-Support: - LOAD - STORE_ADDRESS 5: Portname: Port 3 - Instruction-Support: + Instruction-Group-Support: - STORE_DATA Reservation-Stations: 0: diff --git a/docs/requirements.txt b/docs/requirements.txt index 176c63857f..58104ce890 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,5 @@ -Sphinx==1.6.7 +Sphinx==5.0.0 sphinx-rtd-theme==0.5.2 mistune==0.8.4 m2r2==0.3.1 -Jinja2==3.0.3 \ No newline at end of file +Jinja2==3.1.3 \ No newline at end of file diff --git a/docs/sphinx/assets/expectations.png b/docs/sphinx/assets/expectations.png new file mode 100644 index 0000000000..8dfb969d42 Binary files /dev/null and b/docs/sphinx/assets/expectations.png differ diff --git a/docs/sphinx/assets/instruction_groups.png b/docs/sphinx/assets/instruction_groups.png deleted file mode 100644 index bf5bf5c73a..0000000000 Binary files a/docs/sphinx/assets/instruction_groups.png and /dev/null differ diff --git a/docs/sphinx/assets/instruction_groups_RISCV.png b/docs/sphinx/assets/instruction_groups_RISCV.png index 0294ff98a7..56e54a42db 
100644 Binary files a/docs/sphinx/assets/instruction_groups_RISCV.png and b/docs/sphinx/assets/instruction_groups_RISCV.png differ diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py index df34a82c8b..b67915721c 100644 --- a/docs/sphinx/conf.py +++ b/docs/sphinx/conf.py @@ -20,7 +20,7 @@ # -- Project information ----------------------------------------------------- project = u'SimEng' -copyright = u'2021, SimEng developers' +copyright = u'2024, SimEng developers' author = u'SimEng developers' # The short X.Y version diff --git a/docs/sphinx/developer/arch/index.rst b/docs/sphinx/developer/arch/index.rst index 6541ba55e2..e8f90085fd 100644 --- a/docs/sphinx/developer/arch/index.rst +++ b/docs/sphinx/developer/arch/index.rst @@ -5,7 +5,7 @@ SimEng architecture definitions are responsible for describing the features and To achieve this, SimEng defines a set of abstract architecture-related classes. Discrete implementations of these classes are provided for each of the ISAs SimEng supports by default, and must also be implemented for adding support for new or custom ISAs. -ISA support is achieved through the use of the `Capstone `_ disassembly framework, which disassembles a binary instruction into a C/C++ object that include operand registers, access types, and immediate values to name a few. In order to update SimEng's AArch64 support from Armv8.4-a to Armv9.2-a, we undertook a Capstone update to allow for disassembly of the Armv9.2-a ISA. The work done for this can be found `here `_, and other useful ISA updating tools present in Capstone can be found `here `_. +ISA support is achieved through the use of an in-house fork of the `Capstone `_ disassembly framework, which disassembles a binary instruction into a C/C++ object that includes operand registers, access types, and immediate values to name a few. In order to update SimEng's AArch64 support from Armv8.4-a to Armv9.2-a, we undertook a Capstone update to allow for disassembly of the Armv9.2-a ISA. The work done for this can be found `here `_. Extensive work continues to be done to Capstone by its community in order to allow for easier updating of ISA versions. This facilitated the recent update of SimEng's AArch64 ISA support to Armv9.4-a. Below provides more information on the abstract structure of a SimEng architecture and currently supported ISAs. diff --git a/docs/sphinx/developer/arch/supported/aarch64.rst b/docs/sphinx/developer/arch/supported/aarch64.rst index 8754f35e74..464113e8fc 100644 --- a/docs/sphinx/developer/arch/supported/aarch64.rst +++ b/docs/sphinx/developer/arch/supported/aarch64.rst @@ -1,7 +1,7 @@ AArch64 ======= -SimEng provides an implementation of the 64-bit AArch64 architecture, specifically the Armv9.2-a ISA. This implementation provides support for decoding and executing a range of common instructions, sufficient to run a number of simple benchmarks. It is also capable of handling supervisor call (syscall) exceptions via basic system call emulation, allowing the execution of programs that have been statically compiled with the standard library. +SimEng provides an implementation of the 64-bit AArch64 architecture, specifically the Armv9.4-a ISA. This implementation provides support for decoding and executing a range of common instructions, sufficient to run a number of simple benchmarks. It is also capable of handling supervisor call (syscall) exceptions via basic system call emulation, allowing the execution of programs that have been statically compiled with the standard library. .. 
contents:: Contents @@ -10,30 +10,28 @@ Decoding Instruction decoding is performed using the `Capstone `_ disassembly framework. The disassembly generated by Capstone is used to determine the properties, operands, and execution behaviour of the corresponding instruction. -The logic held in ``src/lib/arch/aarch64/Instruction_decode.cc`` is primarily associated with converting the provided Capstone instruction metadata into the appropriate SimEng ``instruction`` format. Additionally, an instruction's identifiers are defined here through operand usage and opcode values. For the AArch64 architecture model, the following identifiers are defined: - -- ``isScalarData_``, operates on scalar values. -- ``isVectorData_``, operates on vector values. -- ``isSVEData_``, uses Z registers as source and/or destination operands. -- ``isSMEData_``, uses ZA tiles as source and/or destination operands. -- ``isNoShift_``, doesn't have a shift operand. -- ``isLogical_``, is a logical operation. -- ``isCompare_``, is a compare operation. -- ``isConvert_``, is a convert operation. -- ``isMultiply_``, is a multiply operation. -- ``isDivideOrSqrt_``, is a divide or square root operation. -- ``isPredicate_``, writes to a predicate register. -- ``isLoad_``, is a load operation. -- ``isStoreAddress_``, is a store address generation operation. -- ``isStoreData_``, is a store data operation. -- ``isBranch_``, is a branch operation. -- ``isRET_``, is a return instruction. -- ``isBL_``, is a branch and link instructions. +The logic held in ``src/lib/arch/aarch64/Instruction_decode.cc`` is primarily associated with converting the provided Capstone instruction metadata into the appropriate SimEng ``instruction`` format. Additionally, an instruction's type identifiers are set here through operand usage and opcode values. For the AArch64 architecture model, the following identifiers are defined in ``src/include/simeng/arch/aarch64/Instruction.hh``: + +- ``isScalarData``, operates on scalar values. +- ``isVectorData``, operates on vector values. +- ``isSVEData``, uses Z registers as source and/or destination operands. +- ``isSMEData``, uses ZA tiles as source and/or destination operands. +- ``isShift``, has a shift operand. +- ``isLogical``, is a logical operation. +- ``isCompare``, is a compare operation. +- ``isConvert``, is a convert operation. +- ``isMultiply``, is a multiply operation. +- ``isDivideOrSqrt``, is a divide or square root operation. +- ``isPredicate``, writes to a predicate register. +- ``isLoad``, is a load operation. +- ``isStoreAddress``, is a store address generation operation. +- ``isStoreData``, is a store data operation. +- ``isBranch``, is a branch operation. .. _aarch64-instruction-groups: -Instruction Groups -****************** +Instruction Groups/Opcodes +************************** Through a combination of the above identifiers, an instruction can be allocated an :ref:`instruction group `. The instruction groups available to the AArch64 ISA are detailed below: .. image:: ../../../assets/instruction_groups_AArch64.png @@ -43,7 +41,7 @@ The above diagram describes the instruction groups currently implemented for the This hierarchy-based naming convention has been chosen to provide the user with greater control over the number of instructions grouped under one name, whilst also remaining intuitive. A variety of combinations/instruction scopes can be defined through this method and only uses a small set of easily interpreted operation descriptions. 
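As a rough illustration of how this hierarchy comes together, the sketch below shows how a handful of the boolean identifiers listed earlier could be combined into a leaf group name of the ``<data type>_<operation>`` form used by the ``Instruction-Group-Support`` and ``Latencies`` sections of the YAML configs. This is an illustrative sketch only: the helper name and the string-based return are hypothetical, and SimEng's real decode logic differs::

    #include <string>

    // Hypothetical sketch: combine a few decode-time identifiers into a group
    // name of the kind accepted by the Instruction-Group-Support and Latencies
    // sections of the YAML configs (e.g. SVE_MUL, INT_SIMPLE_CMP).
    std::string inferGroup(bool isSVEData, bool isVectorData, bool isScalarData,
                           bool isMultiply, bool isDivideOrSqrt, bool isCompare,
                           bool isLogical) {
      // Data-type prefix: the most specific identifier wins, integer is the fallback.
      std::string group = isSVEData      ? "SVE"
                          : isVectorData ? "VECTOR"
                          : isScalarData ? "SCALAR"
                                         : "INT";
      // Operation suffix: SIMPLE covers anything without a more specific flag.
      if (isMultiply)
        group += "_MUL";
      else if (isDivideOrSqrt)
        group += "_DIV_OR_SQRT";
      else if (isCompare)
        group += "_SIMPLE_CMP";
      else if (isLogical)
        group += "_SIMPLE_LOGICAL";
      else
        group += "_SIMPLE";
      return group;
    }

Under this sketch an SVE multiply would map to ``SVE_MUL``, while a plain integer addition would fall through to ``INT_SIMPLE``.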
-If the supplied instruction groups don't provide a small enough scope, a Capstone opcode can be used instead (found in ``SimEng/build/_deps/capstone-lib-src/arch/AArch64/AArch64GenInstrInfo.inc``) with the format ``~{CAPSTONE_OPCODE}``. +If the supplied instruction groups don't provide a small enough scope, a numerical Capstone opcode can be used instead (found in ``SimEng/build/_deps/capstone-lib-src/arch/AArch64/AArch64GenInstrInfo.inc``). Implementation '''''''''''''' @@ -105,7 +103,7 @@ Due to the vast number of AArch64 instruction variants, instructions are only ad .. Note:: When adding support for new instructions, it's recommended to run SimEng in emulation mode for both speed, and for an execution flow that's easier to follow. -When you first run the new program through SimEng, execution will occur as normal until an unsupported instruction reaches the retirement point. This will then generate an illegal instruction exception, which the architecture will catch and provide a useful output before terminating. An example of the expected output is below:: +When you first run the new program through SimEng, execution will occur as normal until an unsupported instruction reaches the retirement point. This will then generate an not-yet-implemented exception, which the architecture will catch and provide a useful output before terminating. An example of the expected output is below:: Encountered execution not-yet-implemented exception Generated by instruction: @@ -122,27 +120,28 @@ The first step to add a new instruction (and the only, for many instructions) is There are several useful variables that execution behaviours have access to: -``operands`` +``sourceValues_`` .. _aarch64-adding-execution-behaviour-operands: - This is a vector of ``RegisterValue``, with each value corresponding to one of the input operands. For most instructions, ``operands[0]`` will be the *second* operand as written textually, as the first operand is typically the destination register. E.g., for the instruction ``add w0, w1, w2``, ``operands[0]`` will correspond to ``w1`` and ``[1]`` to ``w2``. + This is a vector of ``RegisterValue``, with each value corresponding to one of the input operands. For most instructions, ``sourceValues_[0]`` will be the *second* operand as written textually, as the first operand is typically the destination register. E.g., for the instruction ``add w0, w1, w2``, ``sourceValues_[0]`` will correspond to ``w1`` and ``[1]`` to ``w2``. - Some instructions have "implicit" register reads: these are added to the **start** of the operand array. E.g., the instruction ``b.ne #16`` implicitly reads the "NZCV" flags. In this case, ``operands[0]`` will be the value of the flag register. + Some instructions have "implicit" register reads: these are added to the **start** of the operand array. E.g., the instruction ``b.ne #16`` implicitly reads the "NZCV" flags. In this case, ``sourceValues_[0]`` will be the value of the flag register. - Some instructions have operands to which they both read and write, such as ``fmla v0.d, v1.d, v2.d`` both writing to *and* reading from ``v0.d``; in this case, ``operands[0]`` is ``v0.d``, and ``[1]`` and ``[2]`` are ``v1.d`` and ``v2.d`` respectively. + Some instructions have operands to which they both read and write, such as ``fmla v0.d, v1.d, v2.d`` both writing to *and* reading from ``v0.d``; in this case, ``sourceValues_[0]`` is ``v0.d``, and ``[1]`` and ``[2]`` are ``v1.d`` and ``v2.d`` respectively. 
- Instructions such as stores may not have any destination registers at all. In these cases, the ``operand`` indices match the positions as they appear: the first operand is ``[0]``, the second ``[1]``, and so on. + Instructions such as stores may not have any destination registers at all. In these cases, the ``sourceValues_`` indices match the positions as they appear: the first operand is ``[0]``, the second ``[1]``, and so on. ``results`` This is the output vector, into which ``RegisterValue`` instances containing the results should be placed. Each entry in the vector corresponds to a destination register. - Some instructions have "implicit" destination registers: in these cases, the implicit destinations are added to the start of the results vector. For example, ``subs w0, w1, #1`` writes explicitly to ``w0``, but also implicitly sets the "NZCV" comparison flags. In this case, ``results[0]`` is expected to be the updated NZCV flags, while ``results[1]`` is expected to be the new value of ``w0``. + Some instructions have "implicit" destination registers: in these cases, the implicit destinations are added to the **start** of the results vector. For example, ``subs w0, w1, #1`` writes explicitly to ``w0``, but also implicitly sets the "NZCV" comparison flags. In this case, ``results[0]`` is expected to be the updated NZCV flags, while ``results[1]`` is expected to be the new value of ``w0``. + Some Load and Store instructions update the base memory address register (pre- or post-indexing). These registers are also classed as implicit destination registers and thus will be added to the start of the results vector. For example, ``ldr x1, [x0], #8`` writes explicitly to ``x1`` but also writes implicitly to ``x0``. In this case, ``results[0]`` is expected to be the updated ``x0`` value (``x0 + 8``) and ``results[1]`` is expected to be the new value of ``x1``. Memory instructions may have a "writeback" variant, where the register containing the address is updated by an offset during execution. In these cases, the address register is added as a destination *after* the other registers, corresponding with the textual representation of the registers. E.g., the instruction ``ldr x1, [x2, #8]!`` will expect the value of ``x1`` in ``results[0]``, while the updated address ``x2`` should be placed in ``results[1]``. ``metadata`` - Each instruction stores a simplified form of the full disassembly metadata generated by Capstone. This is stored in the ``metadata`` member variable, and is of type ``InstructionMetadata``. The metadata object contains an ``metadata.operands`` array with entries corresponding to the textual operands of the instruction. **Note:** Unlike the instruction's ``operands`` member variable, ``metadata.operands`` entries correspond directly to their textual equivalent. For example, in the instruction ``add w0, w1, w2``, ``metadata.operands[0]`` will describe ``w0``, ``[1]`` describes ``w1``, and so on. + Each instruction stores a simplified form of the full disassembly metadata generated by Capstone. This is stored in the ``metadata`` member variable, and is of type ``InstructionMetadata``. The metadata object contains an ``metadata.operands`` array with entries corresponding to the textual operands of the instruction. **Note:** Unlike the instruction's ``sourceValues_`` member variable, ``metadata.operands`` entries correspond directly to their textual equivalent. 
For example, in the instruction ``add w0, w1, w2``, ``metadata.operands[0]`` will describe ``w0``, ``[1]`` describes ``w1``, and so on. The primary use for this data is to retrieve immediate values. For example, with the instruction ``add w0, w1, #1``, ``metadata.operands[2].imm`` would contain the value ``1``. Floating point immediates are similarly available, using ``.fp`` in place of ``.imm``. @@ -154,11 +153,13 @@ SimEng supports the Arm SVE extension and thus the use of ``Z`` vector registers Scalable Matrix Extension '''''''''''''''''''''''''' -Also supported is the Arm SME extension and thus the use of ``ZA`` sub-tile registers. The implementation of the ``ZA`` register is to treat each horizontal row the same as a vector register. Therefore, if a source operand is a sub-tile of ``ZA`` and contains 16 rows, then there will be 16 corresponding entries in the ``operands`` vector. Likewise, if a destination operand is ``ZA`` or a sub-tile of ``ZA`` then the ``results`` vector will require the corresponding number of horizontal rows. +Also supported is the Arm SME extension and thus the use of ``ZA`` sub-tile registers. The implementation of the ``ZA`` register is to treat each horizontal row the same as a vector register. Therefore, if a source operand is a sub-tile of ``ZA`` and contains 16 rows, then there will be 16 corresponding entries in the ``sourceValues_`` vector. Likewise, if a destination operand is ``ZA`` or a sub-tile of ``ZA`` then the ``results`` vector will require the corresponding number of horizontal rows. + +SME instructions can also operate on sub-tile slices; individual rows or columns within a sub-tile. Regardless of whether a whole sub-tile or a slice is used as a source operand, all rows associated with said tile will be added to the ``sourceValues_`` vector. There are two reasons for this. First, the index value pointing to the relevant slice cannot be evaluated before instruction execution, thus, all sub-tile rows need to be provided. Second, if the source slice is a vertical slice (or a column of the sub-tile) then an element from each row is needed to construct the correct output. -SME instructions can also operate on sub-tile slices; individual rows or columns within a sub-tile. Regardless of whether a whole sub-tile or a slice is used as a source operand, all rows associated with said tile will be added to the ``operands`` vector. There are two reasons for this. First, the index value pointing to the relevant slice cannot be evaluated before instruction execution, thus, all sub-tile rows need to be provided. Second, if the source slice is a vertical slice (or a column of the sub-tile) then an element from each row is needed to construct the correct output. +Furthermore, a similar situation is present when a sub-tile slice is a destination operand. The ``results`` vector will expect a ``registerValue`` entry for each row of the targeted sub-tile, again due to the same two reasons listed previously. But, when a sub-tile slice is a destination operand, **all** associated rows of the sub-tile will also be added to the ``sourceValues_`` vector. Again, this is down to two key, similar reasons. First, when a destination is a sub-tile slice, we only want to update that row or column. As the we are unable to calculate which slice will be our destination before execution has commenced, all possible slices must be added to the ``results`` vector. If we were to not provide a ``RegisterValue`` to each entry of the ``results`` vector, the default value is 0. 
Therefore, in order to not zero-out the other slices within the sub-tile we will need access to their current values. Secondly, if the destination is a vertical slice (or sub-tile column) then only one element per row should be updated; the rest should remain unchanged. -Furthermore, a similar situation is present when a sub-tile slice is a destination operand. The ``results`` vector will expect a ``registerValue`` entry for each row of the targetted sub-tile, again due to the same two reasons listed previously. But, when a sub-tile slice is a destination operand, **all** associated rows of the sub-tile will also be added to the ``operands`` vector. Again, this is down to two key, similar reasons. First, when a destination is a sub-tile slice, we only want to update that row or column. As the we are unable to calculate which slice will be our destination before execution has commenced, all possible slices must be added to the ``results`` vector. If we were to not provide a ``RegisterValue`` to each entry of the ``results`` vector, the default value is 0. Therefore, in order to not zero-out the other slices within the sub-tile we will need access to their current values. Secondly, if the destination is a vertical slice (or sub-tile column) then only one element per row should be updated; the rest should remain unchanged. +Additionally, a fixed width 512-bit register ``ZT0`` was introduced with SME2 and is also now supported by SimEng. It can be treated in the same way as an SVE vector register. Before implementing any further SME functionality we highly recommend familiarising yourself with the specification; found `here `_. @@ -186,7 +187,7 @@ cstool Capstone provides a ``cstool`` utility, which provides a visual representation of the ``metadata`` information available for any given instruction. For example, feeding it the bytes for the ``str`` instruction displayed above results in the following:: - $ cstool -d arm64 f30f1ef8 + $ cstool -d -r aarch64 f30f1ef8 0 f3 0f 1e f8 str x19, [sp, #-0x20]! op_count: 2 operands[0].type: REG = x19 @@ -202,7 +203,7 @@ Capstone provides a ``cstool`` utility, which provides a visual representation o Zero registers ************** -AArch64 provides two zero registers, ``WZR`` and ``XZR``, which are always read as 0. This implementation mirrors that behaviour, and will automatically populate the relevant ``operands`` entry with a 0-value ``RegisterValue``. +AArch64 provides two zero registers, ``WZR`` and ``XZR``, which are always read as 0. This implementation mirrors that behaviour, and will automatically populate the relevant ``sourceValues_`` entry with a 0-value ``RegisterValue``. For instructions that are capable of generating multiple results (typically flag-setting instructions), they can claim to write to one of the zero registers: in these cases, the result is discarded. This implementation supports this behaviour, and reduces the number of available ``results`` entries accordingly. @@ -211,7 +212,7 @@ Loads and stores In addition to an execution behaviour, memory instructions also require a new entry in the address generation behaviour table found in ``src/lib/arch/aarch64/Instruction_address.cc``. These entries are responsible for describing the method used to generate the addresses that these instructions will read from or write to. -Address generation is expected to generate one or more instances of ``MemoryAccessTarget``, containing an address and the number of bytes to access. 
The same variables described above (``operands``, ``metadata``) are available to use to generate these addresses. +Address generation is expected to generate one or more instances of ``MemoryAccessTarget``, containing an address and the number of bytes to access. The same variables described above (``sourceValues_``, ``metadata``) are available to use to generate these addresses. Once the addresses have been generated, they should be supplied in a vector to the ``setMemoryAddresses`` helper function. @@ -225,9 +226,7 @@ Concerning SVE & SME loads and stores, an effort should be made to merge contigu Instruction aliases ******************* -As Capstone is primarily a disassembler, it will attempt to generate the correct aliases for instructions: for example, the ``cmp w0, #0`` instruction is an alias for ``subs wzr, w0, #0``. As it's the underlying instruction that is of use (in this case, the ``subs`` instruction), this implementation includes a de-aliasing component that reverses this conversion. The logic for this may be found in ``src/lib/arch/aarch64/InstructionMetadata``. - -If a known but unsupported alias is encountered, it will generate an invalid instruction error, and the output will identify the instruction as unknown in place of the usual textual representation. It is recommended to reference a disassembled version of the program to identify what the instruction at this address should be correctly disassembled to, and implement the necessary dealiasing logic accordingly. +Although Capstone has been configured to produce the disassembly information for the "real" instruction rather than that of its (preferred) alias, the instruction's mnemonic and operand string will still be that of its alias. Hence, if an exception occurs the printed instruction information may not match the internal opcode used. Common Instruction Execution behaviour issues ********************************************* diff --git a/docs/sphinx/developer/arch/supported/riscv.rst b/docs/sphinx/developer/arch/supported/riscv.rst index 852fdf789b..f0467f86b9 100644 --- a/docs/sphinx/developer/arch/supported/riscv.rst +++ b/docs/sphinx/developer/arch/supported/riscv.rst @@ -1,7 +1,7 @@ RISCV ======= -SimEng provides an almost complete implementation of the rv64ima architecture, as well as being capable of handling some supervisor call (syscall) exceptions via basic system call emulation. This is sufficient to run many simple single threaded programs that have been statically compiled with the standard library. +SimEng provides an almost complete implementation of the rv64imafdc architecture, as well as being capable of handling some supervisor call (syscall) exceptions via basic system call emulation. This is sufficient to run many simple single threaded programs that have been statically compiled with the standard library. .. contents:: Contents @@ -10,22 +10,24 @@ Decoding Instruction decoding is performed using the `Capstone `_ disassembly framework. The disassembly generated by Capstone is used to determine the properties, operands, and execution behaviour of the corresponding instruction. -The logic held in ``src/lib/arch/riscv/Instruction_decode.cc`` is primarily associated with converting the provided Capstone instruction metadata into the appropriate SimEng ``instruction`` format. Additionally, an instruction's identifiers are defined here through operand usage and opcode values. 
For the RISC-V architecture model, the following identifiers are defined: +The logic held in ``src/lib/arch/riscv/Instruction_decode.cc`` is primarily associated with converting the provided Capstone instruction metadata into the appropriate SimEng ``instruction`` format. Additionally, an instruction's type identifiers are set here through operand usage and opcode values. For the RISC-V architecture model, the following identifiers are defined in ``src/include/simeng/arch/riscv/Instruction.hh``: -- ``isStore_``, is a store operation. -- ``isLoad_``, is a load operation. -- ``isBranch_``, is a branch operation. -- ``isMultiply_``, is a multiply operation. -- ``isDivide_``, is a divide operation. -- ``isShift_``, is a shift operation. -- ``isAtomic_``, is an atomic operation. -- ``isLogical_``, is a logical operation e.g bitwise and. -- ``isCompare_``, is a compare operation. +- ``isStore``, is a store operation. +- ``isLoad``, is a load operation. +- ``isBranch``, is a branch operation. +- ``isMultiply``, is a multiply operation. +- ``isDivide``, is a divide operation. +- ``isShift``, is a shift operation. +- ``isAtomic``, is an atomic operation. +- ``isLogical``, is a logical operation e.g bitwise and. +- ``isCompare``, is a compare operation. +- ``isFloat``, is a floating point operation. +- ``isConvert``, is a floating point to integer conversion operation. .. _riscv-instruction-groups: -Instruction Groups -****************** +Instruction Groups/Opcodes +************************** Through a combination of the above identifiers, an instruction can be allocated an :ref:`instruction group `. The instruction groups available to the RISC-V ISA are detailed below: .. image:: ../../../assets/instruction_groups_RISCV.png @@ -35,7 +37,10 @@ The above diagram follows the same structure as :ref:`AArch64 instruction groups This hierarchy-based naming convention has been chosen to provide the user with greater control over the number of instructions grouped under one name, whilst also remaining intuitive. A variety of combinations/instruction scopes can be defined through this method and only uses a small set of easily interpreted operation descriptions. -If the supplied instruction groups don't provide a small enough scope, a Capstone opcode can be used instead (found in ``SimEng/build/_deps/capstone-lib-src/arch/RISCV/RISCVGenInstrInfo.inc``) with the format ``~{CAPSTONE_OPCODE}``. + .. Note:: + INT_SIMPLE_CVT and FLOAT_SIMPLE_SHIFT are both invalid instruction groups + +If the supplied instruction groups don't provide a small enough scope, a numerical Capstone opcode can be used instead (found in ``SimEng/build/_deps/capstone-lib-src/arch/RISCV/RISCVGenInstrInfo.inc``). .. _riscv-adding-instructions: @@ -52,10 +57,12 @@ Adding execution behaviour The process for adding a new instruction is very similar to that of :ref:`AArch64 `, by adding a new, uniquely identified entry to ``src/lib/arch/riscv/Instruction_execute.cc``. +Compressed instructions are treated in the same way as pseudoinstructions. By design they can be expanded to full instructions from the base and floating point extensions. A new case should be added to the switch statement in ``InstructionMetadata`` to perform the relevant adjustment to the metadata. The instruction can then be allowed to flow through the pipeline - no new execute case is necessary. + Zero registers ************** -RISC-V provides a zero register ``RO`` which is always read as 0. 
This implementation mirrors that behaviour, and will automatically populate the relevant ``operands`` entry with a 0-value ``RegisterValue``. +RISC-V provides a zero register ``x0`` which is always read as 0. This implementation mirrors that behaviour, and will automatically populate the relevant ``sourceValues_`` entry with a 0-value ``RegisterValue``. For instructions that write to the zero registers, the result is discarded. The number of available ``results`` entries is reduced accordingly. @@ -64,7 +71,7 @@ Loads and stores In addition to an execution behaviour, memory instructions also require a new entry in the address generation behaviour table found in ``src/lib/arch/riscv/Instruction_address.cc``. These entries are responsible for describing the method used to generate the addresses that these instructions will read from or write to. -Address generation is expected to generate one or more instances of ``MemoryAddressTarget``, containing an address and the number of bytes to access. The same variables as described in the :ref:`AArch64 documentation ` (``operands``, ``metadata``) are available to use to generate these addresses. +Address generation is expected to generate one or more instances of ``MemoryAccessTarget``, containing an address and the number of bytes to access. The same variables as described in the :ref:`AArch64 documentation ` (``sourceValues_``, ``metadata``) are available to use to generate these addresses. Once the addresses have been generated, they should be supplied in a vector to the ``setMemoryAddresses`` helper function. @@ -78,3 +85,17 @@ An example of this would be the pseudoinstruction ``not rd, rs``. This is implem This must be fixed in the ``InstructionMetadata`` constructor. A new entry should be added to the switch statement and the pseudoinstruction mnemonic checked. The correct set of operands can then be set. A couple of helper functions are used for common operand fixes. To ensure all pseudoinstructions are accounted for, the table in chapter 25 of the `RISC-V Unprivileged specification `_ should be checked. It is recommended to implement all pseudoinstructions for all currently implemented instructions. + +Rounding Modes +************** + +RISC-V floating point instructions can use either static or dynamic rounding modes. The former is embedded as 3 bits within the instruction encoding, and the latter is held in 3 bits of the ``fcsr`` system register. + +To enforce static rounding modes, the function ``setStaticRoundingModeThen`` is used. This takes the execution logic of the instruction as a parameter in the form of a lambda function. ``setStaticRoundingModeThen`` extracts the rounding mode from the raw instruction encoding, as Capstone does not currently provide this functionality. It then changes the C++ ``fenv`` rounding mode before calling the lambda to perform the execution logic within this new environment. Before returning execution to the switch statement, it reverts the ``fenv`` rounding mode to its initial state to preserve the dynamic rounding mode. + +Updating the dynamic rounding mode can only be performed by a change to the ``fcsr`` system register. This is done using a Zicsr instruction and must happen atomically. To enforce this functionality, the relevant instruction causes a non-fatal exception. This forces all instructions earlier in program order to be committed and all instructions later to be flushed from the pipeline.
This allows the ``fenv`` rounding mode to be changed while the pipeline is sterile, thus preventing incorrect rounding of speculatively executed instructions. + +Zicsr +***** + +The Zicsr extension is required by the F and D extensions; however, this is left with dummy implementations for this release (0.9.6). Therefore, the ``fcsr`` register is not updated based on the result of operations or the changing of the rounding mode. Thus far, this has not affected our ability to run typical high performance computing applications and miniapps. diff --git a/docs/sphinx/developer/components/branchPred.rst b/docs/sphinx/developer/components/branchPred.rst index b49f58c1c1..6a03c85129 100644 --- a/docs/sphinx/developer/components/branchPred.rst +++ b/docs/sphinx/developer/components/branchPred.rst @@ -3,7 +3,7 @@ Branch prediction SimEng's fetch unit is supplied with an instance of the abstract ``BranchPredictor`` class to enable speculative execution. -Access to the ``BranchPredictor`` is supported through the ``predict``, ``update``, and ``flush`` functions. ``predict`` provides a branch prediction, both target and direction, ``update`` updates an instructions' prediction, and ``flush`` provides optional algorithm specific flushing functionality. +Access to the ``BranchPredictor`` is supported through the ``predict``, ``update``, and ``flush`` functions. ``predict`` provides a branch prediction, both target and direction, for a branch instruction. ``update`` updates the branch predictor's prediction mechanism on the actual outcome of a branch. ``flush`` provides algorithm specific flushing functionality. The ``predict`` function is passed an instruction address, branch type, and a possible known target. The branch type argument currently supports the following types: @@ -17,13 +17,15 @@ The usage of these parameters within a branch predictor's ``predict`` function i The ``update`` function is passed the branch outcome, the instruction address, and the branch type. From this information, any algorithms or branch structures may be updated. +The state of the branch predictor when ``predict`` is called on a branch is stored in the ``ftq`` to be used by the ``update`` function. For instance, the perceptron predictor stores the globalHistory and confidence for each prediction, but future predictors may store alternative state. The ``ftq`` is a queue that has an entry for each in-flight branch. A single entry is added to the back of the ftq on ``predict``, and a single entry is removed from the front of the queue on ``update`` and from the back of the queue on ``flush``. + Generic Predictor ----------------- The algorithm(s) held within a ``BranchPredictor`` class instance can be model-specific, however, SimEng provides a ``GenericPredictor`` which contains the following logic. Global History - For indexing relevant prediction structures, a global history can be utilised. The global history value uses n-bits to store the n most recent branch direction outcomes, with the left-most bit being the oldest. + For indexing relevant prediction structures, a global history can be utilised. The global history value stores the n most recent branch direction outcomes in an unsigned integer, with the least-significant bit being the most recent branch direction. The global history is speculatively updated on ``predict``, and is corrected if needed on ``update`` and ``flush``. 
To facilitate this speculative updating, and rolling-back on correction, for a global history of n the branch predictor keeps track of the 2n most recent branch outcomes. Valid values for Global History are 1-32. Branch Target Buffer (BTB) For each entry, the BTB stores the most recent target along with an n-bit saturating counter for an associated direction. The indexing of this structure uses the lower bits of an instruction address XOR'ed with the current global branch history value. @@ -31,7 +33,24 @@ Branch Target Buffer (BTB) If the supplied branch type is ``Unconditional``, then the predicted direction is overridden to be taken. If the supplied branch type is ``Conditional`` and the predicted direction is not taken, then the predicted target is overridden to be the next sequential instruction. Return Address Stack (RAS) - Identified through the supplied branch type, Return instructions pop values off of the RAS to get their branch target whilst Branch-and-Link instructions push values onto the RAS, for use by a proceeding Return instruction. + Identified through the supplied branch type, Return instructions pop values off of the RAS to get their branch target whilst Branch-and-Link instructions push values onto the RAS, for later use by the Branch-and-Link instruction's corresponding Return instruction. Static Prediction Based on the chosen static prediction method of "always taken" or "always not taken", the n-bit saturating counter value in the initial entries of the BTB structure are filled with the weakest variant of taken or not-taken respectively. + +Perceptron Predictor +-------------------- +The ``PerceptronPredictor`` has the same overall structure as the ``GenericPredictor`` but replaces the saturating counter as a means for direction prediction with a perceptron. The ``PerceptronPredictor`` contains the following logic. + +Global History + For indexing relevant prediction structures, a global history can be utilised. The global history value stores the n most recent branch direction outcomes in an unsigned integer, with the least-significant bit being the most recent branch direction. The global history is speculatively updated on ``predict``, and is corrected if needed on ``update`` and ``flush``. To facilitate this speculative updating, and rolling-back on correction, for a global history of n the branch predictor keeps track of the 2n most recent branch outcomes. Valid values for Global History are 1-32. + +Branch Target Buffer (BTB) + For each entry, the BTB stores the most recent target along with a perceptron for an associated direction. The indexing of this structure uses the lower, non-zero bits of an instruction address XOR'ed with the current global branch history value. + + The direction prediction is obtained from the perceptron by taking its dot-product with the global history. The prediction is not taken if this is negative, or taken otherwise. The perceptron is updated when its prediction is wrong or when the magnitude of the dot-product is below a pre-determined threshold (i.e., the confidence of the prediction is low). To update, each ith weight of the perceptron is incremented if the actual outcome of the branch is the same as the ith bit of ``globalHistory_``, and decremented otherwise. + + If the supplied branch type is ``Unconditional``, then the predicted direction is overridden to be taken. 
If the supplied branch type is ``Conditional`` and the predicted direction is not taken, then the predicted target is overridden to be the next sequential instruction. + +Return Address Stack (RAS) + Identified through the supplied branch type, Return instructions pop values off of the RAS to get their branch target whilst Branch-and-Link instructions push values onto the RAS, for later use by the Branch-and-Link instruction's corresponding Return instruction. \ No newline at end of file diff --git a/docs/sphinx/developer/components/coreinstance.rst b/docs/sphinx/developer/components/coreinstance.rst index 02f69a369a..8b9e99a449 100644 --- a/docs/sphinx/developer/components/coreinstance.rst +++ b/docs/sphinx/developer/components/coreinstance.rst @@ -9,7 +9,7 @@ Process the config file Either the passed configuration file path, or default configuration string, is used to generate the model configuration class. All subsequent parameterised instantiations of simulation objects utilise this configuration class. Create the image process - From the passed workload path, or default set of instructions, a process image is created. A region of host memory is populated with workload data (e.g. instructions), a region for the HEAP, and an initial stack frame. References to it are then passed between various simulation objects to serve as the underlying process memory space. + From the passed workload path, or default binary, a process image is created. A region of host memory is populated with workload data (e.g. instructions), a region for the HEAP, and an initial stack frame. References to it are then passed between various simulation objects to serve as the underlying process memory space. Construct on-chip cache interfaces Based on the supplied configuration options, the on-chip cache interfaces are constructed. These interfaces sit on top of a reference to the process memory space constructed prior. Currently, only L1 instruction and data caches are supported and the interfaces are defined under the :ref:`L1-Data-Memory ` and :ref:`L1-Instruction-Memory ` config options. diff --git a/docs/sphinx/developer/models/index.rst b/docs/sphinx/developer/models/index.rst index b2ae090e14..51a5ba7cbf 100644 --- a/docs/sphinx/developer/models/index.rst +++ b/docs/sphinx/developer/models/index.rst @@ -55,3 +55,159 @@ The current existing processors have supplied configuration files: - `ThunderX2 `_ - `A64FX `_ - `M1 Firestorm `_ + + +Adding model configuration options +---------------------------------- + +SimEng utilises a scheme which defines value expectations on mandatory and optional configuration options. The value expectations consist of: + +- A default value to be used when creating an in-code default config file +- A boolean to describe whether the config option is mandatory or optional +- Either a set of values or an upper and lower bound which the supplied config option must conform to + +The scheme is implemented through the ``ExpectationNode`` class which creates a tree structure where each node holds the above expectations. The tree structure should map one-to-one to the structure of the passed config file. For example: + +With the YAML: + +.. code-block:: text + + Core: + ISA: AArch64 + Simulation-Mode: outoforder + +The ``ExpectationNode`` tree structure would be: + +.. 
image:: ../../assets/expectations.png + :width: 350 + :alt: ExpectationNode structure example + +The tree structure formed with ``ExpectationNode`` instances is constructed through the ``addChild(...)`` utility of the ``ExpectationNode`` class. Starting at a root node, ``addChild(...)`` is used to create various branches of the tree which map to the various config options as described in the above diagram. For those nodes which hold config values, either ``setValueBounds(...)`` or ``setValueSet(...)`` is used to set the config value restrictions. + +Creating a new expectation +************************** + +Many examples of utilising the ``ExpectationNode`` class to set the expectation of a specific config value exist in ``src/lib/config/ModelConfig.cc``. To elaborate on these examples, below is a simplified outline of how you would create the expectations for the above diagram. As with ``src/lib/config/ModelConfig.cc``, ``expectations_`` is treated as the blank root node from which we construct the tree. + +First, we create the expectation of the parent config key ``Core``. The ``addChild(...)`` utility takes a new ``ExpectationNode`` instance which is appended to the vector of children in the parent ``ExpectationNode`` instance; thus forming a new branch of the eventual tree structure. To create a new ``ExpectationNode`` instance, the ``ExpectationNode::createExpectation(...)`` utility can be used. There are two variants of the function, namely: + +.. code-block:: text + + 1. + createExpectation(std::string key, bool optional = false) + +Which is used for those nodes whose corresponding config option has no value (only a key and children). + +.. code-block:: text + + 2. + template + createExpectation(T defaultValue, std::string key, bool optional = false) + +Which is used for those nodes whose corresponding config option does have a value. The function is templated to support different config value types (e.g. T = string, integer, floating point, etc). + +For the expectation on the parent config key ``Core``, given it holds no value, we don't need to set any value expectations. Therefore, the first ``createExpectation`` can be used. + +.. code-block:: text + + expectations_.addChild(ExpectationNode::createExpectation("Core")); + +With both the ``ISA`` and ``Simulation-Mode`` config options which do have values, the second ``createExpectation`` must be used. First, we create the ``ExpectationNode`` instances for them and add them to the children of the new "Core" ``ExpectationNode`` instance. For simplicity of the example, we have set the default values of the ``ISA`` and ``Simulation-Mode`` config options to ``AArch64`` and ``emulation`` respectively. + +.. code-block:: text + + expectations_["Core"].addChild( + ExpectationNode::createExpectation("AArch64", "ISA")); + + expectations_["Core"].addChild( + ExpectationNode::createExpectation("emulation", + "Simulation-Mode")); + +Then we set the value restrictions as described in the above diagram: + +.. code-block:: text + + expectations_["Core"]["ISA"].setValueSet( + std::vector{"AArch64", "rv64"}); + + expectations_["Core"]["Simulation-Mode"].setValueSet( + std::vector{"emulation", "inorderpipelined", "outoforder"}); + +Wildcard nodes +************** + +To reduce code duplication, the concept of mapping one set of expectations to many config values has been implemented. 
Denoted as a ``wildcard`` node, these nodes can be used to specify the value expectations on a config option which has multiple instances of the same value format/structure. For example, in the "Latencies" config option, many latencies can be defined as seen below: + +.. code-block:: text + + Latencies: + 0: + Instruction-Groups: + - INT_SIMPLE_ARTH + - INT_SIMPLE_LOGICAL + Execution-Latency: 2 + Execution-Throughput: 2 + 1: + Instruction-Groups: + - INT_SIMPLE_ARTH_NOSHIFT + - INT_SIMPLE_LOGICAL_NOSHIFT + Execution-Latency: 1 + Execution-Throughput: 1 + 2: + Instruction-Groups: + - INT_MUL + Execution-Latency: 5 + Execution-Throughput: 1 + +Taking the "Execution-Latency" option as an example, rather than setting the expectation for an unknown number of occurrences, we can instead write: + +.. code-block:: text + + expectations_["Latencies"].addChild( + ExpectationNode::createExpectation(0, wildcard)); + + expectations_["Latencies"][wildcard].addChild( + ExpectationNode::createExpectation(1, "Execution-Latency")); + expectations_["Latencies"][wildcard]["Execution-Latency"] + .setValueBounds(1, UINT16_MAX); + +When validating all config options under the "Latencies" key, the ``wildcard`` node created will be used for each option. + +Sequence Nodes +************** + +Similar to the motivation behind the ``wildcard`` node usage, a value expectation can be set to be applied to all values in a YAML sequence/list/array/etc. By calling ``.setAsSequence()`` on an instance of ``ExpectationNode``, its value expectations will be applied to a YAML sequence of values. Below is an example of when you might use this. + +.. code-block:: text + + Ports: + 0: + Portname: Port 0 + Instruction-Group-Support: + - INT_SIMPLE + - INT_MUL + - FP + 1: + Portname: Port 1 + Instruction-Group-Support: + - INT + - FP + +To apply a value expectation for all values in the "Instruction-Group-Support" options, we'd write: + +.. code-block:: text + + expectations_["Ports"][wildcard].addChild( + ExpectationNode::createExpectation( + "ALL", "Instruction-Group-Support", true)); + + expectations_["Ports"][wildcard]["Instruction-Group-Support"].setValueSet( + {vector of ISA-specific instruction groups}); + expectations_["Ports"][wildcard]["Instruction-Group-Support"].setAsSequence(); + +We once again use a ``wildcard`` node here as all options under the "Ports" config key are repeated an unknown number of times. + +Additional options +****************** + +If any form of config value manipulation/verification is required, for example ensuring each reservation station port has an associated execution unit, this can be done in ``postValidation()`` within ``src/lib/config/ModelConfig.cc``. diff --git a/docs/sphinx/developer/test/index.rst b/docs/sphinx/developer/test/index.rst index 7f11a80504..0c094e6a7c 100644 --- a/docs/sphinx/developer/test/index.rst +++ b/docs/sphinx/developer/test/index.rst @@ -65,6 +65,7 @@ RISC-V regression suite The RISC-V regression test suite is located under the ``test/regression/riscv/`` directory. Within this can be found the following test cases: +- Exception: Test non-supervisor call based exceptions. - instructions/: This directory holds tests for the functionality of each implemented instruction and their pseudoinstructions. - LoadStoreQueue: Test the correct implementation of load and store instructions concerning their interaction with the LSQ. - SmokeTest: Trivial ISA related tests. 
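As a rough sketch of the style used throughout the instructions/ tests, a new test typically assembles a short RV64 snippet with the in-built LLVM assembler, runs it to completion on the simulated core, and then checks the resulting architectural state. The fixture, macro, and helper names below (``InstArithmetic``, ``RUN_RISCV``, ``getGeneralRegister``) are illustrative assumptions based on the shape of the existing suite rather than an exact reproduction of it:

.. code-block:: text

   // Illustrative only: fixture, macro, and register-index conventions are
   // assumptions, not an exact copy of the real test helpers.
   TEST_P(InstArithmetic, addi) {
     RUN_RISCV(R"(
       addi t0, zero, 40
       addi t1, t0, 2
     )");
     // t0 is x5 and t1 is x6; check their final architectural values.
     EXPECT_EQ(getGeneralRegister<uint64_t>(5), 40u);
     EXPECT_EQ(getGeneralRegister<uint64_t>(6), 42u);
   }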
diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst index c697b0e95b..55d09d38ac 100644 --- a/docs/sphinx/index.rst +++ b/docs/sphinx/index.rst @@ -44,13 +44,13 @@ SimEng is a framework for building modern, cycle-accurate processor simulators. - Accurate, aiming for simulated cycle times being within 5-10% of real hardware - Open source, with a permissive license to enable collaboration across academia and industry -SimEng places an emphasis on performance and ease of use, whilst maintaining a clean, modern, simple and well-documented code base. For example, the current out-of-order (OoO) model is implemented in around 10,000 lines of simple C++, with another 17,000 and 3,000 lines or so implementing the specifics of the AArch64 and RISC-V ISAs respectively, and around 27,000 lines of code in the accompanying test suite. SimEng should be simple to read and understand, making it ideal to modify to your requirements and include it in your projects. +SimEng places an emphasis on performance and ease of use, whilst maintaining a clean, modern, simple and well-documented code base. For example, the current out-of-order (OoO) model is implemented in around 11,000 lines of simple C++, with another 18,000 and 5,000 lines or so implementing the specifics of the AArch64 and RISC-V ISAs respectively, and around 41,000 lines of code in the accompanying test suite. SimEng should be simple to read and understand, making it ideal to modify to your requirements and include it in your projects. Features -------- -Currently, SimEng targets the Armv9.2-a ISA with support for the SVE, SVE2, and SME extensions as well as RISC-V rv64ima. SimEng has the ability to model up to out-of-order, superscalar, single-core processors, and to emulate a subset of Linux system-calls. It supports statically compiled C and Fortran binaries that run on real hardware, with additional support for single-threaded OpenMP binaries too. Internally, SimEng currently models memory as an infinite L1 cache, i.e. it assumes that all loads and stores hit the L1 cache. However, we have a tested integration with the `Structural Simulation Toolkit `_ (SST) allowing for a full memory model to be simulated; more information can be found in the :doc:`SST Integration section `. +Currently, SimEng targets the Armv9.4-a ISA with support for the SVE, SVE2, SME, and SME2 extensions as well as RISC-V rv64imafdc. SimEng has the ability to model up to out-of-order, superscalar, single-core processors, and to emulate a subset of Linux system-calls. It supports statically compiled C and Fortran binaries that run on real hardware, with additional support for single-threaded OpenMP binaries too. Internally, SimEng currently models memory as an infinite L1 cache, i.e. it assumes that all loads and stores hit the L1 cache. However, we have a tested integration with the `Structural Simulation Toolkit `_ (SST) allowing for a full memory model to be simulated; more information can be found in the :doc:`SST Integration section `. The main component provided by the simulator is a discrete processor core model, shown in diagrammatic form below. This model accepts a clock signal and supports a memory access interface. A single YAML format configuration file can be passed to the simulation to specify models of existing microarchitectures, such as Marvell's ThunderX2 or Fujitsu's A64fx, or to model hypothetical core designs. 
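For a flavour of what such a configuration file contains, the sketch below shows the opening of a hypothetical model description (illustrative values only; the full set of options is described in the Configuring SimEng documentation, and the supplied vendor files set many more of them):

.. code-block:: text

   Core:
     ISA: AArch64
     Simulation-Mode: outoforder
     Clock-Frequency-GHz: 1.8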
@@ -70,7 +70,9 @@ SimEng was first presented by `Professor Simon McIntosh-Smith `_ - ModSim 2023 - :download:`Leveraging Arm's Scalable Matrix Extension to Accelerate Matrix Multiplication Kernels ` +- PMBS 2022 - `An Initial Evaluation of Arm's Scalable Matrix Extension `_ - ModSim 2022 - :download:`A design space exploration for optimal vector unit composition ` - :download:`Modelling Advanced Arm-based CPUs with SimEng ` @@ -78,7 +80,7 @@ Additionally, other works concerning SimEng and its use can be found below: Release ------- -This is SimEng's seventh release, and should be considered beta level software (version 0.9.5). We expect you to find issues, primarily in unimplemented instructions or unimplemented system calls. Please let us know when you hit these, either by submitting a pull request (PR), or by filing an issue on the Github repo. You can find the all the code and associated test suites for SimEng in the `GitHub repository `_. The file `RELEASE_NOTES.txt `_, found in the root of the project, explains the status of the project and includes other relevant information from the SimEng development team. +This is SimEng's eighth release, and should be considered beta level software (version 0.9.6). We expect you to find issues, primarily in unimplemented instructions or unimplemented system calls. Please let us know when you hit these, either by submitting a pull request (PR), or by filing an issue on the Github repo. You can find all the code and associated test suites for SimEng in the `GitHub repository `_. The file `RELEASE_NOTES.txt `_, found in the root of the project, explains the status of the project and includes other relevant information from the SimEng development team. SimEng is released under the same license as LLVM, the permissive `Apache 2.0 `_ license. We are passionate about enabling experimentation with computer architectures, and want users and developers in academic and industry to have complete freedom to use SimEng anyway they wish, including using it in commercial settings. @@ -89,7 +91,7 @@ External project usage While we have tried to minimise SimEng's dependencies to keep it as simple as possible, it does make use of a small number of libraries and frameworks to provide crucial capabilities: - `Capstone disassembly engine `_ - Provides instruction decoding for AArch64, RISC-V, x86 and other important ISAs -- `Yaml-cpp `_ - Parsing YAML configuration files +- `Rapid YAML `_ - Parsing YAML configuration files - `GoogleTest `_ - Framework for the test suites - `LLVM `_ - Generation of binaries for use in the regression test suite @@ -105,8 +107,9 @@ Current development team: - Jack Jones (lead developer) - Finn Wilkinson -- Rahat Muneeb - Dan Weaver +- Alex Cockrean +- Joseph Moore Original SimEng design and implementation: @@ -117,6 +120,7 @@ Additional Contributors: - Ainsley Rutterford - Andrei Poenaru +- Rahat Muneeb - Harry Waugh - Mutalib Mohammed - Seunghun Lee diff --git a/docs/sphinx/user/building_simeng.rst b/docs/sphinx/user/building_simeng.rst index 1a5cd41123..ed701ada40 100644 --- a/docs/sphinx/user/building_simeng.rst +++ b/docs/sphinx/user/building_simeng.rst @@ -41,7 +41,7 @@ With this configuration, the build files will be generated in a directory called More information about the LLVM_DIR value can be found `here `_. .. Note:: - LLVM versions greater than 14 or less than 8 are not supported. We'd recommend using LLVM 14.0.5 where possible as this has been verified by us to work correctly. 
+ LLVM versions greater than 18.2 or less than 8 are not supported. We'd recommend using LLVM 18.1.8 where possible as this has been verified by us to work correctly for the most recent version of SimEng. LLVM versions less than 14 will likely not support AArch64 SVE2, SME, or SME2 instructions. b. Two additional flags are available when building SimEng. Firstly is ``-DSIMENG_SANITIZE={ON, OFF}`` which adds a selection of sanitisation compilation flags (primarily used during the development of the framework). Secondly is ``-SIMENG_OPTIMIZE={ON, OFF}`` which attempts to optimise the framework's compilation for the host machine through a set of compiler flags and options. diff --git a/docs/sphinx/user/configuring_simeng.rst b/docs/sphinx/user/configuring_simeng.rst index 7061217c2b..9a49893375 100644 --- a/docs/sphinx/user/configuring_simeng.rst +++ b/docs/sphinx/user/configuring_simeng.rst @@ -35,10 +35,10 @@ ISA Simulation-Mode The core archetype to use, the options are ``emulation``, ``inorderpipelined``, and ``outoforder``. -Clock-Frequency +Clock-Frequency-GHz The clock frequency, in GHz, of the processor being modelled. -Timer Frequency +Timer-Frequency-MHz This dictates the frequency in MHz that the CPU's internal counter timer is updated. i.e. For models based on an Arm ISA, this dictates how often the Virtual Counter Timer system register is updated to the number of cycles completed. This value is then accessible to programmers through ``mrs x0 CNTVCT_el0``. @@ -52,6 +52,9 @@ Vector-Length (Only in use when ISA is ``AArch64``) Streaming-Vector-Length (Only in use when ISA is ``AArch64``) The vector length used by instructions belonging to Arm's Scalable Matrix Extension. Although the architecturally valid vector lengths are powers of 2 between 128 and 2048 inclusive, the supported vector lengths are those between 128 and 2048 in increments of 128. +Compressed (Only in use when ISA is ``rv64``) + Enables the RISC-V compressed extension. If set to false and compressed instructions are supplied, a misaligned program counter exception is usually thrown. + Fetch ----- @@ -96,9 +99,12 @@ AArch64 - Conditional-Count The number of physical status/flag/conditional-code registers. -- Matrix-Count (Optional) +- SME-Matrix-Count (Optional) The number of physical ``za`` Arm SME registers. +- SME-Lookup-Table-Count (Optional) + The number of physical SME Lookup Table registers (``zt0``). + RISC-V - GeneralPurpose-Count @@ -137,6 +143,13 @@ Load Store The size of the store queue within the load/store queue unit. +Port-Allocator +-------------- + +This section allows a user to select which Port Allocator to use. The available options are: + +Type + The specific allocator algorithm to use. The current options are ``Balanced``, ``A64FX``, and ``M1``. The former implements a round-robin style algorithm, allocating instructions to compatible ports evenly. The latter two implement the port allocation algorithms found in the respective hardware as per their names. Branch-Predictor ---------------- @@ -145,20 +158,23 @@ The Branch-Prediction section contains those options to parameterise the branch The current options include: +Type + The type of branch predictor that is used; the options are ``Generic`` and ``Perceptron``. Both types of predictor use a branch target buffer with each entry containing a direction prediction mechanism and a target address. The direction predictor used in ``Generic`` is a saturating counter, and in ``Perceptron`` it is a perceptron.
+ BTB-Tag-Bits - The number of bits used to denote an entry in the Branch Target Buffer (BTB). For example, a ``bits`` value of 12 could denote 4096 entries with the calculation 1 << ``bits``. + The number of bits used to index the entries in the Branch Target Buffer (BTB). The number of entries in the BTB is obtained from the calculation: 1 << ``bits``. For example, a ``bits`` value of 12 would result in a BTB with 4096 entries. Saturating-Count-Bits - The number of bits used in the saturating counter value. + Only needed for a ``Generic`` predictor. The number of bits used in the saturating counter value. Global-History-Length - The number of bits used to record the global history of branch directions. Each bit represents one branch direction. + The number of bits used to record the global history of branch directions. Each bit represents one branch direction. For ``PerceptronPredictor``, this dictates the size of the perceptrons (with each perceptron having Global-History-Length + 1 weights). RAS-entries The number of entries in the Return Address Stack (RAS). Fallback-Static-Predictor - The static predictor used when no dynamic prediction is available. The options are either ``"Always-Taken"`` or ``"Always-Not-Taken"``. + Only needed for a ``Generic`` predictor. The static predictor used when no dynamic prediction is available. The options are either ``"Always-Taken"`` or ``"Always-Not-Taken"``. .. _l1dcnf: @@ -215,7 +231,7 @@ Permitted-Stores-Per-Cycle Ports ----- -Within this section, execution unit port definitions are constructed. Each port is defined with a name and a set of instruction groups it supports. The instruction groups are architecture-dependent, but, the available AArch64 instruction groups can be found :ref:`here ` and for RISC-V, can be found :ref:`here `. +Within this section, execution unit port definitions are constructed. Each port is defined with a name and a set of instruction groups/opcodes it supports. The instruction groups/opcodes are architecture-dependent, but, the available AArch64 instruction groups/opcodes can be found :ref:`here ` and for RISC-V, can be found :ref:`here `. To define a port, the following structure must be adhered to: @@ -223,17 +239,25 @@ To define a port, the following structure must be adhered to: 0: Portname: - Instruction-Support: + Instruction-Group-Support: - - ... - + Instruction-Opcode-Support: + - + - ... + - ... N-1: - Portname: - Instruction-Support: - - - - ... - - + Portname: + Instruction-Group-Support: + - + - ... + - + Instruction-Opcode-Support: + - + - ... + - With N as the number of execution ports. @@ -297,7 +321,7 @@ With N as the number of execution units. The number of execution units should be Latencies --------- -The execution latency and throughput can be configured under the Latencies section. A latency/throughput pair can be defined for a set of instruction groups, the groups available are the same as the set discussed in the Ports section. +The execution latency and throughput can be configured under the Latencies section. A latency/throughput pair can be defined for a set of instruction groups/opcodes, the groups/opcodes available are the same as the set discussed in the Ports section. The execution latency defines the total number of cycles an instruction will spend in an execution unit. The throughput is how many cycles an instruction will block another instruction entering the execution unit. In non-pipelined execution units, the throughput is equal to the latency. 
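As an illustration, the sketch below defines a single latency mapping that gives floating-point instructions a four-cycle execution latency while still accepting a new instruction every cycle (the values and the ``FP`` group name are purely illustrative):

.. code-block:: text

   Latencies:
     0:
       Instruction-Groups:
         - FP
       Execution-Latency: 4
       Execution-Throughput: 1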
@@ -310,16 +334,24 @@ The following structure must be adhered to when defining group latencies: - - ... - + Instruction-Opcodes: + - + - ... + - Execution-Latency: Execution-Throughput: ... N-1: - Instruction-Groups: - - - - ... - - - Execution-Latency: - Execution-Throughput: + Instruction-Groups: + - + - ... + - + Instruction-Opcodes: + - + - ... + - + Execution-Latency: + Execution-Throughput: With N as the number of user-defined latency mappings. The default latencies, both execution and throughput, for those instruction groups not covered are 1. @@ -333,9 +365,20 @@ CPU Info These fields are currently only used to generate a replica of the required Special Files directory structure. Generate-Special-Dir - Values are either "True" or "False". - Dictates whether or not SimEng should generate the SpecialFiles directory tree at runtime. - The alternative to this would be to copy in the required SpecialFiles by hand. + Values are either `True` or `False`. + Dictates whether or not SimEng should generate the Special-Files directory tree at runtime. + If your code requires Special-Files but you wish to use your own / existing files from a real system, you will need to set this option to `False`. + The files which are currently generated / supported in SimEng are: + + - `/proc/cpuinfo` + - `/proc/stat` + - `/sys/devices/system/cpu/online` + - `/sys/devices/system/cpu/cpu{0..CoreCount}/topology/core_id` + - `/sys/devices/system/cpu/cpu{0..CoreCount}/topology/physical_package_id` + +Special-File-Dir-Path + Represented as a String; this is the **absolute path** to the root directory where the Special-Files will be generated *OR* where existing Special-Files are located. + This is optional, and defaults to `SIMENG_BUILD_DIRECTORY/specialFiles`. The root directory must already exist. Core-Count Defines the total number of Physical cores (Not including threads). diff --git a/docs/sphinx/user/running_simeng.rst b/docs/sphinx/user/running_simeng.rst index 5d38ebe7cd..0a97fc50fe 100644 --- a/docs/sphinx/user/running_simeng.rst +++ b/docs/sphinx/user/running_simeng.rst @@ -7,9 +7,7 @@ SimEng uses a configuration file and a program binary to produce a cycle-accurat /bin/simeng -If no arguments are passed to SimEng, default options are used. The default configuration file is tuned to a ThunderX2 processor. The default program binary is defined in ``SimEng/src/include/simeng/CoreInstance.hh`` under the ``hex[]`` array which contains a set of raw instructions in a hexadecimal format. - -.. Note:: Paths to binaries must be in full, and not relative. +If no arguments are passed to SimEng, default options are used. The default configuration file is tuned to a ThunderX2 processor. The default program is a binary compiled to AArch64 found at ``SimEng/SimEngDefaultProgram``. This prints a welcome message to the console. Whilst a configuration file can be specified without a program (will use default program), a specified program must be accompanied by a configuration file. @@ -28,7 +26,8 @@ Exit Clause The reason why the simulation has halted. Most commonly this is due to the invoking of the ``exit()`` system call by the workload under simulation. Statistics - A selection of simulation statistics describing the emergent simulated PMU-style hardware events. + A selection of simulation statistics describing the emergent simulated PMU-style hardware events. With respect to branch statistics, the misprediction rate +   is calculated as branches mispredicted / branches retired.
All non-workload outputs from SimEng are prefixed with a tag of the format ``[SimEng:Object]`` (e.g. ``[SimEng:ExceptionHandler]``). If the output came from the root of the framework, the ``Object`` field is omitted. diff --git a/src/include/simeng/BranchPredictor.hh b/src/include/simeng/BranchPredictor.hh deleted file mode 100644 index 8e2ddd0797..0000000000 --- a/src/include/simeng/BranchPredictor.hh +++ /dev/null @@ -1,65 +0,0 @@ -#pragma once - -#include -#include - -namespace simeng { - -/** The types of branches recognised. */ -enum class BranchType { - Conditional = 0, - LoopClosing, - Return, - SubroutineCall, - Unconditional, - Unknown -}; - -/** A branch result prediction for an instruction. */ -struct BranchPrediction { - /** Whether the branch will be taken. */ - bool taken; - - /** The branch instruction's target address. If `taken = false`, the value - * will be ignored. */ - uint64_t target; - - /** Check for equality of two branch predictions . */ - bool operator==(const BranchPrediction& other) { - if ((taken == other.taken) && (target == other.target)) - return true; - else - return false; - } - - /** Check for inequality of two branch predictions . */ - bool operator!=(const BranchPrediction& other) { - if ((taken != other.taken) || (target != other.target)) - return true; - else - return false; - } -}; - -/** An abstract branch predictor interface. */ -class BranchPredictor { - public: - virtual ~BranchPredictor(){}; - - /** Generate a branch prediction for the specified instruction address with a - * branch type and possible known branch offset. */ - virtual BranchPrediction predict(uint64_t address, BranchType type, - int64_t knownOffset) = 0; - - /** Provide branch results to update the prediction model for the specified - * instruction address. */ - virtual void update(uint64_t address, bool taken, uint64_t targetAddress, - BranchType type) = 0; - - /** Provides flushing behaviour for the implemented branch prediction schemes - * via the instruction address. - */ - virtual void flush(uint64_t address) = 0; -}; - -} // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/Core.hh b/src/include/simeng/Core.hh index e7592c234d..3720fa51da 100644 --- a/src/include/simeng/Core.hh +++ b/src/include/simeng/Core.hh @@ -4,17 +4,33 @@ #include #include +#include "simeng/ArchitecturalRegisterFileSet.hh" +#include "simeng/arch/ProcessStateChange.hh" +#include "simeng/config/SimInfo.hh" #include "simeng/control.hh" +#include "simeng/memory/MemoryInterface.hh" #include "simeng/trace.hh" -#include "yaml-cpp/yaml.h" namespace simeng { -class ArchitecturalRegisterFileSet; +namespace arch { +// Forward declare Architecture and ExceptionHandler classes. +class Architecture; +class ExceptionHandler; +} // namespace arch /** An abstract core model. */ class Core { public: + Core(memory::MemoryInterface& dataMemory, const arch::Architecture& isa, + const std::vector& regFileStructure) + : dataMemory_(dataMemory), + isa_(isa), + registerFileSet_(regFileStructure), + clockFrequency_( + config::SimInfo::getConfig()["Core"]["Clock-Frequency-GHz"] + .as()) {} + virtual ~Core() {} /** Tick the core. */ @@ -30,11 +46,77 @@ class Core { /** Retrieve the number of instructions retired. */ virtual uint64_t getInstructionsRetiredCount() const = 0; - /** Retrieve the simulated nanoseconds elapsed since the core started. */ - virtual uint64_t getSystemTimer() const = 0; - /** Retrieve a map of statistics to report. 
*/ virtual std::map getStats() const = 0; + + /** Retrieve the simulated nanoseconds elapsed since the core started. */ + uint64_t getSystemTimer() const { + // TODO: This will need to be changed if we start supporting DVFS. + return (ticks_ / clockFrequency_); + } + + protected: + /** Apply changes to the process state. */ + void applyStateChange(const arch::ProcessStateChange& change) const { + auto& regFile = const_cast( + getArchitecturalRegisterFileSet()); + // Update registers in accordance with the ProcessStateChange type + switch (change.type) { + case arch::ChangeType::INCREMENT: { + for (size_t i = 0; i < change.modifiedRegisters.size(); i++) { + regFile.set(change.modifiedRegisters[i], + regFile.get(change.modifiedRegisters[i]).get() + + change.modifiedRegisterValues[i].get()); + } + break; + } + case arch::ChangeType::DECREMENT: { + for (size_t i = 0; i < change.modifiedRegisters.size(); i++) { + regFile.set(change.modifiedRegisters[i], + regFile.get(change.modifiedRegisters[i]).get() - + change.modifiedRegisterValues[i].get()); + } + break; + } + default: { // arch::ChangeType::REPLACEMENT + // If type is ChangeType::REPLACEMENT, set new values + for (size_t i = 0; i < change.modifiedRegisters.size(); i++) { + regFile.set(change.modifiedRegisters[i], + change.modifiedRegisterValues[i]); + } + break; + } + } + + // Update memory + // TODO: Analyse if ChangeType::INCREMENT or ChangeType::DECREMENT case is + // required for memory changes + for (size_t i = 0; i < change.memoryAddresses.size(); i++) { + dataMemory_.requestWrite(change.memoryAddresses[i], + change.memoryAddressValues[i]); + } + } + + /** A memory interface to access data. */ + memory::MemoryInterface& dataMemory_; + + /** The currently used ISA. */ + const arch::Architecture& isa_; + + /** The core's register file set. */ + RegisterFileSet registerFileSet_; + + /** The active exception handler. */ + std::shared_ptr exceptionHandler_; + + /** The number of times this core has been ticked. */ + uint64_t ticks_ = 0; + + /** Whether or not the core has halted. 
*/ + bool hasHalted_ = false; + + /** Clock frequency of core in GHz */ + float clockFrequency_ = 0.0f; }; } // namespace simeng diff --git a/src/include/simeng/CoreInstance.hh b/src/include/simeng/CoreInstance.hh index c8e151e884..2cc739f3f9 100644 --- a/src/include/simeng/CoreInstance.hh +++ b/src/include/simeng/CoreInstance.hh @@ -2,96 +2,71 @@ #include -#include "simeng/AlwaysNotTakenPredictor.hh" #include "simeng/Core.hh" #include "simeng/Elf.hh" -#include "simeng/FixedLatencyMemoryInterface.hh" -#include "simeng/FlatMemoryInterface.hh" -#include "simeng/GenericPredictor.hh" -#include "simeng/ModelConfig.hh" #include "simeng/SpecialFileDirGen.hh" #include "simeng/arch/Architecture.hh" #include "simeng/arch/aarch64/Architecture.hh" #include "simeng/arch/riscv/Architecture.hh" +#include "simeng/branchpredictors/AlwaysNotTakenPredictor.hh" +#include "simeng/branchpredictors/GenericPredictor.hh" +#include "simeng/branchpredictors/PerceptronPredictor.hh" +#include "simeng/config/SimInfo.hh" #include "simeng/kernel/Linux.hh" +#include "simeng/memory/FixedLatencyMemoryInterface.hh" +#include "simeng/memory/FlatMemoryInterface.hh" #include "simeng/models/emulation/Core.hh" #include "simeng/models/inorder/Core.hh" #include "simeng/models/outoforder/Core.hh" #include "simeng/pipeline/A64FXPortAllocator.hh" #include "simeng/pipeline/BalancedPortAllocator.hh" -#include "yaml-cpp/yaml.h" - -// Program used when no executable is provided; counts down from -// 1024*1024, with an independent `orr` at the start of each branch. -uint32_t hex_[] = { - 0x320C03E0, // orr w0, wzr, #1048576 - 0x320003E1, // orr w0, wzr, #1 - 0x71000400, // subs w0, w0, #1 - 0x54FFFFC1, // b.ne -8 - // .exit: - 0xD2800000, // mov x0, #0 - 0xD2800BC8, // mov x8, #94 - 0xD4000001, // svc #0 -}; +#include "simeng/pipeline/M1PortAllocator.hh" namespace simeng { -/** The available modes of simulation. */ -enum class SimulationMode { Emulation, InOrderPipelined, OutOfOrder }; - /** A class to create a SimEng core instance from a supplied config. */ class CoreInstance { public: - /** Default constructor with an executable and its arguments but no model - * configuration. */ + /** Default constructor with an executable and its arguments. */ CoreInstance(std::string executablePath, - std::vector executableArgs); - - /** Constructor with an executable, its arguments, and a model configuration. - */ - CoreInstance(std::string configPath, std::string executablePath, - std::vector executableArgs); + std::vector executableArgs, + ryml::ConstNodeRef config = config::SimInfo::getConfig()); /** CoreInstance with source code assembled by LLVM and a model configuration. */ - CoreInstance(char* assembledSource, size_t sourceSize, - std::string configPath); + CoreInstance(uint8_t* assembledSource, size_t sourceSize, + ryml::ConstNodeRef config = config::SimInfo::getConfig()); ~CoreInstance(); /** Set the SimEng L1 instruction cache memory. */ - void setL1InstructionMemory(std::shared_ptr memRef); + void setL1InstructionMemory( + std::shared_ptr memRef); /** Set the SimEng L1 data cache memory. */ - void setL1DataMemory(std::shared_ptr memRef); + void setL1DataMemory(std::shared_ptr memRef); /** Construct the core and all its associated simulation objects after the * process and memory interfaces have been instantiated. */ void createCore(); - /** Getter for the set simulation mode. */ - const SimulationMode getSimulationMode() const; - - /** Getter for the set simulation mode in a string format. 
*/ - const std::string getSimulationModeString() const; - /** Getter for the create core object. */ std::shared_ptr getCore() const; /** Getter for the create data memory object. */ - std::shared_ptr getDataMemory() const; + std::shared_ptr getDataMemory() const; /** Getter for the create instruction memory object. */ - std::shared_ptr getInstructionMemory() const; + std::shared_ptr getInstructionMemory() const; /** Getter for a shared pointer to the created process image. */ std::shared_ptr getProcessImage() const; /** Getter for the size of the created process image. */ - const uint64_t getProcessImageSize() const; + uint64_t getProcessImageSize() const; /* Getter for heap start. */ - const uint64_t getHeapStart() const; + uint64_t getHeapStart() const; private: /** Generate the appropriate simulation objects as parameterised by the @@ -99,9 +74,6 @@ class CoreInstance { void generateCoreModel(std::string executablePath, std::vector executableArgs); - /** Extract simulation mode from config file. */ - void setSimulationMode(); - /** Construct the SimEng linux process object from command line arguments. * Empty command line arguments denote the usage of hardcoded * instructions held in the hex_ array. */ @@ -112,25 +84,28 @@ class CoreInstance { void createProcessMemory(); /** Construct the SimEng L1 instruction cache memory. */ - void createL1InstructionMemory(const simeng::MemInterfaceType type); + void createL1InstructionMemory(const memory::MemInterfaceType type); /** Construct the SimEng L1 data cache memory. */ - void createL1DataMemory(const simeng::MemInterfaceType type); + void createL1DataMemory(const memory::MemInterfaceType type); /** Construct the special file directory. */ void createSpecialFileDirectory(); - /** Whether or not the source has been assembled by LLVM. */ - bool assembledSource_ = false; + /** The config file describing the modelled core to be created. */ + ryml::ConstNodeRef config_; + + /** The SimEng Linux kernel object. */ + simeng::kernel::Linux kernel_; /** Reference to source assembled by LLVM. */ - char* source_ = nullptr; + uint8_t* source_ = nullptr; /** Size of the source code assembled by LLVM. */ size_t sourceSize_ = 0; - /** The config file describing the modelled core to be created. */ - YAML::Node config_; + /** Whether or not the source has been assembled by LLVM. */ + bool assembledSource_ = false; /** Reference to the SimEng linux process object. */ std::unique_ptr process_ = nullptr; @@ -141,9 +116,6 @@ class CoreInstance { /** The process memory space. */ std::shared_ptr processMemory_; - /** The SimEng Linux kernel object. */ - simeng::kernel::Linux kernel_; - /** Whether or not the dataMemory_ must be set manually. */ bool setDataMemory_ = false; @@ -162,18 +134,11 @@ class CoreInstance { /** Reference to the SimEng core object. */ std::shared_ptr core_ = nullptr; - /** The simulation mode in use, defaulting to emulation. */ - SimulationMode mode_ = SimulationMode::Emulation; - - /** A string format for the simulation mode in use, defaulting to emulation. - */ - std::string modeString_ = "Emulation"; - /** Reference to the SimEng data memory object. */ - std::shared_ptr dataMemory_ = nullptr; + std::shared_ptr dataMemory_ = nullptr; /** Reference to the SimEng instruction memory object. 
*/ - std::shared_ptr instructionMemory_ = nullptr; + std::shared_ptr instructionMemory_ = nullptr; }; } // namespace simeng diff --git a/src/include/simeng/Elf.hh b/src/include/simeng/Elf.hh index 96e9b0f06e..b02e56e9e7 100644 --- a/src/include/simeng/Elf.hh +++ b/src/include/simeng/Elf.hh @@ -1,5 +1,6 @@ #pragma once +#include #include #include diff --git a/src/include/simeng/GenericPredictor.hh b/src/include/simeng/GenericPredictor.hh deleted file mode 100644 index cba924125c..0000000000 --- a/src/include/simeng/GenericPredictor.hh +++ /dev/null @@ -1,76 +0,0 @@ -#pragma once - -#include -#include -#include - -#include "simeng/BranchPredictor.hh" -#include "yaml-cpp/yaml.h" - -namespace simeng { - -/** A generic branch predictor implementing well known/text book branch - * predictor logic. The following predictors have been included: - * - * - Static predictor based on pre-allocated branch type. - * - * - A Branch Target Buffer (BTB) with a local and global indexing scheme and a - * 2-bit saturating counter. - * - * - A Return Address Stack (RAS) is also in use. - */ - -class GenericPredictor : public BranchPredictor { - public: - /** Initialise predictor models. */ - GenericPredictor(YAML::Node config); - ~GenericPredictor(); - - /** Generate a branch prediction for the supplied instruction address, a - * branch type, and a known branch offset; defaults to 0 meaning offset is not - * known. Returns a branch direction and branch target address. */ - BranchPrediction predict(uint64_t address, BranchType type, - int64_t knownOffset = 0) override; - - /** Updates appropriate predictor model objects based on the address and - * outcome of the branch instruction. */ - void update(uint64_t address, bool taken, uint64_t targetAddress, - BranchType type) override; - - /** Provides RAS rewinding behaviour. */ - void flush(uint64_t address) override; - - private: - /** The bitlength of the BTB index; BTB will have 2^bits entries. */ - uint64_t btbBits_; - - /** A 2^bits length vector of pairs containing a satCntBits_-bit saturating - * counter and a branch target. */ - std::vector> btb_; - - /** The previous BTB index calculated for an address. */ - std::map btbHistory_; - - /** The number of bits used to form the saturating counter in a BTB entry. */ - uint64_t satCntBits_; - - /** A n-bit history of previous branch directions where n is equal to - * globalHistoryLength_. */ - uint64_t globalHistory_ = 0; - - /** The number of previous branch directions recorded globally. */ - uint64_t globalHistoryLength_; - - /** A return address stack. */ - std::deque ras_; - - /** RAS history with instruction address as the keys. A non-zero value - * represents the target prediction for a return instruction and a 0 entry for - * a branch-and-link instruction. */ - std::map rasHistory_; - - /** The size of the RAS. 
*/ - uint64_t rasSize_; -}; - -} // namespace simeng diff --git a/src/include/simeng/Instruction.hh b/src/include/simeng/Instruction.hh index 9a42cae25f..1d389bd5e8 100644 --- a/src/include/simeng/Instruction.hh +++ b/src/include/simeng/Instruction.hh @@ -3,28 +3,40 @@ #include #include "capstone/capstone.h" -#include "simeng/BranchPredictor.hh" -#include "simeng/MemoryInterface.hh" -#include "simeng/RegisterFileSet.hh" +#include "simeng/Register.hh" #include "simeng/RegisterValue.hh" +#include "simeng/branchpredictors/BranchPrediction.hh" +#include "simeng/memory/MemoryInterface.hh" #include "simeng/span.hh" using InstructionException = short; namespace simeng { +/** A struct holding user-defined execution information for an instruction. */ +struct ExecutionInfo { + /** The latency for the instruction. */ + uint16_t latency = 1; + + /** The execution throughput for the instruction. */ + uint16_t stallCycles = 1; + + /** The ports that support the instruction. */ + std::vector ports = {}; +}; + /** An abstract instruction definition. * Each supported ISA should provide a derived implementation of this class. */ class Instruction { public: virtual ~Instruction(){}; - /** Check whether an exception has been encountered while processing this - * instruction. */ - bool exceptionEncountered() const; - /** Retrieve the source registers this instruction reads. */ - virtual const span getOperandRegisters() const = 0; + virtual const span getSourceRegisters() const = 0; + + /** Retrieve the data contained in the source registers this instruction + * reads.*/ + virtual const span getSourceOperands() const = 0; /** Retrieve the destination registers this instruction will write to. * A register value of -1 signifies a Zero Register read, and should not be @@ -33,69 +45,34 @@ class Instruction { /** Override the specified source register with a renamed physical register. */ - virtual void renameSource(uint8_t i, Register renamed) = 0; + virtual void renameSource(uint16_t i, Register renamed) = 0; /** Override the specified destination register with a renamed physical * register. */ - virtual void renameDestination(uint8_t i, Register renamed) = 0; + virtual void renameDestination(uint16_t i, Register renamed) = 0; /** Provide a value for the operand at the specified index. */ - virtual void supplyOperand(uint8_t i, const RegisterValue& value) = 0; + virtual void supplyOperand(uint16_t i, const RegisterValue& value) = 0; /** Check whether the operand at index `i` has had a value supplied. */ virtual bool isOperandReady(int i) const = 0; - /** Check whether all operand values have been supplied, and the instruction - * is ready to execute. */ - virtual bool canExecute() const = 0; - - /** Execute the instruction. */ - virtual void execute() = 0; - - /** Check whether the instruction has executed and has results ready to - * write back. */ - bool hasExecuted() const; - - /** Mark the instruction as ready to commit. */ - void setCommitReady(); - - /** Check whether the instruction has written its values back and is ready to - * commit. */ - bool canCommit() const; - /** Retrieve register results. */ virtual const span getResults() const = 0; /** Generate memory addresses this instruction wishes to access. */ - virtual span generateAddresses() = 0; + virtual span generateAddresses() = 0; + + /** Retrieve previously generated memory addresses. */ + virtual span getGeneratedAddresses() + const = 0; /** Provide data from a requested memory address. 
*/ virtual void supplyData(uint64_t address, const RegisterValue& data) = 0; - /** Retrieve previously generated memory addresses. */ - virtual span getGeneratedAddresses() const = 0; - /** Retrieve supplied memory data. */ virtual span getData() const = 0; - /** Check whether all required data has been supplied. */ - bool hasAllData() const; - - /** Early misprediction check; see if it's possible to determine whether the - * next instruction address was mispredicted without executing the - * instruction. Returns a {mispredicted, target} tuple representing whether - * the instruction was mispredicted, and the correct target address. */ - virtual std::tuple checkEarlyBranchMisprediction() const = 0; - - /** Check for misprediction. */ - bool wasBranchMispredicted() const; - - /** Retrieve branch address. */ - uint64_t getBranchAddress() const; - - /** Was the branch taken? */ - bool wasBranchTaken() const; - /** Retrieve branch type. */ virtual BranchType getBranchType() const = 0; @@ -116,88 +93,192 @@ class Instruction { /** Is this a branch operation? */ virtual bool isBranch() const = 0; - /** Set this instruction's instruction memory address. */ - void setInstructionAddress(uint64_t address); + /** Retrieve the instruction group this instruction belongs to. */ + virtual uint16_t getGroup() const = 0; - /** Get this instruction's instruction memory address. */ - uint64_t getInstructionAddress() const; + /** Check whether all operand values have been supplied, and the instruction + * is ready to execute. */ + virtual bool canExecute() const = 0; - /** Supply a branch prediction. */ - void setBranchPrediction(BranchPrediction prediction); + /** Execute the instruction. */ + virtual void execute() = 0; - /** Get a branch prediction. */ - BranchPrediction getBranchPrediction() const; + /** Get this instruction's supported set of ports. */ + virtual const std::vector& getSupportedPorts() = 0; + + /** Set this instruction's execution information including it's execution + * latency and throughput, and the set of ports which support it. */ + virtual void setExecutionInfo(const ExecutionInfo& info) = 0; /** Set this instruction's sequence ID. */ - void setSequenceId(uint64_t seqId); + void setSequenceId(uint64_t seqId) { sequenceId_ = seqId; } /** Retrieve this instruction's sequence ID. */ - uint64_t getSequenceId() const; + uint64_t getSequenceId() const { return sequenceId_; } /** Set this instruction's instruction ID. */ - void setInstructionId(uint64_t insnId); + void setInstructionId(uint64_t insnId) { instructionId_ = insnId; } /** Retrieve this instruction's instruction ID. */ - uint64_t getInstructionId() const; + uint64_t getInstructionId() const { return instructionId_; } - /** Mark this instruction as flushed. */ - void setFlushed(); + /** Set this instruction's instruction memory address. */ + void setInstructionAddress(uint64_t address) { + instructionAddress_ = address; + } - /** Check whether this instruction has been flushed. */ - bool isFlushed() const; + /** Get this instruction's instruction memory address. */ + uint64_t getInstructionAddress() const { return instructionAddress_; } - /** Retrieve the instruction group this instruction belongs to. */ - virtual uint16_t getGroup() const = 0; + /** Supply a branch prediction. */ + void setBranchPrediction(BranchPrediction prediction) { + prediction_ = prediction; + } - /** Retrieve the number of cycles this instruction will take to execute. */ - uint16_t getLatency() const; + /** Get a branch prediction. 
*/ + BranchPrediction getBranchPrediction() const { return prediction_; } - /** Retrieve the number of cycles this instruction will take to be prcoessed - * by the LSQ. */ - uint16_t getLSQLatency() const; + /** Retrieve branch address. */ + uint64_t getBranchAddress() const { return branchAddress_; } - /** Retrieve the number of cycles this instruction will block the unit - * executing it. */ - uint16_t getStallCycles() const; + /** Was the branch taken? */ + bool wasBranchTaken() const { return branchTaken_; } - /** Get this instruction's supported set of ports. */ - virtual const std::vector& getSupportedPorts() = 0; + /** Check for misprediction. */ + bool wasBranchMispredicted() const { + assert(executed_ && + "Branch misprediction check requires instruction to have executed"); + // Flag as mispredicted if taken state was wrongly predicted, or taken + // and predicted target is wrong + return ((branchTaken_ != prediction_.isTaken) || + (prediction_.target != branchAddress_)); + } - /** Is this a micro-operation? */ - bool isMicroOp() const; + /** Check whether an exception has been encountered while processing this + * instruction. */ + bool exceptionEncountered() const { return exceptionEncountered_; } - /** Is this the last uop in the possible sequence of decoded uops? */ - bool isLastMicroOp() const; + /** Check whether all required data has been supplied. */ + bool hasAllData() const { return (dataPending_ == 0); } + + /** Check whether the instruction has executed and has results ready to + * write back. */ + bool hasExecuted() const { return executed_; } + + /** Retrieve the number of cycles this instruction will take to execute. */ + uint16_t getLatency() const { return latency_; } + + /** Retrieve the number of cycles this instruction will block the unit + * executing it. */ + uint16_t getStallCycles() const { return stallCycles_; } + + /** Retrieve the number of cycles this instruction will take to be processed + * by the LSQ. */ + uint16_t getLSQLatency() const { return lsqExecutionLatency_; } /** Set the micro-operation in an awaiting commit signal state. */ - void setWaitingCommit(); + void setWaitingCommit() { waitingCommit_ = true; } /** Is the micro-operation in an awaiting commit state? */ - bool isWaitingCommit() const; + bool isWaitingCommit() const { return waitingCommit_; } + + /** Mark the instruction as ready to commit. */ + void setCommitReady() { canCommit_ = true; } + + /** Check whether the instruction has written its values back and is ready to + * commit. */ + bool canCommit() const { return canCommit_; } + + /** Mark this instruction as flushed. */ + void setFlushed() { flushed_ = true; } + + /** Check whether this instruction has been flushed. */ + bool isFlushed() const { return flushed_; } + + /** Is this a micro-operation? */ + bool isMicroOp() const { return isMicroOp_; } + + /** Is this the last uop in the possible sequence of decoded uops? */ + bool isLastMicroOp() const { return isLastMicroOp_; } /** Get arbitrary micro-operation index. */ - int getMicroOpIndex() const; + int getMicroOpIndex() const { return microOpIndex_; } /** Set this instructions' trace ID. */ - void setTraceId(uint64_t trId); + void setTraceId(uint64_t trId) { traceId_ = trId; } /** Retrieve this instructions' trace ID. */ - uint64_t getTraceId() const; + uint64_t getTraceId() const { return traceId_; } protected: - /** Whether an exception has been encountered. 
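The inlined `wasBranchMispredicted()` above folds the misprediction rule into the header: a branch is flagged as mispredicted if the taken/not-taken outcome was predicted wrongly, or if the predicted target differs from the address the branch actually resolved to. A minimal standalone sketch of that same rule follows; `BranchPrediction` is re-declared locally so the example compiles on its own, and the addresses used are illustrative.

```cpp
#include <cstdint>
#include <iostream>

// Local stand-in for simeng::BranchPrediction, holding only the two fields
// the misprediction check reads.
struct BranchPrediction {
  bool isTaken;
  uint64_t target;
};

bool wasBranchMispredicted(bool branchTaken, uint64_t branchAddress,
                           const BranchPrediction& prediction) {
  // Mispredicted if the taken state was wrongly predicted, or the predicted
  // target does not match the resolved branch address.
  return (branchTaken != prediction.isTaken) ||
         (prediction.target != branchAddress);
}

int main() {
  BranchPrediction pred{true, 0x400100};
  std::cout << wasBranchMispredicted(true, 0x400100, pred) << "\n";   // 0
  std::cout << wasBranchMispredicted(true, 0x400200, pred) << "\n";   // 1: wrong target
  std::cout << wasBranchMispredicted(false, 0x400104, pred) << "\n";  // 1: wrong direction
  return 0;
}
```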
*/ - bool exceptionEncountered_ = false; + /** Set the accessed memory addresses, and create a corresponding memory data + * vector. */ + void setMemoryAddresses( + const std::vector& addresses) { + memoryData_.resize(addresses.size()); + memoryAddresses_ = addresses; + dataPending_ = addresses.size(); + } + + /** Set the accessed memory addresses, and create a corresponding memory data + * vector. */ + void setMemoryAddresses(std::vector&& addresses) { + dataPending_ = addresses.size(); + memoryData_.resize(addresses.size()); + memoryAddresses_ = std::move(addresses); + } + + /** Set the accessed memory addresses, and create a corresponding memory data + * vector. */ + void setMemoryAddresses(memory::MemoryAccessTarget address) { + dataPending_ = 1; + memoryData_.resize(1); + memoryAddresses_.push_back(address); + } + + // Instruction Info + /** This instruction's instruction ID used to group micro-operations together + * by macro-op; a higher ID represents a chronologically newer instruction. */ + uint64_t instructionId_ = 0; + + /** This instruction's sequence ID; a higher ID represents a chronologically + * newer instruction. */ + uint64_t sequenceId_ = 0; /** The location in memory of this instruction was decoded at. */ - uint64_t instructionAddress_; + uint64_t instructionAddress_ = 0; + // Execution /** Whether or not this instruction has been executed. */ bool executed_ = false; + /** The number of cycles this instruction takes to execute. */ + uint16_t latency_ = 1; + + /** The number of cycles a load or store instruction takes to execute within + * the load/store queue. */ + uint16_t lsqExecutionLatency_ = 1; + + /** The number of cycles this instruction will stall the unit executing it + * for. */ + uint16_t stallCycles_ = 1; + + /** The execution ports that this instruction can be issued to. */ + std::vector supportedPorts_ = {}; + /** Whether or not this instruction is ready to commit. */ bool canCommit_ = false; // Memory + /** The memory addresses this instruction accesses, as a vector of {offset, + * width} pairs. */ + std::vector memoryAddresses_; + + /** A vector of memory values, that were either loaded memory, or are prepared + * for sending to memory (according to instruction type). Each entry + * corresponds to a `memoryAddresses` entry. */ + std::vector memoryData_; + /** The number of data items that still need to be supplied. */ uint8_t dataPending_ = 0; @@ -219,26 +300,11 @@ class Instruction { int64_t knownOffset_ = 0; // Flushing - /** This instruction's sequence ID; a higher ID represents a chronologically - * newer instruction. */ - uint64_t sequenceId_; - /** Has this instruction been flushed? */ bool flushed_ = false; - /** The number of cycles this instruction takes to execute. */ - uint16_t latency_ = 1; - - /** The number of cycles a load or store instruction takes to execute within - * the load/store queue. */ - uint16_t lsqExecutionLatency_ = 1; - - /** The number of cycles this instruction will stall the unit executing it - * for. */ - uint16_t stallCycles_ = 1; - - /** The execution ports that this instruction can be issued to. */ - std::vector supportedPorts_ = {}; + /** Whether an exception has been encountered. */ + bool exceptionEncountered_ = false; // Micro operations /** Is a resultant micro-operation from an instruction split? */ @@ -248,17 +314,13 @@ class Instruction { * of decoded uops. Default case is that it is. 
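The three protected `setMemoryAddresses()` overloads above, together with `memoryAddresses_`, `memoryData_`, and `dataPending_`, implement a simple outstanding-response count: one data slot is reserved per generated address, each supplied value decrements the pending counter, and `hasAllData()` becomes true once every address has been answered. The following is a standalone sketch of that bookkeeping only; the simplified `MemoryAccessTarget` and `RegisterValue` stand-ins are assumptions for illustration and are not the SimEng classes.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Simplified stand-in for memory::MemoryAccessTarget ({address, width}).
struct MemoryAccessTarget {
  uint64_t address;
  uint16_t size;
};
// Simplified stand-in for simeng::RegisterValue.
using RegisterValue = std::vector<char>;

class LoadSketch {
 public:
  void setMemoryAddresses(const std::vector<MemoryAccessTarget>& addresses) {
    // Reserve one data slot per address and record how many are outstanding.
    memoryData_.resize(addresses.size());
    memoryAddresses_ = addresses;
    dataPending_ = static_cast<uint8_t>(addresses.size());
  }

  void supplyData(uint64_t address, const RegisterValue& data) {
    // Match the response to its address slot and mark it as satisfied.
    for (size_t i = 0; i < memoryAddresses_.size(); i++) {
      if (memoryAddresses_[i].address == address) {
        memoryData_[i] = data;
        dataPending_--;
        return;
      }
    }
  }

  bool hasAllData() const { return dataPending_ == 0; }

 private:
  std::vector<MemoryAccessTarget> memoryAddresses_;
  std::vector<RegisterValue> memoryData_;
  uint8_t dataPending_ = 0;
};

int main() {
  LoadSketch load;
  load.setMemoryAddresses({{0x1000, 8}, {0x1008, 8}});
  load.supplyData(0x1000, RegisterValue(8, 0));
  std::cout << load.hasAllData() << "\n";  // 0: one response still outstanding
  load.supplyData(0x1008, RegisterValue(8, 0));
  std::cout << load.hasAllData() << "\n";  // 1: all data supplied
  return 0;
}
```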
*/ bool isLastMicroOp_ = true; - /** This instruction's instruction ID used to group micro-operations together - * by macro-op; a higher ID represents a chronologically newer instruction. */ - uint64_t instructionId_; - /** Is the micro-operation in a committable state but must wait for all * associated micro-operations to also be committable? */ bool waitingCommit_ = false; /** An arbitrary index value for the micro-operation. Its use is based on the * implementation of specific micro-operations. */ - int microOpIndex_; + int microOpIndex_ = 0; // Traces /** This instruction's trace ID; a higher ID represents a chronologically diff --git a/src/include/simeng/ModelConfig.hh b/src/include/simeng/ModelConfig.hh deleted file mode 100644 index fab383e165..0000000000 --- a/src/include/simeng/ModelConfig.hh +++ /dev/null @@ -1,193 +0,0 @@ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "simeng/arch/aarch64/Instruction.hh" -#include "yaml-cpp/yaml.h" - -#define DEFAULT_CONFIG \ - ("{Core: {ISA: AArch64, Simulation-Mode: inorderpipelined, " \ - "Clock-Frequency: 2.5, Timer-Frequency: 100, Micro-Operations: True, " \ - "Vector-Length: 512, Streaming-Vector-Length: 512}, Fetch: " \ - "{Fetch-Block-Size: 32, Loop-Buffer-Size: 64, Loop-Detection-Threshold: " \ - "4}, Process-Image: {Heap-Size: 10485760, Stack-Size: 1048576}, " \ - "Register-Set: {GeneralPurpose-Count: 154, FloatingPoint/SVE-Count: 90, " \ - "Predicate-Count: 17, Conditional-Count: 128, Matrix-Count: 2}, " \ - "Pipeline-Widths: {Commit: 4, FrontEnd: 4, LSQ-Completion: 2}, " \ - "Queue-Sizes: {ROB: 180, Load: 64, Store: 36}, Branch-Predictor: " \ - "{BTB-Tag-Bits: 11, Saturating-Count-Bits: 2, Global-History-Length: 10, " \ - "RAS-entries: 5, Fallback-Static-Predictor: 2}, L1-Data-Memory: " \ - "{Interface-Type: Flat}, L1-Instruction-Memory: {Interface-Type: Flat}, " \ - "LSQ-L1-Interface: {Access-Latency: 4, Exclusive: False, Load-Bandwidth: " \ - "32, Store-Bandwidth: 16, Permitted-Requests-Per-Cycle: 2, " \ - "Permitted-Loads-Per-Cycle: 2, Permitted-Stores-Per-Cycle: 1}, Ports: " \ - "{'0': {Portname: Port 0, Instruction-Group-Support: [1, 8, 14]}, '1': " \ - "{Portname: Port 1, Instruction-Group-Support: [0, 14]}, '2': {Portname: " \ - "Port 2, Instruction-Group-Support: [1, 8, 71]}, '3': {Portname: Port 4, " \ - "Instruction-Group-Support: [67]}, '4': {Portname: Port 5, " \ - "Instruction-Group-Support: [67]}, '5': {Portname: Port 3, " \ - "Instruction-Group-Support: [70]}}, Reservation-Stations: {'0': {Size: " \ - "60, Dispatch-Rate: 4, Ports: [0, 1, 2, 3, 4, 5]}}, Execution-Units: " \ - "{'0': {Pipelined: true}, '1': {Pipelined: true}, '2': {Pipelined: true}, " \ - "'3': {Pipelined:true}, '4': {Pipelined: true}, '5': {Pipelined: true}}, " \ - "CPU-Info: {Generate-Special-Dir: false, Core-Count: 1, Socket-Count: 1, " \ - "SMT: 1, BogoMIPS: 200.00, Features: fp asimd evtstrm atomics cpuid, " \ - "CPU-Implementer: 0x0, CPU-Architecture: 0, CPU-Variant: 0x0, CPU-Part: " \ - "0x0, CPU-Revision: 0, Package-Count: 1}}") - -namespace simeng { - -namespace ExpectedValue { -const uint8_t Integer = 0; -const uint8_t UInteger = 1; -const uint8_t Float = 2; -const uint8_t String = 3; -const uint8_t Bool = 4; -} // namespace ExpectedValue - -/** A class to correctly validate and format the provided - * configuration YAML file. */ -class ModelConfig { - public: - /** Construct a ModelConfig class by reading in the YAML file and - * running it through checks and formatting. 
*/ - ModelConfig(std::string path); - - /** Return the checked and formatted config file. */ - YAML::Node getConfigFile(); - - private: - /** If using a base config file, inherit and overwrite values form - * the base file. */ - void inherit(); - - /** Validate all required fields are filled with an appropriate - * value. */ - void validate(); - - /** From a pre-defined vector of instruction group strings, instantiate an - * ISA specific mapping between the instruction group strings and the - * relevant instruction group variables. */ - void createGroupMapping(); - - /** Given a node, value requirements, and possibly a default value, - * validate the value held within the node. All methods perform, at - * least, an existence and "read as type" check with the latter - * reading the value as the given type within a try catch - * expressions. */ - // Set of values requirement, no default value - template - int nodeChecker(const YAML::Node& node, const std::string& field, - const std::vector& value_set, uint8_t expected); - // Set of values requirement, with default value - template - int nodeChecker(YAML::Node node, const std::string& field, - const std::vector& value_set, uint8_t expected, - T default_value); - // Pair of inclusive bounds requirement, no default value - template - int nodeChecker(const YAML::Node& node, const std::string& field, - const std::pair& bounds, uint8_t expected); - // Pair of inclusive bounds requirement, with default value - template - int nodeChecker(YAML::Node node, const std::string& field, - const std::pair& bounds, uint8_t expected, - const T& default_value); - - /** Given a set of values (value_set), ensure the supplied node is one of - * these options. */ - template - int setChecker(YAML::Node node, const std::string& field, - const std::vector& value_set, uint8_t expected) { - // Ensure node value can be read as specified type - try { - T node_value = node.as(); - // Check if a set of expected options has been defined - if (value_set.size()) { - // Ensure values lies within the defined options - if (!(std::find(value_set.begin(), value_set.end(), node_value) != - value_set.end())) { - invalid_ << "\t- " << field << " value \"" << node_value - << "\" is not in the valid set {"; - for (int i = 0; i < value_set.size(); i++) { - invalid_ << value_set[i]; - if (i != value_set.size() - 1) invalid_ << ", "; - } - invalid_ << "}\n"; - return 0; - } - } - } catch (...) { - invalid_ << "\t- " << field << invalidTypeMap_[expected] << "\n"; - return 0; - } - return 1; - } - - /** Given a set of bounds (bounds) ensure the supplied node is between - * these value inclusively. */ - template - int boundChecker(YAML::Node node, const std::string& field, - const std::pair& bounds, uint8_t expected) { - // Ensure node value can be read as specified type - try { - T node_value = node.as(); - // Extract bounds from bounds pair - T lower_bound = bounds.first; - T upper_bound = bounds.second; - assert(lower_bound <= upper_bound && - "Defined lower bound of config option is not equal or " - "less than defined upper bound"); - - // Ensure value lies within the defined bounds - if (lower_bound > node_value || node_value > upper_bound) { - invalid_ << "\t- " << field - << " must conform to the inclusive bounds of " << lower_bound - << " and " << upper_bound << "\n"; - return 0; - } - } catch (...) { - invalid_ << "\t- " << field << invalidTypeMap_[expected] << "\n"; - return 0; - } - return 1; - } - - /** The YAML formatted config file. 
*/ - YAML::Node configFile_; - - /** The ISA specific vector of instruction group strings for matching - * against user inputted groups. */ - std::vector groupOptions_; - - /** ISA specific mapping between the defined instruction strings and the - * instruction group variables. */ - std::unordered_map groupMapping_; - - /** A mapping between the expected data type and the error message if a - * field cannot be read as the expected type. */ - std::unordered_map invalidTypeMap_ = { - {ExpectedValue::Integer, " must be of type integer"}, - {ExpectedValue::UInteger, " must be of type unsigned integer"}, - {ExpectedValue::Float, " must be of type float"}, - {ExpectedValue::String, " must be of type string"}, - {ExpectedValue::Bool, " must be of type bool"}}; - - /** A string stream containing information about missing config - * fields. */ - std::ostringstream missing_; - - /** A string stream containing information about invalid values. */ - std::ostringstream invalid_; -}; // namespace ModelConfig - -} // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/Register.hh b/src/include/simeng/Register.hh new file mode 100644 index 0000000000..0152813268 --- /dev/null +++ b/src/include/simeng/Register.hh @@ -0,0 +1,29 @@ +#pragma once +#include + +namespace simeng { + +/** A generic register identifier. */ +struct Register { + /** An identifier representing the type of register - e.g. 0 = general, 1 = + * vector. Used to determine which register file to access. */ + uint8_t type; + + /** A tag identifying the register. May correspond to either physical or + * architectural register, depending on point of usage. */ + uint16_t tag; + + /** A boolean identifier for whether the creation of this register has been a + * result of a register renaming scheme. */ + bool renamed = false; + + /** Check for equality of two register identifiers. */ + bool operator==(const Register& other) const { + return (other.type == type && other.tag == tag); + } + + /** Check for inequality of two register identifiers. */ + bool operator!=(const Register& other) const { return !(other == *this); } +}; + +} // namespace simeng diff --git a/src/include/simeng/RegisterFileSet.hh b/src/include/simeng/RegisterFileSet.hh index 41909da174..89f768d11a 100644 --- a/src/include/simeng/RegisterFileSet.hh +++ b/src/include/simeng/RegisterFileSet.hh @@ -2,34 +2,21 @@ #include +#include "simeng/Register.hh" #include "simeng/RegisterValue.hh" namespace simeng { -/** A generic register identifier. */ -struct Register { - /** An identifier representing the type of register - e.g. 0 = general, 1 = - * vector. Used to determine which register file to access. */ - uint8_t type; - - /** A tag identifying the register. May correspond to either physical or - * architectural register, depending on point of usage. */ - uint16_t tag; - - /** Check for equality of two register identifiers. */ - bool operator==(const Register& other) const; - - /** Check for inequality of two register identifiers. */ - bool operator!=(const Register& other) const; -}; -std::ostream& operator<<(std::ostream& os, simeng::Register const& reg); - /** Defines the structure of a register file. */ struct RegisterFileStructure { /** The number of bytes per register. */ uint16_t bytes; /** The number of registers. */ uint16_t quantity; + /** Check for the equality of two RegisterFileStructure structs. 
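The new standalone `Register` struct compares only `type` and `tag` in its equality operators, so the added `renamed` flag never affects comparisons. A small self-contained sketch of that behaviour follows; the struct is copied locally so the example compiles in isolation, and the type/tag values are illustrative.

```cpp
#include <cstdint>
#include <iostream>

// Local copy of the simeng::Register identifier shown above.
struct Register {
  uint8_t type;
  uint16_t tag;
  bool renamed = false;

  // Equality ignores the renamed flag: only type and tag are compared.
  bool operator==(const Register& other) const {
    return (other.type == type && other.tag == tag);
  }
  bool operator!=(const Register& other) const { return !(other == *this); }
};

int main() {
  Register architectural{0, 5};
  Register physical{0, 5, true};  // same type/tag, but marked as renamed
  std::cout << (architectural == physical) << "\n";        // 1: renamed ignored
  std::cout << (architectural != Register{1, 5}) << "\n";  // 1: type differs
  return 0;
}
```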
*/ + bool operator==(const RegisterFileStructure& other) const { + return (bytes == other.bytes) && (quantity == other.quantity); + } }; /** A processor register file set. Holds the physical registers for each diff --git a/src/include/simeng/RegisterValue.hh b/src/include/simeng/RegisterValue.hh index 20004432d0..4faadb301d 100644 --- a/src/include/simeng/RegisterValue.hh +++ b/src/include/simeng/RegisterValue.hh @@ -54,7 +54,7 @@ class RegisterValue { */ RegisterValue(const char* ptr, uint16_t bytes, uint16_t capacity) : bytes(capacity) { - assert(capacity >= bytes && "Capacity is less then requested bytes"); + assert(capacity >= bytes && "Capacity is less than requested bytes"); char* dest; if (isLocal()) { dest = this->value; @@ -90,7 +90,7 @@ class RegisterValue { * the specified datatype. */ template const T* getAsVector() const { - static_assert(alignof(T) <= 8 && "Alignment over 8 bytes not guranteed"); + static_assert(alignof(T) <= 8 && "Alignment over 8 bytes not guaranteed"); assert(bytes > 0 && "Attempted to access an uninitialised RegisterValue"); assert(sizeof(T) <= bytes && "Attempted to access a RegisterValue as a datatype larger than the " @@ -131,4 +131,16 @@ class RegisterValue { alignas(8) char value[MAX_LOCAL_BYTES]; }; +inline bool operator==(const RegisterValue& lhs, const RegisterValue& rhs) { + if (lhs.size() == rhs.size()) { + auto lhV = lhs.getAsVector(); + auto rhV = rhs.getAsVector(); + for (size_t i = 0; i < lhs.size(); i++) { + if (lhV[i] != rhV[i]) return false; + } + return true; + } + return false; +} + } // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/SpecialFileDirGen.hh b/src/include/simeng/SpecialFileDirGen.hh index faf5d2fac5..a60c0d54ca 100644 --- a/src/include/simeng/SpecialFileDirGen.hh +++ b/src/include/simeng/SpecialFileDirGen.hh @@ -3,15 +3,14 @@ #include #include -#include "simeng/version.hh" -#include "yaml-cpp/yaml.h" +#include "simeng/config/SimInfo.hh" namespace simeng { class SpecialFileDirGen { public: /** Construct a SpecialFileDirGen class by reading in the YAML file and * running it through checks and formatting. */ - SpecialFileDirGen(YAML::Node config); + SpecialFileDirGen(ryml::ConstNodeRef config = config::SimInfo::getConfig()); /** Removes all files inside the '/src.lib/kernel/specialFiles' directory. */ void RemoveExistingSFDir(); @@ -22,21 +21,21 @@ class SpecialFileDirGen { private: /** Path to the root of the SimEng special files directory. */ - const std::string specialFilesDir_ = SIMENG_BUILD_DIR "/specialFiles"; + const std::string specialFilesDir_; /** Values declared in YAML config file needed to create the Special Files * Directory tree. 
*/ - uint64_t core_count; - uint64_t smt; - uint64_t socket_count; - float bogoMIPS; - std::string features; - std::string cpu_implementer; - uint64_t cpu_architecture; - std::string cpu_variant; - std::string cpu_part; - uint64_t cpu_revision; - uint64_t package_count; + uint64_t coreCount_; + uint64_t socketCount_; + uint64_t smt_; + float bogoMIPS_; + std::string features_; + std::string cpuImplementer_; + uint64_t cpuArchitecture_; + std::string cpuVariant_; + std::string cpuPart_; + uint64_t cpuRevision_; + uint64_t packageCount_; }; // namespace SpecialFilesDirGen diff --git a/src/include/simeng/arch/ArchInfo.hh b/src/include/simeng/arch/ArchInfo.hh new file mode 100644 index 0000000000..e029699c07 --- /dev/null +++ b/src/include/simeng/arch/ArchInfo.hh @@ -0,0 +1,35 @@ +#pragma once + +#include + +#include "capstone/capstone.h" +#include "simeng/RegisterFileSet.hh" +#include "simeng/config/yaml/ryml.hh" + +namespace simeng { +namespace arch { + +/** A class to hold and generate architecture specific configuration options. */ +class ArchInfo { + public: + virtual ~ArchInfo(){}; + + /** Get the set of system register enums currently supported. */ + virtual const std::vector& getSysRegEnums() const = 0; + + /** Get the structure of the architecture register fileset(s). */ + virtual const std::vector& getArchRegStruct() + const = 0; + + /** Get the structure of the physical register fileset(s) as defined in the + * simulation configuration. */ + virtual const std::vector& getPhysRegStruct() + const = 0; + + /** Get the quantities of the physical register in each fileset as defined in + * the simulation configuration. */ + virtual const std::vector& getPhysRegQuantities() const = 0; +}; + +} // namespace arch +} // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/Architecture.hh b/src/include/simeng/arch/Architecture.hh index f789bece68..e4684bef8c 100644 --- a/src/include/simeng/arch/Architecture.hh +++ b/src/include/simeng/arch/Architecture.hh @@ -3,11 +3,13 @@ #include #include -#include "simeng/BranchPredictor.hh" #include "simeng/Core.hh" #include "simeng/Instruction.hh" -#include "simeng/MemoryInterface.hh" +#include "simeng/arch/ProcessStateChange.hh" +#include "simeng/branchpredictors/BranchPredictor.hh" #include "simeng/control.hh" +#include "simeng/kernel/Linux.hh" +#include "simeng/memory/MemoryInterface.hh" #include "simeng/trace.hh" namespace simeng { @@ -16,23 +18,6 @@ using MacroOp = std::vector>; namespace arch { -/** The types of changes that can be made to values within the process state. */ -enum class ChangeType { REPLACEMENT, INCREMENT, DECREMENT }; - -/** A structure describing a set of changes to the process state. */ -struct ProcessStateChange { - /** Type of changes to be made */ - ChangeType type; - /** Registers to modify */ - std::vector modifiedRegisters; - /** Values to set modified registers to */ - std::vector modifiedRegisterValues; - /** Memory address/width pairs to modify */ - std::vector memoryAddresses; - /** Values to write to memory */ - std::vector memoryAddressValues; -}; - /** The result from a handled exception. */ struct ExceptionResult { /** Whether execution should halt. */ @@ -61,27 +46,21 @@ class ExceptionHandler { * ISA should provide a derived implementation of this class. */ class Architecture { public: + Architecture(kernel::Linux& kernel) : linux_(kernel) {} + virtual ~Architecture(){}; /** Attempt to pre-decode from `bytesAvailable` bytes of instruction memory. 
* Writes into the supplied macro-op vector, and returns the number of bytes * consumed to produce it; a value of 0 indicates too few bytes were present * for a valid decoding. */ - virtual uint8_t predecode(const void* ptr, uint8_t bytesAvailable, + virtual uint8_t predecode(const uint8_t* ptr, uint16_t bytesAvailable, uint64_t instructionAddress, MacroOp& output, std::string& disasm) const = 0; - /** Returns a vector of {size, number} pairs describing the available - * registers. */ - virtual std::vector getRegisterFileStructures() - const = 0; - /** Returns a zero-indexed register tag for a system register encoding. */ virtual int32_t getSystemRegisterTag(uint16_t reg) const = 0; - /** Returns the number of system registers that have a mapping. */ - virtual uint16_t getNumSystemRegisters() const = 0; - /** Create an exception handler for the exception generated by * `instruction`, providing the core model object and a reference to * process memory. Returns a smart pointer to an `ExceptionHandler` which @@ -89,7 +68,7 @@ class Architecture { * obtained. */ virtual std::shared_ptr handleException( const std::shared_ptr& instruction, const Core& core, - MemoryInterface& memory) const = 0; + memory::MemoryInterface& memory) const = 0; /** Retrieve the initial process state. */ virtual ProcessStateChange getInitialState() const = 0; @@ -97,20 +76,30 @@ class Architecture { /** Returns the maximum size of a valid instruction in bytes. */ virtual uint8_t getMaxInstructionSize() const = 0; - /** Returns the physical register structure as defined within the config - * file - */ - virtual std::vector getConfigPhysicalRegisterStructure( - YAML::Node config) const = 0; - - /** Returns the physical register quantities as defined within the config file - */ - virtual std::vector getConfigPhysicalRegisterQuantities( - YAML::Node config) const = 0; + /** Returns the minimum size of a valid instruction in bytes. */ + virtual uint8_t getMinInstructionSize() const = 0; /** Updates System registers of any system-based timers. */ virtual void updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations) const = 0; + + protected: + /** A Capstone decoding library handle, for decoding instructions. */ + csh capstoneHandle_; + + /** A reference to a Linux kernel object to forward syscalls to. */ + kernel::Linux& linux_; + + /** A mapping from system register encoding to a zero-indexed tag. */ + std::unordered_map systemRegisterMap_; + + /** A map to hold the relationship between instruction groups and + * user-defined execution information. */ + std::unordered_map groupExecutionInfo_; + + /** A map to hold the relationship between instruction opcode and + * user-defined execution information. */ + std::unordered_map opcodeExecutionInfo_; }; } // namespace arch diff --git a/src/include/simeng/arch/ProcessStateChange.hh b/src/include/simeng/arch/ProcessStateChange.hh new file mode 100644 index 0000000000..08302a8233 --- /dev/null +++ b/src/include/simeng/arch/ProcessStateChange.hh @@ -0,0 +1,31 @@ +#pragma once + +#include + +#include "simeng/Register.hh" +#include "simeng/RegisterValue.hh" +#include "simeng/memory/MemoryAccessTarget.hh" + +namespace simeng { + +namespace arch { + +/** The types of changes that can be made to values within the process state. */ +enum class ChangeType { REPLACEMENT, INCREMENT, DECREMENT }; + +/** A structure describing a set of changes to the process state. 
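`getSystemRegisterTag()` is documented to return a zero-indexed tag, or -1 when the encoding has no mapping, and the base `Architecture` class now owns a `systemRegisterMap_` for that purpose. The sketch below assumes the map is keyed and valued by `uint16_t`, matching the `uint16_t reg` parameter; the encodings used are placeholders, not real AArch64 system-register encodings.

```cpp
#include <cstdint>
#include <iostream>
#include <unordered_map>

// Lookup of a zero-indexed tag for a system-register encoding; unmapped
// encodings yield -1, mirroring the documented contract.
int32_t getSystemRegisterTag(
    const std::unordered_map<uint16_t, uint16_t>& systemRegisterMap,
    uint16_t reg) {
  auto it = systemRegisterMap.find(reg);
  return (it != systemRegisterMap.end()) ? static_cast<int32_t>(it->second)
                                         : -1;
}

int main() {
  // Hypothetical encodings mapped to tags 0..2.
  std::unordered_map<uint16_t, uint16_t> sysRegMap = {
      {0x5A10, 0}, {0x5A20, 1}, {0x5A21, 2}};
  std::cout << getSystemRegisterTag(sysRegMap, 0x5A20) << "\n";  // 1
  std::cout << getSystemRegisterTag(sysRegMap, 0x1234) << "\n";  // -1
  return 0;
}
```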
*/ +struct ProcessStateChange { + /** Type of changes to be made */ + ChangeType type; + /** Registers to modify */ + std::vector modifiedRegisters; + /** Values to set modified registers to */ + std::vector modifiedRegisterValues; + /** Memory address/width pairs to modify */ + std::vector memoryAddresses; + /** Values to write to memory */ + std::vector memoryAddressValues; +}; + +} // namespace arch +} // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/aarch64/ArchInfo.hh b/src/include/simeng/arch/aarch64/ArchInfo.hh new file mode 100644 index 0000000000..1403da08f8 --- /dev/null +++ b/src/include/simeng/arch/aarch64/ArchInfo.hh @@ -0,0 +1,109 @@ +#pragma once + +#include "simeng/arch/ArchInfo.hh" + +namespace simeng { +namespace arch { +namespace aarch64 { + +/** A class to hold and generate aarch64 specific architecture configuration + * options. */ +class ArchInfo : public simeng::arch::ArchInfo { + public: + ArchInfo(ryml::ConstNodeRef config) + : sysRegisterEnums_({aarch64_sysreg::AARCH64_SYSREG_DCZID_EL0, + aarch64_sysreg::AARCH64_SYSREG_FPCR, + aarch64_sysreg::AARCH64_SYSREG_FPSR, + aarch64_sysreg::AARCH64_SYSREG_TPIDR_EL0, + aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1, + aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0, + aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0, + aarch64_sysreg::AARCH64_SYSREG_SVCR}), + zaSize_(config["Core"]["Streaming-Vector-Length"].as() / 8) { + // Generate the architecture-defined architectural register structure + archRegStruct_ = { + {8, 32}, // General purpose + {256, 32}, // Vector + {32, 17}, // Predicate + {1, 1}, // NZCV + {8, static_cast(sysRegisterEnums_.size())}, // System + {256, zaSize_}, // Matrix (Each row is a register) + {64, 1} // SME ZT0 table register (fixed width of 512-bit) + }; + + // Generate the config-defined physical register structure and quantities + ryml::ConstNodeRef regConfig = config["Register-Set"]; + uint16_t gpCount = regConfig["GeneralPurpose-Count"].as(); + uint16_t fpCount = regConfig["FloatingPoint/SVE-Count"].as(); + uint16_t predCount = regConfig["Predicate-Count"].as(); + uint16_t condCount = regConfig["Conditional-Count"].as(); + uint16_t matCount = regConfig["SME-Matrix-Count"].as(); + uint16_t tabCount = regConfig["SME-Lookup-Table-Count"].as(); + // SME-Matrix-Count multiplied by (SVL/8) as internal representation of ZA + // is a block of row-vector-registers. Therefore, we need to convert + // physical counts from whole-ZA to rows-in-ZA. + matCount *= zaSize_; + physRegStruct_ = {{8, gpCount}, + {256, fpCount}, + {32, predCount}, + {1, condCount}, + {8, static_cast(sysRegisterEnums_.size())}, + {256, matCount}, + {64, tabCount}}; + physRegQuantities_ = {gpCount, + fpCount, + predCount, + condCount, + static_cast(sysRegisterEnums_.size()), + matCount, + tabCount}; + } + + /** Get the set of system register enums currently supported. */ + const std::vector& getSysRegEnums() const override { + return sysRegisterEnums_; + } + + /** Get the structure of the architecture register fileset(s). */ + const std::vector& getArchRegStruct() + const override { + return archRegStruct_; + } + + /** Get the structure of the physical register fileset(s) as defined in the + * simulation configuration. */ + const std::vector& getPhysRegStruct() + const override { + return physRegStruct_; + } + + /** Get the quantities of the physical register in each fileset as defined in + * the simulation configuration. 
*/ + const std::vector& getPhysRegQuantities() const override { + return physRegQuantities_; + } + + private: + /** The vector of all system register Capstone enum values used in the + * associated Architecture class. */ + const std::vector sysRegisterEnums_; + + /** The structure of the architectural register filesets within the + * implemented aarch64 architecture. */ + std::vector archRegStruct_; + + /** The structure of the physical register filesets within the + * implemented aarch64 architecture. */ + std::vector physRegStruct_; + + /** The quantities of the physical register within each filesets of the + * implemented aarch64 architecture. */ + std::vector physRegQuantities_; + + /** The size, in bytes, used by the aarch64 SME ZA register. */ + const uint16_t zaSize_; +}; + +} // namespace aarch64 +} // namespace arch +} // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/aarch64/Architecture.hh b/src/include/simeng/arch/aarch64/Architecture.hh index 681d575463..1119fd564f 100644 --- a/src/include/simeng/arch/aarch64/Architecture.hh +++ b/src/include/simeng/arch/aarch64/Architecture.hh @@ -7,7 +7,6 @@ #include "simeng/arch/Architecture.hh" #include "simeng/arch/aarch64/ExceptionHandler.hh" #include "simeng/arch/aarch64/MicroDecoder.hh" -#include "simeng/kernel/Linux.hh" using csh = size_t; @@ -18,32 +17,29 @@ namespace aarch64 { /* A basic Armv9.2-a implementation of the `Architecture` interface. */ class Architecture : public arch::Architecture { public: - Architecture(kernel::Linux& kernel, YAML::Node config); + Architecture(kernel::Linux& kernel, + ryml::ConstNodeRef config = config::SimInfo::getConfig()); + ~Architecture(); + /** Pre-decode instruction memory into a macro-op of `Instruction` * instances. Returns the number of bytes consumed to produce it (always 4), * and writes into the supplied macro-op vector. */ - uint8_t predecode(const void* ptr, uint8_t bytesAvailable, + uint8_t predecode(const uint8_t* ptr, uint16_t bytesAvailable, uint64_t instructionAddress, MacroOp& output, std::string& disasm) const override; - /** Returns an Armv9.2-a register file structure description. */ - std::vector getRegisterFileStructures() const override; - /** Returns a zero-indexed register tag for a system register encoding. * Returns -1 in the case that the system register has no mapping. */ int32_t getSystemRegisterTag(uint16_t reg) const override; - /** Returns the number of system registers that have a mapping. */ - uint16_t getNumSystemRegisters() const override; - /** Create an exception handler for the exception generated by `instruction`, * providing the core model object and a reference to process memory. * Returns a smart pointer to an `ExceptionHandler` which may be ticked until * the exception is resolved, and results then obtained. */ std::shared_ptr handleException( const std::shared_ptr& instruction, const Core& core, - MemoryInterface& memory) const override; + memory::MemoryInterface& memory) const override; /** Retrieve the initial process state. */ ProcessStateChange getInitialState() const override; @@ -51,36 +47,35 @@ class Architecture : public arch::Architecture { /** Returns the maximum size of a valid instruction in bytes. */ uint8_t getMaxInstructionSize() const override; - /** Returns the current vector length set by the provided configuration. */ - uint64_t getVectorLength() const; - - /** Returns the current streaming vector length set by the provided - * configuration. 
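In the aarch64 `ArchInfo` constructor, ZA is modelled as a block of row registers: `zaSize_` is the Streaming-Vector-Length divided by 8, the architectural matrix fileset holds `zaSize_` rows, and the configured `SME-Matrix-Count` is multiplied by `zaSize_` to convert whole-ZA counts into rows-in-ZA counts. A worked sketch of that arithmetic follows, using an SVL of 512 bits and an SME-Matrix-Count of 2 as example inputs rather than asserted defaults.

```cpp
#include <cstdint>
#include <iostream>

int main() {
  uint16_t svlBits = 512;       // Core: Streaming-Vector-Length (bits), example value
  uint16_t smeMatrixCount = 2;  // Register-Set: SME-Matrix-Count (whole-ZA), example value

  // zaSize_ in the constructor: SVL / 8, i.e. the number of row registers
  // used to represent one ZA register.
  uint16_t zaRows = svlBits / 8;

  // Physical ZA registers are counted per row, so whole-ZA counts are scaled.
  uint16_t physicalZaRowRegs = smeMatrixCount * zaRows;

  std::cout << "Rows per ZA register: " << zaRows << "\n";            // 64
  std::cout << "Physical ZA row registers: " << physicalZaRowRegs
            << "\n";                                                  // 128
  return 0;
}
```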
*/ - uint64_t getStreamingVectorLength() const; + /** Returns the minimum size of a valid instruction in bytes. */ + uint8_t getMinInstructionSize() const override; /** Updates System registers of any system-based timers. */ void updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations) const override; - /** Returns the physical register structure as defined within the config file - */ - std::vector getConfigPhysicalRegisterStructure( - YAML::Node config) const override; - - /** Returns the physical register quantities as defined within the config file - */ - std::vector getConfigPhysicalRegisterQuantities( - YAML::Node config) const override; - /** Retrieve an ExecutionInfo object for the requested instruction. If a * opcode-based override has been defined for the latency and/or * port information, return that instead of the group-defined execution * information. */ - ExecutionInfo getExecutionInfo(Instruction& insn) const; + virtual ExecutionInfo getExecutionInfo(const Instruction& insn) const; + + /** Returns the current vector length set by the provided configuration. */ + uint64_t getVectorLength() const; + + /** Returns the current streaming vector length set by the provided + * configuration. */ + uint64_t getStreamingVectorLength() const; /** Returns the current value of SVCRval_. */ uint64_t getSVCRval() const; + /** Returns if SVE Streaming Mode is enabled. */ + bool isStreamingModeEnabled() const; + + /** Returns if the SME ZA Register is enabled. */ + bool isZARegisterEnabled() const; + /** Update the value of SVCRval_. */ void setSVCRval(const uint64_t newVal) const; @@ -88,31 +83,14 @@ class Architecture : public arch::Architecture { /** A decoding cache, mapping an instruction word to a previously decoded * instruction. Instructions are added to the cache as they're decoded, to * reduce the overhead of future decoding. */ - static std::unordered_map decodeCache; + mutable std::unordered_map decodeCache_; + + mutable std::unordered_map disasmCache; + /** A decoding metadata cache, mapping an instruction word to a previously * decoded instruction metadata bundle. Metadata is added to the cache as it's * decoded, to reduce the overhead of future decoding. */ - static std::forward_list metadataCache; - - /** A copy of the value of the SVCR system register. */ - static uint64_t SVCRval_; - - /** A mapping from system register encoding to a zero-indexed tag. */ - std::unordered_map systemRegisterMap_; - - /** A map to hold the relationship between aarch64 instruction groups and - * user-defined execution information. */ - std::unordered_map groupExecutionInfo_; - - /** A map to hold the relationship between aarch64 instruction opcode and - * user-defined execution information. */ - std::unordered_map opcodeExecutionInfo_; - - /** A Capstone decoding library handle, for decoding instructions. */ - csh capstoneHandle; - - /** A reference to a Linux kernel object to forward syscalls to. */ - kernel::Linux& linux_; + mutable std::forward_list metadataCache_; /** A reference to a micro decoder object to split macro operations. */ std::unique_ptr microDecoder_; @@ -123,6 +101,9 @@ class Architecture : public arch::Architecture { /** The streaming vector length used by the SME extension in bits. */ uint64_t SVL_; + /** A copy of the value of the SVCR system register. */ + mutable uint64_t SVCRval_ = 0; + /** System Register of Virtual Counter Timer. 
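`getExecutionInfo()` is documented to prefer an opcode-based override over the group-defined execution information. The sketch below illustrates that precedence using the `ExecutionInfo` struct introduced earlier in this diff; it replaces the whole entry when an opcode match exists, whereas the real implementation may merge latency and port overrides individually, and every group ID, opcode, and value shown is illustrative.

```cpp
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

// Local copy of the ExecutionInfo struct added to simeng/Instruction.hh.
struct ExecutionInfo {
  uint16_t latency = 1;
  uint16_t stallCycles = 1;
  std::vector<uint16_t> ports = {};
};

ExecutionInfo getExecutionInfo(
    uint16_t group, uint16_t opcode,
    const std::unordered_map<uint16_t, ExecutionInfo>& groupInfo,
    const std::unordered_map<uint16_t, ExecutionInfo>& opcodeInfo) {
  ExecutionInfo info;  // defaults: latency 1, stall 1, no ports
  // Start from the group-defined execution information, if any.
  auto g = groupInfo.find(group);
  if (g != groupInfo.end()) info = g->second;
  // An opcode-keyed entry takes precedence over the group-defined values.
  auto o = opcodeInfo.find(opcode);
  if (o != opcodeInfo.end()) info = o->second;
  return info;
}

int main() {
  std::unordered_map<uint16_t, ExecutionInfo> groupInfo;
  groupInfo[3] = {2, 1, {0, 1}};  // hypothetical group 3: latency 2, ports 0,1
  std::unordered_map<uint16_t, ExecutionInfo> opcodeInfo;
  opcodeInfo[1234] = {6, 3, {2}};  // hypothetical opcode override

  std::cout << getExecutionInfo(3, 99, groupInfo, opcodeInfo).latency << "\n";    // 2
  std::cout << getExecutionInfo(3, 1234, groupInfo, opcodeInfo).latency << "\n";  // 6
  return 0;
}
```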
*/ simeng::Register VCTreg_; diff --git a/src/include/simeng/arch/aarch64/ExceptionHandler.hh b/src/include/simeng/arch/aarch64/ExceptionHandler.hh index 3e59bc58eb..5947462bde 100644 --- a/src/include/simeng/arch/aarch64/ExceptionHandler.hh +++ b/src/include/simeng/arch/aarch64/ExceptionHandler.hh @@ -16,7 +16,7 @@ class ExceptionHandler : public simeng::arch::ExceptionHandler { /** Create an exception handler with references to the instruction that caused * the exception, along with the core model object and process memory. */ ExceptionHandler(const std::shared_ptr& instruction, - const Core& core, MemoryInterface& memory, + const Core& core, memory::MemoryInterface& memory, kernel::Linux& linux); /** Progress handling of the exception, by calling and returning the result of @@ -59,7 +59,7 @@ class ExceptionHandler : public simeng::arch::ExceptionHandler { bool firstCall = true); /** A data buffer used for reading data from memory. */ - std::vector dataBuffer; + std::vector dataBuffer_; /** Performs a readlinkat syscall using the path supplied. */ void readLinkAt(span path); @@ -75,10 +75,10 @@ class ExceptionHandler : public simeng::arch::ExceptionHandler { const Instruction& instruction_; /** The core model object. */ - const Core& core; + const Core& core_; /** The process memory. */ - MemoryInterface& memory_; + memory::MemoryInterface& memory_; /** The Linux kernel to forward syscalls to. */ kernel::Linux& linux_; @@ -96,6 +96,16 @@ class ExceptionHandler : public simeng::arch::ExceptionHandler { static constexpr Register R3 = {RegisterType::GENERAL, 3}; static constexpr Register R4 = {RegisterType::GENERAL, 4}; static constexpr Register R5 = {RegisterType::GENERAL, 5}; + + /** Let the following ExceptionHandlerTest derived classes be a friend of this + * class to allow proper testing of `readStringThen()`, `readBufferThen()` and + * `printException()` functions. */ + friend class AArch64ExceptionHandlerTest_readStringThen_Test; + friend class AArch64ExceptionHandlerTest_readStringThen_maxLen0_Test; + friend class AArch64ExceptionHandlerTest_readStringThen_maxLenReached_Test; + friend class AArch64ExceptionHandlerTest_readBufferThen_Test; + friend class AArch64ExceptionHandlerTest_readBufferThen_length0_Test; + friend class AArch64ExceptionHandlerTest_printException_Test; }; } // namespace aarch64 diff --git a/src/include/simeng/arch/aarch64/Instruction.hh b/src/include/simeng/arch/aarch64/Instruction.hh index 5d747860cd..d510c1f373 100644 --- a/src/include/simeng/arch/aarch64/Instruction.hh +++ b/src/include/simeng/arch/aarch64/Instruction.hh @@ -3,152 +3,242 @@ #include #include -#include "simeng/BranchPredictor.hh" #include "simeng/Instruction.hh" #include "simeng/arch/aarch64/InstructionGroups.hh" +#include "simeng/arch/aarch64/operandContainer.hh" +#include "simeng/branchpredictors/BranchPredictor.hh" -struct cs_arm64_op; +struct cs_aarch64_op; namespace simeng { namespace arch { namespace aarch64 { -/** Apply the shift specified by `shiftType` to the unsigned integer `value`, - * shifting by `amount`. */ -template -std::enable_if_t && std::is_unsigned_v, T> shiftValue( - T value, uint8_t shiftType, uint8_t amount) { - switch (shiftType) { - case ARM64_SFT_LSL: - return value << amount; - case ARM64_SFT_LSR: - return value >> amount; - case ARM64_SFT_ASR: - return static_cast>(value) >> amount; - case ARM64_SFT_ROR: { - // Assuming sizeof(T) is a power of 2. 
- const auto mask = sizeof(T) * 8 - 1; - assert((amount <= mask) && "Rotate amount exceeds type width"); - amount &= mask; - return (value >> amount) | (value << ((-amount) & mask)); - } - case ARM64_SFT_MSL: { - // pad in with ones instead of zeros - const auto mask = (1 << amount) - 1; - return (value << amount) | mask; +class Architecture; +struct InstructionMetadata; + +// operandContainer type aliases - used to improve readability of source and +// destination operand containers. +using srcRegContainer = operandContainer; +using srcValContainer = operandContainer; +using destRegContainer = operandContainer; +using destValContainer = + operandContainer; + +namespace RegisterType { +/** The 64-bit general purpose register set: [w|x]0-31. */ +const uint8_t GENERAL = 0; +/** The 128|2048 bit vector register set: [v|z]0-31. */ +const uint8_t VECTOR = 1; +/** The 32 bit predicate register set: p0-15. */ +const uint8_t PREDICATE = 2; +/** The 4-bit NZCV condition flag register. */ +const uint8_t NZCV = 3; +/** The system registers. */ +const uint8_t SYSTEM = 4; +/** The [256-byte x (SVL / 8)] SME matrix register za. */ +const uint8_t MATRIX = 5; +/** The fixed width (512-bit) SME ZT0 table register. */ +const uint8_t TABLE = 6; + +/** A special register value representing the zero register. */ +const Register ZERO_REGISTER = {GENERAL, (uint16_t)-1}; +} // namespace RegisterType + +/** The various exceptions that can be raised by an individual instruction. */ +enum class InstructionException { + None = 0, + EncodingUnallocated, + ExecutionNotYetImplemented, + MisalignedPC, + DataAbort, + SupervisorCall, + HypervisorCall, + SecureMonitorCall, + NoAvailablePort, + UnmappedSysReg, + StreamingModeUpdate, + ZAregisterStatusUpdate, + SMZAUpdate, + ZAdisabled, + SMdisabled +}; + +/** The opcodes of simeng aarch64 micro-operations. */ +namespace MicroOpcode { +const uint8_t OFFSET_IMM = 0; +const uint8_t OFFSET_REG = 1; +const uint8_t LDR_ADDR = 2; +const uint8_t STR_ADDR = 3; +const uint8_t STR_DATA = 4; +// INVALID is the default value reserved for non-micro-operation instructions +const uint8_t INVALID = 255; +} // namespace MicroOpcode + +/** A struct to group micro-operation information together. */ +struct MicroOpInfo { + bool isMicroOp = false; + uint8_t microOpcode = MicroOpcode::INVALID; + uint8_t dataSize = 0; + bool isLastMicroOp = true; + int microOpIndex = 0; +}; + +/** Get the size of the data to be accessed from/to memory. */ +inline uint8_t getDataSize(cs_aarch64_op op) { + // No V-register enum identifiers exist. Instead, depending on whether a full + // or half vector is accessed, a Q or D register is used instead. + // A `is_vreg` bool in `op` defines if we are using v-vector registers. 
+ if (op.is_vreg && ((AARCH64_REG_D0 <= op.reg && op.reg <= AARCH64_REG_D31) || + (AARCH64_REG_Q0 <= op.reg && op.reg <= AARCH64_REG_Q31))) { + AArch64Layout_VectorLayout vas = op.vas; + assert(vas != AARCH64LAYOUT_INVALID && "Invalid VAS type"); + switch (vas) { + case AARCH64LAYOUT_VL_16B: + case AARCH64LAYOUT_VL_8H: + case AARCH64LAYOUT_VL_4S: + case AARCH64LAYOUT_VL_2D: + case AARCH64LAYOUT_VL_1Q: + case AARCH64LAYOUT_VL_Q: + return 16; + case AARCH64LAYOUT_VL_8B: + case AARCH64LAYOUT_VL_4H: + case AARCH64LAYOUT_VL_2S: + case AARCH64LAYOUT_VL_1D: + case AARCH64LAYOUT_VL_D: + return 8; + case AARCH64LAYOUT_VL_4B: + case AARCH64LAYOUT_VL_2H: + case AARCH64LAYOUT_VL_1S: + case AARCH64LAYOUT_VL_S: + return 4; + case AARCH64LAYOUT_VL_H: + return 2; + case AARCH64LAYOUT_VL_B: + return 1; + default: + std::cerr << "[SimEng] Cannot determine size of Arm V vector register " + "elements with `reg` value " + << op.reg << " and `vas` value of " << vas << ". Exiting..." + << std::endl; + exit(1); + break; } - case ARM64_SFT_INVALID: - return value; - default: - assert(false && "Unknown shift type"); - return 0; } -} -/** Get the size of the data to be accessed from/to memory. */ -inline uint8_t getDataSize(cs_arm64_op op) { - // Check from top of the range downwards + // SME ZA Tiles, SVE Z registers, and SVE P predicates also have Vector + // Arrangement Specifier set + /** TODO: When SME, SVE instruction splitting is supported / implemented, + * update the data size returned based on VAS. */ - // ARM64_REG_V0 -> {end} are vector registers - if (op.reg >= ARM64_REG_V0) { - // Data size for vector registers relies on opcode thus return 0 + // Work top down through register enums (highest value -> lowest value) + if (op.reg >= AARCH64_REG_D0_D1) { + // Multi-register currently not supported. Return 0. 
return 0; } - // ARM64_REG_ZAB0 -> +31 are tiles of the matrix register (ZA) - if (op.reg >= ARM64_REG_ZAB0 || op.reg == ARM64_REG_ZA) { + // AARCH64_REG_ZAB0 -> +31 are tiles of the matrix register (ZA) + // AARCH64_REG_ZT0 is new 512-bit register from SME2 + if (op.reg >= AARCH64_REG_ZAB0 || op.reg == AARCH64_REG_ZA || + op.reg == AARCH64_REG_ZT0) { // Data size for tile registers relies on opcode thus return 0 return 0; } - // ARM64_REG_Z0 -> +31 are scalable vector registers (Z) - if (op.reg >= ARM64_REG_Z0) { + // AARCH64_REG_Z0 -> +31 are scalable vector registers (Z) + if (op.reg >= AARCH64_REG_Z0) { // Data size for vector registers relies on opcode thus return 0 return 0; } - // ARM64_REG_X0 -> +28 are 64-bit (X) registers - if (op.reg >= ARM64_REG_X0) { + // AARCH64_REG_X0 -> +28 are 64-bit (X) registers + if (op.reg >= AARCH64_REG_X0) { return 8; } - // ARM64_REG_W0 -> +30 are 32-bit (W) registers - if (op.reg >= ARM64_REG_W0) { + // AARCH64_REG_W0 -> +30 are 32-bit (W) registers + if (op.reg >= AARCH64_REG_W0) { return 4; } - // ARM64_REG_S0 -> +31 are 32-bit arranged (S) neon registers - if (op.reg >= ARM64_REG_S0) { + // AARCH64_REG_S0 -> +31 are 32-bit arranged (S) neon registers + if (op.reg >= AARCH64_REG_S0) { return 4; } - // ARM64_REG_Q0 -> +31 are 128-bit arranged (Q) neon registers - if (op.reg >= ARM64_REG_Q0) { + // AARCH64_REG_Q0 -> +31 are 128-bit arranged (Q) neon registers + if (op.reg >= AARCH64_REG_Q0) { return 16; } - // ARM64_REG_P0 -> +15 are 256-bit (P) registers - if (op.reg >= ARM64_REG_P0) { - return 1; + // ARCH64_REG_PN0 -> +15 are 256-bit (P) registers + if (op.reg >= AARCH64_REG_PN0) { + return 32; } - // ARM64_REG_H0 -> +31 are 16-bit arranged (H) neon registers - if (op.reg >= ARM64_REG_H0) { + // AARCH64_REG_P0 -> +15 are 256-bit (P) registers + if (op.reg >= AARCH64_REG_P0) { + return 32; + } + + // AARCH64_REG_H0 -> +31 are 16-bit arranged (H) neon registers + if (op.reg >= AARCH64_REG_H0) { return 2; } - // ARM64_REG_D0 -> +31 are 64-bit arranged (D) neon registers - if (op.reg >= ARM64_REG_D0) { + // AARCH64_REG_D0 -> +31 are 64-bit arranged (D) neon registers + if (op.reg >= AARCH64_REG_D0) { return 8; } - // ARM64_REG_B0 -> +31 are 8-bit arranged (B) neon registers - if (op.reg >= ARM64_REG_B0) { + // AARCH64_REG_B0 -> +31 are 8-bit arranged (B) neon registers + if (op.reg >= AARCH64_REG_B0) { return 1; } - // ARM64_REG_XZR is the 64-bit zero register - if (op.reg == ARM64_REG_XZR) { + // AARCH64_REG_XZR is the 64-bit zero register + if (op.reg == AARCH64_REG_XZR) { return 8; } - // ARM64_REG_WZR is the 32-bit zero register - if (op.reg == ARM64_REG_WZR) { + // AARCH64_REG_WZR is the 32-bit zero register + if (op.reg == AARCH64_REG_WZR) { return 4; } - // ARM64_REG_WSP (w31) is the 32-bit stack pointer register - if (op.reg == ARM64_REG_WSP) { + // AARCH64_REG_WSP (w31) is the 32-bit stack pointer register + if (op.reg == AARCH64_REG_WSP) { return 4; } - // ARM64_REG_SP (x31) is the 64-bit stack pointer register - if (op.reg == ARM64_REG_SP) { + // AARCH64_REG_SP (x31) is the 64-bit stack pointer register + if (op.reg == AARCH64_REG_SP) { return 8; } - // ARM64_REG_NZCV is the NZCV flag register - if (op.reg == ARM64_REG_NZCV) { + // AARCH64_REG_NZCV is the NZCV flag register + if (op.reg == AARCH64_REG_NZCV) { return 1; } - // ARM64_REG_X30 is the 64-bit link register - if (op.reg == ARM64_REG_X30) { + // AARCH64_REG_X30 is the 64-bit link register + if (op.reg == AARCH64_REG_X30) { return 8; } - // ARM64_REG_X29 is the 64-bit frame 
pointer - if (op.reg == ARM64_REG_X29) { + // AARCH64_REG_X29 is the 64-bit frame pointer + if (op.reg == AARCH64_REG_X29) { return 8; } - // ARM64_REG_FFR (p15) is a special purpose predicate register - if (op.reg == ARM64_REG_FFR) { + // AARCH64_REG_FFR (p15) is a special purpose predicate register + if (op.reg == AARCH64_REG_FFR) { return 1; } - // ARM64_REG_INVALID is an invalid capstone register so return 0 bytes as size - if (op.reg == ARM64_REG_INVALID) { + // AARCH64_REG_INVALID is an invalid capstone register so return 0 bytes as + // size + if (op.reg == AARCH64_REG_INVALID) { return 0; } @@ -156,75 +246,41 @@ inline uint8_t getDataSize(cs_arm64_op op) { return 0; } -class Architecture; -struct InstructionMetadata; - -namespace RegisterType { -/** The 64-bit general purpose register set: [w|x]0-31. */ -const uint8_t GENERAL = 0; -/** The 128|2048 bit vector register set: [v|z]0-31. */ -const uint8_t VECTOR = 1; -/** The 32 bit predicate register set: p0-15. */ -const uint8_t PREDICATE = 2; -/** The 4-bit NZCV condition flag register. */ -const uint8_t NZCV = 3; -/** The system registers. */ -const uint8_t SYSTEM = 4; -/** The [256-byte x (SVL / 8)] SME matrix register za. */ -const uint8_t MATRIX = 5; -} // namespace RegisterType - -/** A struct holding user-defined execution information for a aarch64 - * instruction. */ -struct ExecutionInfo { - /** The latency for the instruction. */ - uint16_t latency = 1; - - /** The execution throughput for the instruction. */ - uint16_t stallCycles = 1; - - /** The ports that support the instruction. */ - std::vector ports = {}; -}; - -/** The various exceptions that can be raised by an individual instruction. */ -enum class InstructionException { - None = 0, - EncodingUnallocated, - EncodingNotYetImplemented, - ExecutionNotYetImplemented, - MisalignedPC, - DataAbort, - SupervisorCall, - HypervisorCall, - SecureMonitorCall, - NoAvailablePort, - UnmappedSysReg, - StreamingModeUpdate, - ZAregisterStatusUpdate, - SMZAUpdate, - ZAdisabled, - SMdisabled -}; - -/** The opcodes of simeng aarch64 micro-operations. */ -namespace MicroOpcode { -const uint8_t OFFSET_IMM = 0; -const uint8_t OFFSET_REG = 1; -const uint8_t LDR_ADDR = 2; -const uint8_t STR_ADDR = 3; -const uint8_t STR_DATA = 4; -// INVALID is the default value reserved for non-micro-operation instructions -const uint8_t INVALID = 255; -} // namespace MicroOpcode - -/** A struct to group micro-operation information together. */ -struct MicroOpInfo { - bool isMicroOp = false; - uint8_t microOpcode = MicroOpcode::INVALID; - uint8_t dataSize = 0; - bool isLastMicroOp = true; - int microOpIndex = 0; +// AArch64 Instruction Identifier Masks +enum class InsnType : uint32_t { + /** Writes scalar values to one or more registers and/or memory locations. */ + isScalarData = 1 << 0, + /** Writes NEON vector values to one or more registers and/or memory + locations. */ + isVectorData = 1 << 1, + /** Writes SVE vector values to one or more registers and/or memory locations. + */ + isSVEData = 1 << 2, + /** Writes SME matrix values to one or more registers and/or memory locations. + */ + isSMEData = 1 << 3, + /** Has a shift operand. */ + isShift = 1 << 4, + /** Is a logical operation. */ + isLogical = 1 << 5, + /** Is a compare operation. */ + isCompare = 1 << 6, + /** Is a convert operation. */ + isConvert = 1 << 7, + /** Is a multiply operation. 
*/ + isMultiply = 1 << 8, + /** Is a divide or square root operation */ + isDivideOrSqrt = 1 << 9, + /** Writes to a predicate register */ + isPredicate = 1 << 10, + /** Is a load operation. */ + isLoad = 1 << 11, + /** Is a store address operation. */ + isStoreAddress = 1 << 12, + /** Is a store data operation. */ + isStoreData = 1 << 13, + /** Is a branch operation. */ + isBranch = 1 << 14 }; /** A basic Armv9.2-a implementation of the `Instruction` interface. */ @@ -241,47 +297,40 @@ class Instruction : public simeng::Instruction { const InstructionMetadata& metadata, InstructionException exception); - /** Retrieve the identifier for the first exception that occurred during - * processing this instruction. */ - virtual InstructionException getException() const; - /** Retrieve the source registers this instruction reads. */ - const span getOperandRegisters() const override; + const span getSourceRegisters() const override; + + /** Retrieve the data contained in the source registers this instruction + * reads.*/ + const span getSourceOperands() const override; /** Retrieve the destination registers this instruction will write to. * A register value of -1 signifies a Zero Register read, and should not be * renamed. */ const span getDestinationRegisters() const override; - /** Check whether the operand at index `i` has had a value supplied. */ - bool isOperandReady(int index) const override; - /** Override the specified source register with a renamed physical register. */ - void renameSource(uint8_t i, Register renamed) override; + void renameSource(uint16_t i, Register renamed) override; /** Override the specified destination register with a renamed physical * register. */ - void renameDestination(uint8_t i, Register renamed) override; + void renameDestination(uint16_t i, Register renamed) override; /** Provide a value for the operand at the specified index. */ - virtual void supplyOperand(uint8_t i, const RegisterValue& value) override; + void supplyOperand(uint16_t i, const RegisterValue& value) override; - /** Check whether all operand values have been supplied, and the instruction - * is ready to execute. */ - bool canExecute() const override; - - /** Execute the instruction. */ - void execute() override; + /** Check whether the operand at index `i` has had a value supplied. */ + bool isOperandReady(int index) const override; /** Retrieve register results. */ const span getResults() const override; /** Generate memory addresses this instruction wishes to access. */ - span generateAddresses() override; + span generateAddresses() override; /** Retrieve previously generated memory addresses. */ - span getGeneratedAddresses() const override; + span getGeneratedAddresses() const override; /** Provide data from a requested memory address. */ void supplyData(uint64_t address, const RegisterValue& data) override; @@ -289,11 +338,6 @@ class Instruction : public simeng::Instruction { /** Retrieve supplied memory data. */ span getData() const override; - /** Early misprediction check; see if it's possible to determine whether the - * next instruction address was mispredicted without executing the - * instruction. */ - std::tuple checkEarlyBranchMisprediction() const override; - /** Retrieve branch type. */ BranchType getBranchType() const override; @@ -317,160 +361,111 @@ class Instruction : public simeng::Instruction { /** Retrieve the instruction group this instruction belongs to. 
*/ uint16_t getGroup() const override; - /** Set this instruction's execution information including it's execution - * latency and throughput, and the set of ports which support it. */ - void setExecutionInfo(const ExecutionInfo& info); + /** Check whether all operand values have been supplied, and the instruction + * is ready to execute. */ + bool canExecute() const override; + + /** Execute the instruction. */ + void execute() override; /** Get this instruction's supported set of ports. */ const std::vector& getSupportedPorts() override; + /** Set this instruction's execution information including it's execution + * latency and throughput, and the set of ports which support it. */ + void setExecutionInfo(const ExecutionInfo& info) override; + /** Retrieve the instruction's metadata. */ const InstructionMetadata& getMetadata() const; /** Retrieve the instruction's associated architecture. */ const Architecture& getArchitecture() const; - /** A special register value representing the zero register. If passed to - * `setSourceRegisters`/`setDestinationRegisters`, the value will be - * automatically supplied as zero. */ - static const Register ZERO_REGISTER; + /** Retrieve the identifier for the first exception that occurred during + * processing this instruction. */ + InstructionException getException() const; private: - /** A reference to the ISA instance this instruction belongs to. */ - const Architecture& architecture_; - - /** A reference to the decoding metadata for this instruction. */ - const InstructionMetadata& metadata; - - /** A vector of source registers. */ - std::vector sourceRegisters; - /** The number of source registers this instruction reads from. */ - uint16_t sourceRegisterCount = 0; - - /** A vector of destination registers. */ - std::vector destinationRegisters; - /** The number of destination registers this instruction writes to. */ - uint16_t destinationRegisterCount = 0; - - /** A vector of provided operand values. Each entry corresponds to a - * `sourceRegisters` entry. */ - std::vector operands; - - /** A vector of generated output results. Each entry corresponds to a - * `destinationRegisters` entry. */ - std::vector results; - - /** The current exception state of this instruction. */ - InstructionException exception_ = InstructionException::None; - - // Decoding /** Process the instruction's metadata to determine source/destination * registers. */ void decode(); - /** Set the source registers of the instruction, and create a corresponding - * operands vector. Zero register references will be pre-supplied with a value - * of 0. */ - void setSourceRegisters(const std::vector& registers); - - /** Set the destination registers for the instruction, and create a - * corresponding results vector. */ - void setDestinationRegisters(const std::vector& registers); + /** Update the instruction's identifier with an additional field. */ + constexpr void setInstructionType(InsnType identifier) { + instructionIdentifier_ |= + static_cast>(identifier); + } - // Scheduling - /** The number of operands that have not yet had values supplied. Used to - * determine execution readiness. */ - short operandsPending = 0; + /** Tests whether this instruction has the given identifier set. */ + constexpr bool isInstruction(InsnType identifier) const { + return (instructionIdentifier_ & + static_cast>(identifier)); + } - // Execution /** Generate an ExecutionNotYetImplemented exception. */ void executionNYI(); - // Execution /** Generate an EncodingUnallocated exception. 
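The old one-boolean-per-property identifiers are replaced by the `InsnType` bit masks together with the `setInstructionType()` and `isInstruction()` helpers, which OR bits into, and test bits of, a single `instructionIdentifier_` word. A self-contained sketch of that scheme follows, reproducing a handful of the flag values from the enum above.

```cpp
#include <cstdint>
#include <iostream>

// A subset of the InsnType flags, with the bit positions used in this diff.
enum class InsnType : uint32_t {
  isScalarData = 1 << 0,
  isLoad = 1 << 11,
  isStoreAddress = 1 << 12,
  isBranch = 1 << 14
};

class InsnSketch {
 public:
  // OR an additional identifier bit into the packed identifier word.
  constexpr void setInstructionType(InsnType identifier) {
    identifier_ |= static_cast<uint32_t>(identifier);
  }
  // Test whether a given identifier bit has been set.
  constexpr bool isInstruction(InsnType identifier) const {
    return identifier_ & static_cast<uint32_t>(identifier);
  }

 private:
  uint32_t identifier_ = 0;
};

int main() {
  InsnSketch insn;
  insn.setInstructionType(InsnType::isLoad);
  insn.setInstructionType(InsnType::isScalarData);
  std::cout << insn.isInstruction(InsnType::isLoad) << "\n";    // 1
  std::cout << insn.isInstruction(InsnType::isBranch) << "\n";  // 0
  return 0;
}
```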
*/ void executionINV(); - // Execution /** Generate an StreamingModeUpdate exception. */ void streamingModeUpdated(); - // Execution /** Generate an ZAregisterStatusUpdate exception. */ void zaRegisterStatusUpdated(); - // Execution /** Generate an SMZAupdate exception. */ void SMZAupdated(); - // Execution /** Generate a ZAdisabled exception. */ void ZAdisabled(); - // Execution /** Generate a SMdisabled exception. */ void SMdisabled(); - // Instruction Identifiers - /** Operates on scalar values */ - bool isScalarData_ = false; - /** Operates on vector values. */ - bool isVectorData_ = false; - /** Uses Z registers as source and/or destination operands. */ - bool isSVEData_ = false; - /** Uses ZA register or tiles of ZA as destination. */ - bool isSMEData_ = false; - /** Doesn't have a shift operand. */ - bool isNoShift_ = true; - /** Is a logical operation. */ - bool isLogical_ = false; - /** Is a compare operation. */ - bool isCompare_ = false; - /** Is a convert operation. */ - bool isConvert_ = false; - /** Is a multiply operation. */ - bool isMultiply_ = false; - /** Is a divide or square root operation */ - bool isDivideOrSqrt_ = false; - /** Writes to a predicate register */ - bool isPredicate_ = false; - /** Is a load operation. */ - bool isLoad_ = false; - /** Is a store address operation. */ - bool isStoreAddress_ = false; - /** Is a store data operation. */ - bool isStoreData_ = false; - /** Is a branch operation. */ - bool isBranch_ = false; - /** Is the micro-operation opcode of the instruction, where appropriate. */ - uint8_t microOpcode_ = MicroOpcode::INVALID; - /** Is the micro-operation opcode of the instruction, where appropriate. */ - uint8_t dataSize_ = 0; + /** A reference to the ISA instance this instruction belongs to. */ + const Architecture& architecture_; + + /** A reference to the decoding metadata for this instruction. */ + const InstructionMetadata& metadata_; + + /** An operandContainer of source registers. */ + srcRegContainer sourceRegisters_; - // Memory - /** Set the accessed memory addresses, and create a corresponding memory data - * vector. */ - void setMemoryAddresses(const std::vector& addresses); + /** The number of source registers this instruction reads from. */ + uint16_t sourceRegisterCount_ = 0; - void setMemoryAddresses(std::vector&& addresses); + /** An operandContainer of destination registers. */ + destRegContainer destinationRegisters_; - void setMemoryAddresses(MemoryAccessTarget address); + /** The number of destination registers this instruction writes to. */ + uint16_t destinationRegisterCount_ = 0; - /** The memory addresses this instruction accesses, as a vector of {offset, - * width} pairs. */ - std::vector memoryAddresses; + /** An operandContainer of provided operand values. Each entry corresponds to + * a `sourceRegisters` entry. */ + srcValContainer sourceValues_; - /** A vector of memory values, that were either loaded memory, or are prepared - * for sending to memory (according to instruction type). Each entry - * corresponds to a `memoryAddresses` entry. */ - std::vector memoryData; + /** An operandContainer of generated output results. Each entry corresponds to + * a `destinationRegisters` entry. */ + destValContainer results_; + + /** The current exception state of this instruction. 
*/ + InstructionException exception_ = InstructionException::None; - // Execution helpers - /** Extend `value` according to `extendType`, and left-shift the result by - * `shift` */ - uint64_t extendValue(uint64_t value, uint8_t extendType, uint8_t shift) const; + /** The number of source operands that have not yet had values supplied. Used + * to determine execution readiness. */ + uint16_t sourceOperandsPending_ = 0; + + /** Is the micro-operation opcode of the instruction, where appropriate. */ + uint8_t microOpcode_ = MicroOpcode::INVALID; + + /** Is the micro-operation opcode of the instruction, where appropriate. */ + uint8_t dataSize_ = 0; - /** Extend `value` using extension/shifting rules defined in `op`. */ - uint64_t extendOffset(uint64_t value, const cs_arm64_op& op) const; + /** Used to denote what type of instruction this is. Utilises the constants in + * the `InsnType` namespace allowing each bit to represent a unique + * identifier such as `isLoad` or `isMultiply` etc. */ + uint32_t instructionIdentifier_ = 0; }; } // namespace aarch64 diff --git a/src/include/simeng/arch/aarch64/InstructionGroups.hh b/src/include/simeng/arch/aarch64/InstructionGroups.hh index fd8e06c1a2..fc15e95230 100644 --- a/src/include/simeng/arch/aarch64/InstructionGroups.hh +++ b/src/include/simeng/arch/aarch64/InstructionGroups.hh @@ -4,7 +4,33 @@ namespace simeng { namespace arch { namespace aarch64 { -/** The IDs of the instruction groups for AArch64 instructions. */ +/** The IDs of the instruction groups for AArch64 instructions. + * Each new group must contain 14 entries to ensure correct group assignment and + * general functionality. + * Their order must be as follows: + * - BASE + * - BASE_SIMPLE + * - BASE_SIMPLE_ARTH + * - BASE_SIMPLE_ARTH_NOSHIFT + * - BASE_SIMPLE_LOGICAL + * - BASE_SIMPLE_LOGICAL_NOSHIFT + * - BASE_SIMPLE_CMP + * - BASE_SIMPLE_CVT + * - BASE_MUL + * - BASE_DIV_OR_SQRT + * - LOAD_BASE + * - STORE_ADDRESS_BASE + * - STORE_DATA_BASE + * - STORE_BASE + * + * An exception to the above is "Parent" groups which do not require the LOAD_* + * or STORE_* groups. + * "Parent" groups allow for easier grouping of similar groups that may have + * identical execution latencies, ports, etc. For example, FP is the parent + * group of SCALAR and VECTOR. + * In simulation, an instruction's allocated group will never be a "Parent" + * group; they are only used to simplify config file creation and management. + */ namespace InstructionGroups { const uint16_t INT = 0; const uint16_t INT_SIMPLE = 1; @@ -92,12 +118,19 @@ const uint16_t LOAD_SME = 82; const uint16_t STORE_ADDRESS_SME = 83; const uint16_t STORE_DATA_SME = 84; const uint16_t STORE_SME = 85; +const uint16_t ALL = 86; +const uint16_t NONE = 87; } // namespace InstructionGroups /** The number of aarch64 instruction groups. 
*/ -static constexpr uint8_t NUM_GROUPS = 86; +static constexpr uint8_t NUM_GROUPS = 88; -const std::unordered_map> groupInheritance = { +const std::unordered_map> groupInheritance_ = { + {InstructionGroups::ALL, + {InstructionGroups::INT, InstructionGroups::FP, InstructionGroups::SVE, + InstructionGroups::SME, InstructionGroups::PREDICATE, + InstructionGroups::LOAD, InstructionGroups::STORE, + InstructionGroups::BRANCH}}, {InstructionGroups::INT, {InstructionGroups::INT_SIMPLE, InstructionGroups::INT_DIV_OR_SQRT, InstructionGroups::INT_MUL}}, diff --git a/src/include/simeng/arch/aarch64/MicroDecoder.hh b/src/include/simeng/arch/aarch64/MicroDecoder.hh index 22f0cb89b4..6503d370e8 100644 --- a/src/include/simeng/arch/aarch64/MicroDecoder.hh +++ b/src/include/simeng/arch/aarch64/MicroDecoder.hh @@ -4,15 +4,14 @@ #include "simeng/arch/Architecture.hh" #include "simeng/arch/aarch64/Instruction.hh" -#include "yaml-cpp/yaml.h" namespace simeng { namespace arch { namespace aarch64 { -/** A struct to hold information to construct a default cs_arm64_op from. */ +/** A struct to hold information to construct a default cs_aarch64_op from. */ struct OpType { - arm64_op_type type; + aarch64_op_type type; bool isDestination = false; }; @@ -20,8 +19,9 @@ struct OpType { */ class MicroDecoder { public: - /** Construct a micro decoder for splitting relevant instructons. */ - MicroDecoder(YAML::Node config); + /** Construct a micro decoder for splitting relevant instructions. */ + MicroDecoder(ryml::ConstNodeRef config = config::SimInfo::getConfig()); + ~MicroDecoder(); /** From a macro-op, split into one or more micro-ops and populate passed @@ -30,65 +30,73 @@ class MicroDecoder { const Instruction& macroOp, MacroOp& output, csh capstoneHandle); + private: /** Detect if there's an overlap between the underlying hardware registers * (e.g. z5, v5, q5, d5, s5, h5, and b5). */ - bool detectOverlap(arm64_reg registerA, arm64_reg registerB); + bool detectOverlap(aarch64_reg registerA, aarch64_reg registerB); /** Create a default cs_detail object from a vector of operand types. */ cs_detail createDefaultDetail(std::vector opTypes); /** Create an address offset uop from a base register and an immediate. */ Instruction createImmOffsetUop(const Architecture& architecture, - arm64_reg base, int64_t offset, + aarch64_reg base, int64_t offset, + csh capstoneHandle, bool lastMicroOp = false, + int microOpIndex = 0); + + /** Create an address offset uop from a base register and a register. */ + Instruction createRegOffsetUop(const Architecture& architecture, + aarch64_reg base, aarch64_reg offset, csh capstoneHandle, bool lastMicroOp = false, int microOpIndex = 0); /** Create a load uop from a destination register and a capstone memory * operand. */ - Instruction createLdrUop(const Architecture& architecture, arm64_reg dest, - arm64_op_mem mem, csh capstoneHandle, + Instruction createLdrUop(const Architecture& architecture, aarch64_reg dest, + aarch64_op_mem mem, csh capstoneHandle, bool lastMicroOp = false, int microOpIndex = 0, uint8_t dataSize = 0); /** Create a store data uop from a source register. */ - Instruction createSDUop(const Architecture& architecture, arm64_reg src, + Instruction createSDUop(const Architecture& architecture, aarch64_reg src, csh capstoneHandle, bool lastMicroOp = false, int microOpIndex = 0); /** Create a store address uop from a capstone memory * operand. 
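The new `ALL` entry and the renamed `groupInheritance_` map support "Parent" groups that are expanded into their leaf groups rather than being assigned to instructions directly. A hedged sketch of how such a table can be flattened; the group IDs and map contents below are an illustrative subset, not the full SimEng table:

```cpp
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <unordered_set>
#include <vector>

// Toy group IDs (illustrative subset).
constexpr uint16_t INT = 0, INT_SIMPLE = 1, INT_MUL = 2, FP = 3, SCALAR = 4,
                   VECTOR = 5, ALL = 6;

// Toy inheritance table: parent group -> child groups.
const std::unordered_map<uint16_t, std::vector<uint16_t>> inheritance = {
    {ALL, {INT, FP}},
    {INT, {INT_SIMPLE, INT_MUL}},
    {FP, {SCALAR, VECTOR}},
};

// Expand a (possibly parent) group into the set of leaf groups it covers.
void expand(uint16_t group, std::unordered_set<uint16_t>& out) {
  auto it = inheritance.find(group);
  if (it == inheritance.end()) {
    out.insert(group);  // Leaf group: keep as-is.
    return;
  }
  for (uint16_t child : it->second) expand(child, out);
}

int main() {
  std::unordered_set<uint16_t> leaves;
  expand(ALL, leaves);
  std::cout << "ALL expands to " << leaves.size() << " leaf groups\n";  // 4
  return 0;
}
```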
*/ - Instruction createStrUop(const Architecture& architecture, arm64_op_mem mem, + Instruction createStrUop(const Architecture& architecture, aarch64_op_mem mem, csh capstoneHandle, bool lastMicroOp = false, int microOpIndex = 0, uint8_t dataSize = 0); - private: /** Flag to determine whether instruction splitting is enabled. */ - bool instructionSplit_; + const bool instructionSplit_; /** A micro-decoding cache, mapping an instruction word to a previously split * instruction. Instructions are added to the cache as they're split into - * their repsective micro-operations, to reduce the overhead of future + * their respective micro-operations, to reduce the overhead of future * splitting. */ static std::unordered_map> - microDecodeCache; + microDecodeCache_; /** A cache for newly created instruction metadata. Ensures metadata values * persist for a micro-operations' life cycle. */ - static std::forward_list microMetadataCache; + static std::forward_list microMetadataCache_; // Default objects /** Default capstone instruction structure. */ - cs_arm64 default_info = {ARM64_CC_INVALID, false, false, 0, {}}; + cs_aarch64 default_info = {AArch64CC_Invalid, false, false, false, 0, {}}; /** Default register. */ - cs_arm64_op default_op = {0, - ARM64_VAS_INVALID, - {ARM64_SFT_INVALID, 0}, - ARM64_EXT_INVALID, - ARM64_OP_INVALID, - ARM64_SVCR_INVALID, - {}, - CS_AC_READ}; + cs_aarch64_op default_op = {0, + AARCH64LAYOUT_INVALID, + {AARCH64_SFT_INVALID, 0}, + AARCH64_EXT_INVALID, + AARCH64_OP_INVALID, + false, + {}, + {}, + CS_AC_READ, + false}; /** Default capstone instruction detail. */ cs_detail default_detail = {{}, 0, {}, 0, {}, 0, {}}; diff --git a/src/include/simeng/arch/aarch64/helpers/arithmetic.hh b/src/include/simeng/arch/aarch64/helpers/arithmetic.hh index 13485c16fa..d137095234 100644 --- a/src/include/simeng/arch/aarch64/helpers/arithmetic.hh +++ b/src/include/simeng/arch/aarch64/helpers/arithmetic.hh @@ -5,204 +5,193 @@ namespace simeng { namespace arch { namespace aarch64 { -class arithmeticHelp { - public: - /** Helper function for instructions with the format `add rd, rn, rm`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns single value of type T. */ - template - static T add_3ops(std::vector& operands) { - const T n = operands[0].get(); - const T m = operands[1].get(); - return (n + m); - } - /** Helper function for instructions with the format `adc rd, rn, rm`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns tuple of [resulting value, nzcv]. */ - template - static std::tuple addCarry_3ops( - std::vector& operands) { - const uint8_t carry = operands[0].get() & 0b0010; - const T n = operands[1].get(); - const T m = operands[2].get(); - return AuxFunc::addWithCarry(n, m, carry); - } +/** Helper function for instructions with the format `add rd, rn, rm`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns single value of type T. */ +template +T add_3ops(srcValContainer& sourceValues) { + const T n = sourceValues[0].get(); + const T m = sourceValues[1].get(); + return (n + m); +} - /** Helper function for instructions with the format `add rd, rn, rm{, extend - * {#amount}}`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns tuple of [resulting value, nzcv]. 
*/ - template - static std::tuple addExtend_3ops( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - bool calcNZCV) { - const T n = operands[0].get(); - const T m = - AuxFunc::extendValue(operands[1].get(), metadata.operands[2].ext, - metadata.operands[2].shift.value); - if (calcNZCV) return AuxFunc::addWithCarry(n, m, 0); - return {(n + m), 0}; - } +/** Helper function for instructions with the format `adc rd, rn, rm`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns tuple of [resulting value, nzcv]. */ +template +std::tuple addCarry_3ops(srcValContainer& sourceValues) { + const uint8_t carry = sourceValues[0].get() & 0b0010; + const T n = sourceValues[1].get(); + const T m = sourceValues[2].get(); + return addWithCarry(n, m, carry); +} - /** Helper function for instructions with the format `add rd, rn, rm{, shift - * #amount}`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns tuple of [resulting value, nzcv]. */ - template - static std::tuple addShift_3ops( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - bool calcNZCV) { - const T n = operands[0].get(); - const T m = - shiftValue(operands[1].get(), metadata.operands[2].shift.type, - metadata.operands[2].shift.value); - if (calcNZCV) return AuxFunc::addWithCarry(n, m, 0); - return {(n + m), 0}; - } +/** Helper function for instructions with the format `add rd, rn, rm{, extend + * {#amount}}`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns tuple of [resulting value, nzcv]. */ +template +std::tuple addExtend_3ops( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, bool calcNZCV) { + const T n = sourceValues[0].get(); + const T m = extendValue(sourceValues[1].get(), metadata.operands[2].ext, + metadata.operands[2].shift.value); + if (calcNZCV) return addWithCarry(n, m, 0); + return {(n + m), 0}; +} - /** Helper function for instructions with the format `add rd, rn, #imm{, shift - * #amount}`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns tuple of [resulting value, nzcv]. */ - template - static std::tuple addShift_imm( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - bool calcNZCV) { - const T n = operands[0].get(); - const T m = shiftValue(static_cast(metadata.operands[2].imm), - metadata.operands[2].shift.type, - metadata.operands[2].shift.value); - if (calcNZCV) return AuxFunc::addWithCarry(n, m, 0); - return {(n + m), 0}; - } +/** Helper function for instructions with the format `add rd, rn, rm{, shift + * #amount}`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns tuple of [resulting value, nzcv]. */ +template +std::tuple addShift_3ops( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, bool calcNZCV) { + const T n = sourceValues[0].get(); + const T m = + shiftValue(sourceValues[1].get(), metadata.operands[2].shift.type, + metadata.operands[2].shift.value); + if (calcNZCV) return addWithCarry(n, m, 0); + return {(n + m), 0}; +} + +/** Helper function for instructions with the format `add rd, rn, #imm{, shift + * #amount}`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns tuple of [resulting value, nzcv]. 
*/ +template +std::tuple addShift_imm( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, bool calcNZCV) { + const T n = sourceValues[0].get(); + const T m = shiftValue(static_cast(metadata.operands[2].imm), + metadata.operands[2].shift.type, + metadata.operands[2].shift.value); + if (calcNZCV) return addWithCarry(n, m, 0); + return {(n + m), 0}; +} - /** Helper function for instructions with the format `clz rd, rn`. - * T represents the type of operands (e.g. for xn, T = int64_t). - * Returns single value of type T. */ - template - static T clz_reg(std::vector& operands) { - T x = operands[0].get(); - uint8_t i; - for (i = 0; i < (sizeof(T) * 8); i++) { - // Left-shift x until it's negative or we run out of bits - if (x < 0) { - break; - } - x <<= 1; +/** Helper function for instructions with the format `clz rd, rn`. + * T represents the type of sourceValues (e.g. for xn, T = int64_t). + * Returns single value of type T. */ +template +T clz_reg(srcValContainer& sourceValues) { + T x = sourceValues[0].get(); + uint8_t i; + for (i = 0; i < (sizeof(T) * 8); i++) { + // Left-shift x until it's negative or we run out of bits + if (x < 0) { + break; } - return i; + x <<= 1; } + return i; +} - /** Helper function for instructions with the format `movk d, #imm`. - * T represents the type of operands (e.g. for xd, T = uint64_t). - * Returns single value of type T. */ - template - static T movkShift_imm( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - // Clear 16-bit region offset by `shift` and replace with immediate - uint8_t shift = metadata.operands[1].shift.value; - T mask = ~(static_cast(0xFFFF) << shift); - T value = - (operands[0].get() & mask) | (metadata.operands[1].imm << shift); - return value; - } +/** Helper function for instructions with the format `movk d, #imm`. + * T represents the type of sourceValues (e.g. for xd, T = uint64_t). + * Returns single value of type T. */ +template +T movkShift_imm(srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + // Clear 16-bit region offset by `shift` and replace with immediate + uint8_t shift = metadata.operands[1].shift.value; + T mask = ~(static_cast(0xFFFF) << shift); + T value = + (sourceValues[0].get() & mask) | (metadata.operands[1].imm << shift); + return value; +} - /** Helper function for instructions with the format `mov d, #imm{, - * lsl #shift}`. - * T represents the type of operands (e.g. for xd, T = uint64_t). - * Returns single value og type uint64_t. */ - template - static uint64_t movnShift_imm( - const simeng::arch::aarch64::InstructionMetadata& metadata, - std::function func) { - uint8_t shift = metadata.operands[1].shift.value; - T value = func(static_cast(metadata.operands[1].imm) << shift); - return static_cast(value); - } +/** Helper function for instructions with the format `mov d, #imm{, + * lsl #shift}`. + * T represents the type of sourceValues (e.g. for xd, T = uint64_t). + * Returns single value og type uint64_t. */ +template +uint64_t movnShift_imm( + const simeng::arch::aarch64::InstructionMetadata& metadata, + std::function func) { + uint8_t shift = metadata.operands[1].shift.value; + T value = func(static_cast(metadata.operands[1].imm) << shift); + return static_cast(value); +} - /** Helper function for instructions with the format `msubl xd, wn, wm, xa`. - * D represents the type of the destination register (either int64_t or - * uint64_t). 
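`movkShift_imm` models MOVK: clear one 16-bit field of the current register value and insert the immediate at that position. A standalone illustration of that mask-and-insert step, with made-up values:

```cpp
#include <cstdint>
#include <iostream>

// Insert a 16-bit immediate into `reg` at bit position `shift` (0, 16, 32, 48),
// leaving all other bits untouched; this is the MOVK behaviour the helper models.
uint64_t movk(uint64_t reg, uint16_t imm, uint8_t shift) {
  uint64_t mask = ~(static_cast<uint64_t>(0xFFFF) << shift);
  return (reg & mask) | (static_cast<uint64_t>(imm) << shift);
}

int main() {
  uint64_t x = 0x1111222233334444ull;
  std::cout << std::hex << movk(x, 0xBEEF, 32) << "\n";  // 1111beef33334444
  return 0;
}
```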
- * N represents the type of the first source register (either - * int32_t or uint32_t). - * Returns single value of type D. */ - template - static D msubl_4ops(std::vector& operands) { - const N n = operands[0].get(); - const N m = operands[1].get(); - const D a = operands[2].get(); - return (a - (n * m)); - } +/** Helper function for instructions with the format `msubl xd, wn, wm, xa`. + * D represents the type of the destination register (either int64_t or + * uint64_t). + * N represents the type of the first source register (either + * int32_t or uint32_t). + * Returns single value of type D. */ +template +D msubl_4ops(srcValContainer& sourceValues) { + const N n = sourceValues[0].get(); + const N m = sourceValues[1].get(); + const D a = sourceValues[2].get(); + return (a - (n * m)); +} - /** Helper function for instructions with the format `sbc rd, rn, rm`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns single value of type T. */ - template - static T sbc(std::vector& operands) { - auto nzcv = operands[0].get(); - const T x = operands[1].get(); - const T y = operands[2].get(); - T result; - std::tie(result, std::ignore) = - AuxFunc::addWithCarry(x, ~y, (nzcv >> 1) & 1); - return result; - } +/** Helper function for instructions with the format `sbc rd, rn, rm`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns single value of type T. */ +template +T sbc(srcValContainer& sourceValues) { + auto nzcv = sourceValues[0].get(); + const T x = sourceValues[1].get(); + const T y = sourceValues[2].get(); + T result; + std::tie(result, std::ignore) = addWithCarry(x, ~y, (nzcv >> 1) & 1); + return result; +} - /** Helper function for instructions with the format `sub{s} rd, rn, rm{, - * extend #amount}`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns tuple of [resulting value, nzcv]. */ - template - static std::tuple subExtend_3ops( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - bool calcNZCV) { - const T n = operands[0].get(); - const T m = static_cast( - AuxFunc::extendValue(operands[1].get(), metadata.operands[2].ext, - metadata.operands[2].shift.value)); - if (calcNZCV) return AuxFunc::addWithCarry(n, ~m, true); - return {(n - m), 0}; - } +/** Helper function for instructions with the format `sub{s} rd, rn, rm{, + * extend #amount}`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns tuple of [resulting value, nzcv]. */ +template +std::tuple subExtend_3ops( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, bool calcNZCV) { + const T n = sourceValues[0].get(); + const T m = static_cast(extendValue(sourceValues[1].get(), + metadata.operands[2].ext, + metadata.operands[2].shift.value)); + if (calcNZCV) return addWithCarry(n, ~m, true); + return {(n - m), 0}; +} - /** Helper function for instructions with the format `sub{s} rd, rn, #imm`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns single value of type T. 
*/ - template - static std::tuple subShift_imm( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - bool calcNZCV) { - const T n = operands[0].get(); - const T m = shiftValue(static_cast(metadata.operands[2].imm), - metadata.operands[2].shift.type, - metadata.operands[2].shift.value); - if (calcNZCV) return AuxFunc::addWithCarry(n, ~m, true); - return {(n - m), 0}; - } +/** Helper function for instructions with the format `sub{s} rd, rn, #imm`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns single value of type T. */ +template +std::tuple subShift_imm( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, bool calcNZCV) { + const T n = sourceValues[0].get(); + const T m = shiftValue(static_cast(metadata.operands[2].imm), + metadata.operands[2].shift.type, + metadata.operands[2].shift.value); + if (calcNZCV) return addWithCarry(n, ~m, true); + return {(n - m), 0}; +} + +/** Helper function for instructions with the format `sub{s} rd, rn, rm{, + * shift #amount}`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns tuple of [resulting value, nzcv]. */ +template +std::tuple subShift_3ops( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, bool calcNZCV) { + const T n = sourceValues[0].get(); + const T m = + shiftValue(sourceValues[1].get(), metadata.operands[2].shift.type, + metadata.operands[2].shift.value); + if (calcNZCV) return addWithCarry(n, ~m, true); + return {(n - m), 0}; +} - /** Helper function for instructions with the format `sub{s} rd, rn, rm{, - * shift #amount}`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns tuple of [resulting value, nzcv]. */ - template - static std::tuple subShift_3ops( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - bool calcNZCV) { - const T n = operands[0].get(); - const T m = - shiftValue(operands[1].get(), metadata.operands[2].shift.type, - metadata.operands[2].shift.value); - if (calcNZCV) return AuxFunc::addWithCarry(n, ~m, true); - return {(n - m), 0}; - } -}; } // namespace aarch64 } // namespace arch } // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/aarch64/helpers/auxiliaryFunctions.hh b/src/include/simeng/arch/aarch64/helpers/auxiliaryFunctions.hh index 036df3f061..5880a75da9 100644 --- a/src/include/simeng/arch/aarch64/helpers/auxiliaryFunctions.hh +++ b/src/include/simeng/arch/aarch64/helpers/auxiliaryFunctions.hh @@ -11,315 +11,339 @@ namespace simeng { namespace arch { namespace aarch64 { -class AuxFunc { - public: - /** Performs a type agnostic add with carry. */ - template - static std::tuple addWithCarry(T x, T y, bool carryIn) { - T result = x + y + carryIn; - bool n = (result >> (sizeof(T) * 8 - 1)); - bool z = (result == 0); - - // Trying to calculate whether `result` overflows (`x + y + carryIn > max`). 
- bool c; - if (carryIn && x + 1 == 0) { - // Implies `x` is max; with a carry set, it will definitely overflow - c = true; - } else { - // We know x + carryIn <= max, so can safely subtract and compare against - // y max > x + y + c == max - x > y + c - c = ((std::numeric_limits::max() - x - carryIn) < y); - } - - // Calculate whether signed result overflows - bool v = false; - typedef std::make_signed_t ST; - auto sx = static_cast(x); - auto sy = static_cast(y); - if (sx >= 0) { - // Check if (x + y + c) > MAX - // y > (MAX - x - c) - v = sy > (std::numeric_limits::max() - sx - carryIn); - } else { - // Check if (x + y + c) < MIN - // y < (MIN - x - c) - v = sy < (std::numeric_limits::min() - sx - carryIn); - } - - return {result, nzcv(n, z, c, v)}; +/** Returns a correctly formatted nzcv value. */ +inline uint8_t nzcv(bool n, bool z, bool c, bool v) { + return (n << 3) | (z << 2) | (c << 1) | v; +} + +/** Performs a type agnostic unsigned add with carry. */ +template +inline std::enable_if_t && std::is_unsigned_v, + std::tuple> +addWithCarry(T x, T y, bool carryIn) { + T result = x + y + carryIn; + + bool n = (result >> (sizeof(T) * 8 - 1)); + bool z = (result == 0); + + // Trying to calculate whether `result` overflows (`x + y + carryIn > max`). + bool c; + if (carryIn && x + 1 == 0) { + // Implies `x` is max; with a carry set, it will definitely overflow + c = true; + } else { + // We know x + carryIn <= max, so can safely subtract and compare against + // y max > x + y + c == max - x > y + c + c = ((std::numeric_limits::max() - x - carryIn) < y); } - /** Manipulate the bitfield `value` according to the logic of the (U|S)BFM - * Armv9.2-a instructions. */ - template - static std::enable_if_t && std::is_unsigned_v, T> - bitfieldManipulate(T value, T dest, uint8_t rotateBy, uint8_t sourceBits, - bool signExtend = false) { - size_t bits = sizeof(T) * 8; - - T source; - T destMask; - uint8_t highestBit = sourceBits; - if (sourceBits >= rotateBy) { - // Mask of values [rotateBy:source+1] - destMask = (static_cast(-1) << (sourceBits - rotateBy + 1)); - source = value >> rotateBy; - highestBit -= rotateBy; - } else { - T upper = (static_cast(-1) << (bits - rotateBy)); - T lower = (static_cast(-1) >> (rotateBy - sourceBits - 1)); - destMask = upper ^ lower; - source = value << (bits - rotateBy); - highestBit += (bits - rotateBy); - } - - T result = (dest & destMask) | (source & ~destMask); - - if (!signExtend) { - return result; - } - - if (highestBit > bits) { - // Nothing to do; implicitly sign-extended - return result; - } - - // Let the compiler do sign-extension for us. - uint8_t shiftAmount = bits - highestBit - 1; - // Shift the bitfield up, and cast to a signed type, so the highest bit is - // now the sign bit - auto shifted = static_cast>(result << shiftAmount); - // Shift the bitfield back to where it was; as it's a signed type, the - // compiler will sign-extend the highest bit - return shifted >> shiftAmount; + // Calculate whether signed result overflows + bool v = false; + typedef std::make_signed_t ST; + auto sx = static_cast(x); + auto sy = static_cast(y); + if (sx >= 0) { + // Check if (x + y + c) > MAX + // y > (MAX - x - c) + v = sy > (std::numeric_limits::max() - sx - carryIn); + } else { + // Check if (x + y + c) < MIN + // y < (MIN - x - c) + v = sy < (std::numeric_limits::min() - sx - carryIn); } - /** Function to check if NZCV conditions hold. 
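The reworked `addWithCarry` is now a free template constrained to unsigned integral types and returns the sum alongside a packed NZCV nibble. A condensed copy of its logic so the flag behaviour can be checked in isolation; the test values are illustrative:

```cpp
#include <cstdint>
#include <iostream>
#include <limits>
#include <tuple>
#include <type_traits>

// N|Z|C|V packing, matching the helper's nzcv().
inline uint8_t nzcv(bool n, bool z, bool c, bool v) {
  return (n << 3) | (z << 2) | (c << 1) | v;
}

// Condensed copy of the unsigned addWithCarry helper so the example compiles alone.
template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
std::tuple<T, uint8_t> addWithCarry(T x, T y, bool carryIn) {
  T result = x + y + carryIn;
  bool n = (result >> (sizeof(T) * 8 - 1));
  bool z = (result == 0);
  // Carry out: does x + y + carryIn exceed the unsigned maximum?
  bool c = (carryIn && x + 1 == 0)
               ? true
               : ((std::numeric_limits<T>::max() - x - carryIn) < y);
  // Signed overflow: does the signed interpretation leave the representable range?
  using ST = std::make_signed_t<T>;
  auto sx = static_cast<ST>(x);
  auto sy = static_cast<ST>(y);
  bool v = (sx >= 0) ? (sy > std::numeric_limits<ST>::max() - sx - carryIn)
                     : (sy < std::numeric_limits<ST>::min() - sx - carryIn);
  return {result, nzcv(n, z, c, v)};
}

int main() {
  // 0xFFFFFFFF + 1 wraps to 0: Z and C are set, N and V are not.
  auto [result, flags] = addWithCarry<uint32_t>(0xFFFFFFFFu, 1u, false);
  std::cout << result << " 0b" << ((flags >> 3) & 1) << ((flags >> 2) & 1)
            << ((flags >> 1) & 1) << (flags & 1) << "\n";  // 0 0b0110
  return 0;
}
```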
*/ - static bool conditionHolds(uint8_t cond, uint8_t nzcv) { - if (cond == 0b1111) { - return true; - } - - bool inverse = cond & 1; - uint8_t upper = cond >> 1; - bool n = (nzcv >> 3) & 1; - bool z = (nzcv >> 2) & 1; - bool c = (nzcv >> 1) & 1; - bool v = nzcv & 1; - bool result; - switch (upper) { - case 0b000: - result = z; - break; // EQ/NE - case 0b001: - result = c; - break; // CS/CC - case 0b010: - result = n; - break; // MI/PL - case 0b011: - result = v; - break; // VS/VC - case 0b100: - result = (c && !z); - break; // HI/LS - case 0b101: - result = (n == v); - break; // GE/LT - case 0b110: - result = (n == v && !z); - break; // GT/LE - default: // 0b111, AL - result = true; - } - - return (inverse ? !result : result); + return {result, nzcv(n, z, c, v)}; +} + +/** Manipulate the bitfield `value` according to the logic of the (U|S)BFM + * Armv9.2-a instructions. */ +template +inline std::enable_if_t && std::is_unsigned_v, T> +bitfieldManipulate(T value, T dest, uint8_t rotateBy, uint8_t sourceBits, + bool signExtend = false) { + size_t bits = sizeof(T) * 8; + + if (rotateBy >= bits) { + std::cerr + << "\n[SimEng:auxiliaryFunctions] Attempted to use a rotate amount of " + << unsigned(rotateBy) + << " in bitfieldManipulate which is greater than or equal to the " + "data type size of " + << bits << "b in use. Exiting." << std::endl; + exit(1); + } else if (sourceBits >= bits) { + std::cerr << "\n[SimEng:auxiliaryFunctions] Attempted to use a source bit " + "position " + "value of " + << unsigned(sourceBits) + << " in bitfieldManipulate which is greater than or equal to the " + "data type size of " + << bits << "b in use. Exiting." << std::endl; + exit(1); } - // Rounding function that rounds a double to nearest integer (64-bit). In - // event of a tie (i.e. 7.5) it will be rounded to the nearest even number. - template - static OUT roundToNearestTiesToEven(IN input) { - IN half = static_cast(0.5); - if (std::fabs(input - std::trunc(input)) == half) { - OUT truncd = static_cast(std::trunc(input)); - // if value is negative, then may need to -1 from truncd, else may need to - // +1. - OUT addand = (truncd > 0) ? 1 : -1; - return ((truncd % 2 == 0) ? truncd : (truncd + addand)); - } - // Otherwise round to nearest - return static_cast(std::round(input)); + T source; + T destMask; + uint8_t highestBit = sourceBits; + if (sourceBits >= rotateBy) { + // Mask of values [rotateBy:source+1] + uint8_t bitMaskSize = sourceBits - rotateBy + 1; + destMask = (bitMaskSize == bits) ? 0 : (static_cast(-1) << bitMaskSize); + source = value >> rotateBy; + highestBit -= rotateBy; + } else { + uint8_t upperSize = bits - rotateBy; + T upper = (upperSize == bits) ? 0 : (static_cast(-1) << upperSize); + uint8_t lowerSize = rotateBy - sourceBits - 1; + T lower = (lowerSize == bits) ? 0 : (static_cast(-1) >> lowerSize); + destMask = upper ^ lower; + source = value << (bits - rotateBy); + highestBit += (bits - rotateBy); } - /** Extend `value` according to `extendType`, and left-shift the result by - * `shift`. 
Replicated from Instruction.cc */ - static uint64_t extendValue(uint64_t value, uint8_t extendType, - uint8_t shift) { - if (extendType == ARM64_EXT_INVALID && shift == 0) { - // Special case: an invalid shift type with a shift amount of 0 implies an - // identity operation - return value; - } - - uint64_t extended; - switch (extendType) { - case ARM64_EXT_UXTB: - extended = static_cast(value); - break; - case ARM64_EXT_UXTH: - extended = static_cast(value); - break; - case ARM64_EXT_UXTW: - extended = static_cast(value); - break; - case ARM64_EXT_UXTX: - extended = value; - break; - case ARM64_EXT_SXTB: - extended = static_cast(value); - break; - case ARM64_EXT_SXTH: - extended = static_cast(value); - break; - case ARM64_EXT_SXTW: - extended = static_cast(value); - break; - case ARM64_EXT_SXTX: - extended = value; - break; - default: - assert(false && "Invalid extension type"); - return 0; - } + T result = (dest & destMask) | (source & ~destMask); - return extended << shift; + if (!signExtend) { + return result; } - // Rounding function that rounds a float to nearest integer (32-bit). In event - // of a tie (i.e. 7.5) it will be rounded to the nearest even number. - static int32_t floatRoundToNearestTiesToEven(float input) { - if (std::fabs(input - std::trunc(input)) == 0.5f) { - if (static_cast(input - 0.5f) % 2 == 0) { - return static_cast(input - 0.5f); - } else { - return static_cast(input + 0.5f); - } - } - // Otherwise round to nearest - return static_cast(std::round(input)); + if (highestBit > bits) { + // Nothing to do; implicitly sign-extended + return result; } - /** Calculate the corresponding NZCV values from select SVE instructions that - * set the First(N), None(Z), !Last(C) condition flags based on the predicate - * result, and the V flag to 0. */ - static uint8_t getNZCVfromPred(std::array predResult, - uint64_t VL_bits, int byteCount) { - uint8_t N = (predResult[0] & 1); - uint8_t Z = 1; - // (int)(VL_bits - 1)/512 derives which block of 64-bits within the - // predicate register we're working in. 1ull << (VL_bits / 8) - byteCount) - // derives a 1 in the last position of the current predicate. Both - // dictated by vector length. - uint8_t C = !(predResult[(int)((VL_bits - 1) / 512)] & - 1ull << (((VL_bits / 8) - byteCount) % 64)); - for (int i = 0; i < (int)((VL_bits - 1) / 512) + 1; i++) { - if (predResult[i]) { - Z = 0; - break; - } - } - return nzcv(N, Z, C, 0); + // Let the compiler do sign-extension for us. + uint8_t shiftAmount = bits - highestBit - 1; + // Shift the bitfield up, and cast to a signed type, so the highest bit is + // now the sign bit + auto shifted = static_cast>(result << shiftAmount); + // Shift the bitfield back to where it was; as it's a signed type, the + // compiler will sign-extend the highest bit + return shifted >> shiftAmount; +} + +/** Function to check if NZCV conditions hold. 
*/ +inline bool conditionHolds(uint8_t cond, uint8_t nzcv) { + bool inverse = cond & 1; + uint8_t upper = cond >> 1; + bool n = (nzcv >> 3) & 1; + bool z = (nzcv >> 2) & 1; + bool c = (nzcv >> 1) & 1; + bool v = nzcv & 1; + bool result; + switch (upper) { + case 0b000: + result = z; + break; // EQ/NE + case 0b001: + result = c; + break; // CS/CC + case 0b010: + result = n; + break; // MI/PL + case 0b011: + result = v; + break; // VS/VC + case 0b100: + result = (c && !z); + break; // HI/LS + case 0b101: + result = (n == v); + break; // GE/LT + case 0b110: + result = (n == v && !z); + break; // GT/LE + default: // 0b111, AL + // AL returns true regardless of inverse value + result = (true ^ inverse); } - - /** Multiply `a` and `b`, and return the high 64 bits of the result. - * https://stackoverflow.com/a/28904636 */ - static uint64_t mulhi(uint64_t a, uint64_t b) { - uint64_t a_lo = (uint32_t)a; - uint64_t a_hi = a >> 32; - uint64_t b_lo = (uint32_t)b; - uint64_t b_hi = b >> 32; - - uint64_t a_x_b_hi = a_hi * b_hi; - uint64_t a_x_b_mid = a_hi * b_lo; - uint64_t b_x_a_mid = b_hi * a_lo; - uint64_t a_x_b_lo = a_lo * b_lo; - - uint64_t carry_bit = ((uint64_t)(uint32_t)a_x_b_mid + - (uint64_t)(uint32_t)b_x_a_mid + (a_x_b_lo >> 32)) >> - 32; - - uint64_t multhi = - a_x_b_hi + (a_x_b_mid >> 32) + (b_x_a_mid >> 32) + carry_bit; - - return multhi; + return (result ^ inverse); +} + +/** Extend `value` according to `extendType`, and left-shift the result by + * `shift`. Replicated from Instruction.cc */ +inline uint64_t extendValue(uint64_t value, uint8_t extendType, uint8_t shift) { + if (extendType == AARCH64_EXT_INVALID && shift == 0) { + // Special case: an invalid shift type with a shift amount of 0 implies an + // identity operation + return value; } - /** Returns a correctly formatted nzcv value. */ - static uint8_t nzcv(bool n, bool z, bool c, bool v) { - return (n << 3) | (z << 2) | (c << 1) | v; + uint64_t extended; + switch (extendType) { + case AARCH64_EXT_UXTB: + extended = static_cast(value); + break; + case AARCH64_EXT_UXTH: + extended = static_cast(value); + break; + case AARCH64_EXT_UXTW: + extended = static_cast(value); + break; + case AARCH64_EXT_UXTX: + extended = value; + break; + case AARCH64_EXT_SXTB: + extended = static_cast(value); + break; + case AARCH64_EXT_SXTH: + extended = static_cast(value); + break; + case AARCH64_EXT_SXTW: + extended = static_cast(value); + break; + case AARCH64_EXT_SXTX: + extended = value; + break; + default: + assert(false && "Invalid extension type"); + return 0; } - /** Decode the instruction pattern from OperandStr. */ - static uint16_t sveGetPattern(const std::string operandStr, - const uint8_t esize, const uint16_t VL_) { - const uint16_t elements = VL_ / esize; - const std::vector patterns = { - "pow2", "vl1", "vl2", "vl3", "vl4", "vl5", "vl6", "vl7", "vl8", - "vl16", "vl32", "vl64", "vl128", "vl256", "mul3", "mul4", "all"}; + return extended << shift; +} - // If no pattern present in operandStr then same behaviour as ALL - std::string pattern = "all"; - for (uint8_t i = 0; i < patterns.size(); i++) { - if (operandStr.find(patterns[i]) != std::string::npos) { - pattern = patterns[i]; - // Don't break when pattern found as vl1 will be found in vl128 etc - } +/** Extend `value` using extension/shifting rules defined in `op`. 
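`conditionHolds` now derives the base result from the upper three bits of the condition code and folds in the inversion bit with XOR, so AL/NV no longer need a guard before the switch. A usage sketch, assuming the standard AArch64 condition encodings (EQ=0b0000, LT=0b1011, AL=0b1110):

```cpp
#include <cstdint>
#include <iostream>

// Condensed copy of the reworked helper so the example stands alone.
inline bool conditionHolds(uint8_t cond, uint8_t nzcv) {
  bool inverse = cond & 1;
  uint8_t upper = cond >> 1;
  bool n = (nzcv >> 3) & 1, z = (nzcv >> 2) & 1;
  bool c = (nzcv >> 1) & 1, v = nzcv & 1;
  bool result;
  switch (upper) {
    case 0b000: result = z; break;               // EQ/NE
    case 0b001: result = c; break;               // CS/CC
    case 0b010: result = n; break;               // MI/PL
    case 0b011: result = v; break;               // VS/VC
    case 0b100: result = (c && !z); break;       // HI/LS
    case 0b101: result = (n == v); break;        // GE/LT
    case 0b110: result = (n == v && !z); break;  // GT/LE
    default: result = (true ^ inverse);          // AL/NV: always holds
  }
  return (result ^ inverse);
}

int main() {
  const uint8_t flagsFromEqualCompare = 0b0100;  // Z set, as after comparing equal values
  std::cout << conditionHolds(0b0000, flagsFromEqualCompare)   // EQ -> 1
            << conditionHolds(0b1011, flagsFromEqualCompare)   // LT -> 0
            << conditionHolds(0b1110, flagsFromEqualCompare)   // AL -> 1
            << "\n";
  return 0;
}
```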
*/ +inline uint64_t extendOffset(uint64_t value, const cs_aarch64_op& op) { + if (op.ext == 0) { + if (op.shift.value == 0) { + return value; } - - if (pattern == "all") + if (op.shift.type == 1) { + return extendValue(value, AARCH64_EXT_UXTX, op.shift.value); + } + } + return extendValue(value, op.ext, op.shift.value); +} + +/** Calculate the corresponding NZCV values from select SVE instructions that + * set the First(N), None(Z), !Last(C) condition flags based on the predicate + * result, and the V flag to 0. */ +inline uint8_t getNZCVfromPred(std::array predResult, + uint64_t VL_bits, int byteCount) { + uint8_t N = (predResult[0] & 1); + uint8_t Z = 1; + // (int)(VL_bits - 1)/512 derives which block of 64-bits within the + // predicate register we're working in. 1ull << (VL_bits / 8) - byteCount) + // derives a 1 in the last position of the current predicate. Both + // dictated by vector length. + uint8_t C = !(predResult[(int)((VL_bits - 1) / 512)] & + 1ull << (((VL_bits / 8) - byteCount) % 64)); + for (int i = 0; i < (int)((VL_bits - 1) / 512) + 1; i++) { + if (predResult[i]) { + Z = 0; + break; + } + } + return nzcv(N, Z, C, 0); +} + +/** Multiply `a` and `b`, and return the high 64 bits of the result. + * https://stackoverflow.com/a/28904636 */ +inline uint64_t mulhi(uint64_t a, uint64_t b) { + uint64_t a_lo = (uint32_t)a; + uint64_t a_hi = a >> 32; + uint64_t b_lo = (uint32_t)b; + uint64_t b_hi = b >> 32; + + uint64_t a_x_b_hi = a_hi * b_hi; + uint64_t a_x_b_mid = a_hi * b_lo; + uint64_t b_x_a_mid = b_hi * a_lo; + uint64_t a_x_b_lo = a_lo * b_lo; + + uint64_t carry_bit = ((uint64_t)(uint32_t)a_x_b_mid + + (uint64_t)(uint32_t)b_x_a_mid + (a_x_b_lo >> 32)) >> + 32; + + uint64_t multhi = + a_x_b_hi + (a_x_b_mid >> 32) + (b_x_a_mid >> 32) + carry_bit; + + return multhi; +} + +/** Get the number of elements to work on for SVE instructions. */ +inline uint16_t getElemsFromPattern(const aarch64_svepredpat svePattern, + const uint8_t esize, const uint16_t VL_) { + const uint16_t elements = VL_ / esize; + switch (svePattern) { + case AARCH64_SVEPREDPAT_ALL: return elements; - else if (pattern == "pow2") { + case AARCH64_SVEPREDPAT_MUL3: + return elements - (elements % 3); + case AARCH64_SVEPREDPAT_MUL4: + return elements - (elements % 4); + case AARCH64_SVEPREDPAT_POW2: { int n = 1; while (elements >= std::pow(2, n)) { n = n + 1; } return std::pow(2, n - 1); - } else if (pattern == "vl1") + } + case AARCH64_SVEPREDPAT_VL1: return (elements >= 1) ? 1 : 0; - else if (pattern == "vl2") + case AARCH64_SVEPREDPAT_VL128: + return (elements >= 128) ? 128 : 0; + case AARCH64_SVEPREDPAT_VL16: + return (elements >= 16) ? 16 : 0; + case AARCH64_SVEPREDPAT_VL2: return (elements >= 2) ? 2 : 0; - else if (pattern == "vl3") + case AARCH64_SVEPREDPAT_VL256: + return (elements >= 256) ? 256 : 0; + case AARCH64_SVEPREDPAT_VL3: return (elements >= 3) ? 3 : 0; - else if (pattern == "vl4") + case AARCH64_SVEPREDPAT_VL32: + return (elements >= 32) ? 32 : 0; + case AARCH64_SVEPREDPAT_VL4: return (elements >= 4) ? 4 : 0; - else if (pattern == "vl5") + case AARCH64_SVEPREDPAT_VL5: return (elements >= 5) ? 5 : 0; - else if (pattern == "vl6") + case AARCH64_SVEPREDPAT_VL6: return (elements >= 6) ? 6 : 0; - else if (pattern == "vl7") + case AARCH64_SVEPREDPAT_VL64: + return (elements >= 64) ? 64 : 0; + case AARCH64_SVEPREDPAT_VL7: return (elements >= 7) ? 7 : 0; - else if (pattern == "vl8") + case AARCH64_SVEPREDPAT_VL8: return (elements >= 8) ? 8 : 0; - else if (pattern == "vl16") - return (elements >= 16) ? 
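`mulhi` assembles the high 64 bits of a 64x64-bit product from 32-bit partial products. Where the compiler provides `unsigned __int128` (a GCC/Clang extension, assumed here), the result can be cross-checked directly:

```cpp
#include <cstdint>
#include <iostream>

// High 64 bits of a 64x64-bit unsigned multiply via 32-bit partial products,
// following the same scheme as the helper (https://stackoverflow.com/a/28904636).
uint64_t mulhi(uint64_t a, uint64_t b) {
  uint64_t a_lo = (uint32_t)a, a_hi = a >> 32;
  uint64_t b_lo = (uint32_t)b, b_hi = b >> 32;
  uint64_t a_x_b_hi = a_hi * b_hi;
  uint64_t a_x_b_mid = a_hi * b_lo;
  uint64_t b_x_a_mid = b_hi * a_lo;
  uint64_t a_x_b_lo = a_lo * b_lo;
  uint64_t carry = ((uint64_t)(uint32_t)a_x_b_mid + (uint64_t)(uint32_t)b_x_a_mid +
                    (a_x_b_lo >> 32)) >> 32;
  return a_x_b_hi + (a_x_b_mid >> 32) + (b_x_a_mid >> 32) + carry;
}

int main() {
  uint64_t a = 0xDEADBEEFCAFEBABEull, b = 0x0123456789ABCDEFull;
  // Cross-check against a native 128-bit multiply (GCC/Clang extension).
  uint64_t expected = (unsigned __int128)a * b >> 64;
  std::cout << (mulhi(a, b) == expected ? "match" : "mismatch") << "\n";
  return 0;
}
```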
16 : 0; - else if (pattern == "vl32") - return (elements >= 32) ? 32 : 0; - else if (pattern == "vl64") - return (elements >= 64) ? 64 : 0; - else if (pattern == "vl128") - return (elements >= 128) ? 128 : 0; - else if (pattern == "vl256") - return (elements >= 256) ? 256 : 0; - else if (pattern == "mul4") - return elements - (elements % 4); - else if (pattern == "mul3") - return elements - (elements % 3); - - return 0; + default: + assert(false && "Unknown SVE Predicate Pattern."); + return 0; } -}; +} + +/** Apply the shift specified by `shiftType` to the unsigned integer `value`, + * shifting by `amount`. */ +template +inline std::enable_if_t && std::is_unsigned_v, T> +shiftValue(T value, uint8_t shiftType, uint8_t amount) { + switch (shiftType) { + case AARCH64_SFT_LSL: + return value << amount; + case AARCH64_SFT_LSR: + return value >> amount; + case AARCH64_SFT_ASR: + return static_cast>(value) >> amount; + case AARCH64_SFT_ROR: { + // Assuming sizeof(T) is a power of 2. + const T mask = sizeof(T) * 8 - 1; + assert((amount <= mask) && "Rotate amount exceeds type width"); + amount &= mask; + return (value >> amount) | (value << ((-amount) & mask)); + } + case AARCH64_SFT_MSL: { + // pad in with ones instead of zeros + const T mask = (static_cast(1) << static_cast(amount)) - 1; + return (value << amount) | mask; + } + case AARCH64_SFT_INVALID: + return value; + default: + assert(false && "Unknown shift type"); + return 0; + } +} + } // namespace aarch64 } // namespace arch } // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/aarch64/helpers/bitmanip.hh b/src/include/simeng/arch/aarch64/helpers/bitmanip.hh index 1316e5ab1f..0f5de798dc 100644 --- a/src/include/simeng/arch/aarch64/helpers/bitmanip.hh +++ b/src/include/simeng/arch/aarch64/helpers/bitmanip.hh @@ -5,79 +5,76 @@ namespace simeng { namespace arch { namespace aarch64 { -class bitmanipHelp { - public: - /** Helper function for instructions with the format `bfm rd, rn, #immr, - * #imms`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns single value of type T. */ - template - static T bfm_2imms(std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - bool signExtend, bool zeroDestReg) { - uint8_t r = metadata.operands[2].imm; - uint8_t s = metadata.operands[3].imm; - T dest, source; - if (!zeroDestReg) { - dest = operands[0].get(); - source = operands[1].get(); - } else { - dest = 0; - source = operands[0].get(); - } - return AuxFunc::bitfieldManipulate(source, dest, r, s, signExtend); - } - /** Helper function for instructions with the format `extr rd, rn, rm, #lsb`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns single value of type T. */ - template - static T extrLSB_registers( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - T n = operands[0].get(); - T m = operands[1].get(); - int64_t lsb = metadata.operands[3].imm; - if (lsb == 0) return m; - return (m >> lsb) | (n << ((sizeof(T) * 8) - lsb)); +/** Helper function for instructions with the format `bfm rd, rn, #immr, + * #imms`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns single value of type T. 
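`shiftValue` also covers `ROR` (rotate right via the power-of-two mask trick) and `MSL` (shift left, filling the vacated bits with ones). A small sketch of those two cases in isolation, assuming an unsigned operand whose width is a power of two:

```cpp
#include <cstdint>
#include <iostream>

// Rotate right within the type width; mirrors the AARCH64_SFT_ROR case.
template <typename T>
T rotateRight(T value, uint8_t amount) {
  const T mask = sizeof(T) * 8 - 1;  // assumes the width is a power of two
  amount &= mask;
  return (value >> amount) | (value << ((-amount) & mask));
}

// Shift left and pad the low bits with ones; mirrors the AARCH64_SFT_MSL case.
template <typename T>
T shiftLeftOnes(T value, uint8_t amount) {
  const T ones = (static_cast<T>(1) << amount) - 1;
  return (value << amount) | ones;
}

int main() {
  std::cout << std::hex << rotateRight<uint32_t>(0x000000F1u, 4) << "\n";    // 1000000f
  std::cout << std::hex << shiftLeftOnes<uint32_t>(0x000000F1u, 8) << "\n";  // f1ff
  return 0;
}
```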
*/ +template +T bfm_2imms(srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + bool signExtend, bool zeroDestReg) { + uint8_t r = metadata.operands[2].imm; + uint8_t s = metadata.operands[3].imm; + T dest, source; + if (!zeroDestReg) { + dest = sourceValues[0].get(); + source = sourceValues[1].get(); + } else { + dest = 0; + source = sourceValues[0].get(); } + return bitfieldManipulate(source, dest, r, s, signExtend); +} - /** Helper function for instructions with the format `rbit rd, rn`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns single value of type uint64_t. */ - template - static uint64_t rbit( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - int width = sizeof(T) * 8; +/** Helper function for instructions with the format `extr rd, rn, rm, #lsb`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns single value of type T. */ +template +T extrLSB_registers( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + T n = sourceValues[0].get(); + T m = sourceValues[1].get(); + int64_t lsb = metadata.operands[3].imm; + if (lsb == 0) return m; + return (m >> lsb) | (n << ((sizeof(T) * 8) - lsb)); +} - static uint8_t reversedNibble[16] = { - 0b0000, 0b1000, 0b0100, 0b1100, 0b0010, 0b1010, 0b0110, 0b1110, - 0b0001, 0b1001, 0b0101, 0b1101, 0b0011, 0b1011, 0b0111, 0b1111}; +/** Helper function for instructions with the format `rbit rd, rn`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns single value of type uint64_t. */ +template +uint64_t rbit(srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + int width = sizeof(T) * 8; - uint64_t n = operands[0].get(); - uint64_t result = 0; - for (int i = 0; i < width; i += 4) { - result <<= 4; - result |= reversedNibble[n & 0b1111]; - n >>= 4; - } - return result; - } + uint8_t reversedNibble[16] = {0b0000, 0b1000, 0b0100, 0b1100, 0b0010, 0b1010, + 0b0110, 0b1110, 0b0001, 0b1001, 0b0101, 0b1101, + 0b0011, 0b1011, 0b0111, 0b1111}; - /** Helper function for instructions with the format `rev rd, rn`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns array of uint8_t with number of elements = bytes in T. */ - template - static std::array rev( - std::vector& operands) { - auto bytes = operands[0].getAsVector(); - std::array reversed; - // Copy `bytes` backwards onto `reversed` - std::copy(bytes, bytes + sizeof(T), std::rbegin(reversed)); - return reversed; + uint64_t n = sourceValues[0].get(); + uint64_t result = 0; + for (int i = 0; i < width; i += 4) { + result <<= 4; + result |= reversedNibble[n & 0b1111]; + n >>= 4; } -}; + return result; +} + +/** Helper function for instructions with the format `rev rd, rn`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns array of uint8_t with number of elements = bytes in T. 
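`extrLSB_registers` models EXTR: the result is read from the concatenation `rn:rm`, starting at bit `#lsb` of `rm`. A standalone check of that formula with made-up register values:

```cpp
#include <cstdint>
#include <iostream>

// EXTR-style extraction: take the low bits of the n:m concatenation starting
// at bit `lsb` of m, as the helper computes it.
template <typename T>
T extr(T n, T m, unsigned lsb) {
  if (lsb == 0) return m;
  return (m >> lsb) | (n << (sizeof(T) * 8 - lsb));
}

int main() {
  // With n = 0xAABBCCDD and m = 0x11223344, lsb = 8 yields 0xDD112233.
  std::cout << std::hex << extr<uint32_t>(0xAABBCCDDu, 0x11223344u, 8) << "\n";
  return 0;
}
```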
*/ +template +std::array rev(srcValContainer& sourceValues) { + auto bytes = sourceValues[0].getAsVector(); + std::array reversed; + // Copy `bytes` backwards onto `reversed` + std::copy(bytes, bytes + sizeof(T), std::rbegin(reversed)); + return reversed; +} + } // namespace aarch64 } // namespace arch } // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/aarch64/helpers/comparison.hh b/src/include/simeng/arch/aarch64/helpers/comparison.hh index 04ea68ee1a..32d1a3115e 100644 --- a/src/include/simeng/arch/aarch64/helpers/comparison.hh +++ b/src/include/simeng/arch/aarch64/helpers/comparison.hh @@ -5,23 +5,21 @@ namespace simeng { namespace arch { namespace aarch64 { -class comparisonHelp { - public: - /** Helper function for instructions with the format `orr rd, rn, rm {shift - * #amount}`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns single value of type T. */ - template - static T orrShift_3ops( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - const T n = operands[0].get(); - const T m = - shiftValue(operands[1].get(), metadata.operands[2].shift.type, - metadata.operands[2].shift.value); - return (n | m); - } -}; + +/** Helper function for instructions with the format `orr rd, rn, rm {shift + * #amount}`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns single value of type T. */ +template +T orrShift_3ops(srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + const T n = sourceValues[0].get(); + const T m = + shiftValue(sourceValues[1].get(), metadata.operands[2].shift.type, + metadata.operands[2].shift.value); + return (n | m); +} + } // namespace aarch64 } // namespace arch } // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/aarch64/helpers/conditional.hh b/src/include/simeng/arch/aarch64/helpers/conditional.hh index 4cdfce4061..e541eb276a 100644 --- a/src/include/simeng/arch/aarch64/helpers/conditional.hh +++ b/src/include/simeng/arch/aarch64/helpers/conditional.hh @@ -5,114 +5,110 @@ namespace simeng { namespace arch { namespace aarch64 { -class conditionalHelp { - public: - /** Helper function for instructions with the format `ccmn rn, #imm #nzcv, - * cc`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns single value of type uint8_t. */ - template - static uint8_t ccmn_imm( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - if (AuxFunc::conditionHolds(metadata.cc, operands[0].get())) { - uint8_t nzcv; - std::tie(std::ignore, nzcv) = AuxFunc::addWithCarry( - operands[1].get(), static_cast(metadata.operands[1].imm), 0); - return nzcv; - } - return static_cast(metadata.operands[2].imm); + +/** Helper function for instructions with the format `ccmn rn, #imm #nzcv, + * cc`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns single value of type uint8_t. */ +template +uint8_t ccmn_imm(srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + if (conditionHolds(metadata.cc, sourceValues[0].get())) { + uint8_t nzcv; + std::tie(std::ignore, nzcv) = addWithCarry( + sourceValues[1].get(), static_cast(metadata.operands[1].imm), 0); + return nzcv; } + return static_cast(metadata.operands[2].imm); +} - /** Helper function for instructions with the format `ccmp rn, #imm #nzcv, - * cc`. - * T represents the type of operands (e.g. 
for xn, T = uint64_t). - * Returns single value of type uint8_t. */ - template - static uint8_t ccmp_imm( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - if (AuxFunc::conditionHolds(metadata.cc, operands[0].get())) { - uint8_t nzcv; - std::tie(std::ignore, nzcv) = AuxFunc::addWithCarry( - operands[1].get(), ~static_cast(metadata.operands[1].imm), 1); - return nzcv; - } - return static_cast(metadata.operands[2].imm); +/** Helper function for instructions with the format `ccmp rn, #imm #nzcv, + * cc`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns single value of type uint8_t. */ +template +uint8_t ccmp_imm(srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + if (conditionHolds(metadata.cc, sourceValues[0].get())) { + uint8_t nzcv; + std::tie(std::ignore, nzcv) = addWithCarry( + sourceValues[1].get(), ~static_cast(metadata.operands[1].imm), 1); + return nzcv; } + return static_cast(metadata.operands[2].imm); +} - /** Helper function for instructions with the format `ccmp rn, rm, #nzcv, - * cc`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns single value of type uint8_t. */ - template - static uint8_t ccmp_reg( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - if (AuxFunc::conditionHolds(metadata.cc, operands[0].get())) { - uint8_t nzcv; - std::tie(std::ignore, nzcv) = - AuxFunc::addWithCarry(operands[1].get(), ~operands[2].get(), 1); - return nzcv; - } - return static_cast(metadata.operands[2].imm); +/** Helper function for instructions with the format `ccmp rn, rm, #nzcv, + * cc`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns single value of type uint8_t. */ +template +uint8_t ccmp_reg(srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + if (conditionHolds(metadata.cc, sourceValues[0].get())) { + uint8_t nzcv; + std::tie(std::ignore, nzcv) = + addWithCarry(sourceValues[1].get(), ~sourceValues[2].get(), 1); + return nzcv; } + return static_cast(metadata.operands[2].imm); +} - /** Helper function for instructions with the format `cb rn, #imm`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns tuple of type [bool branch taken, uint64_t address]. */ - template - static std::tuple condBranch_cmpToZero( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - uint64_t instructionAddress, std::function func) { - bool branchTaken; - uint64_t branchAddress; - if (func(operands[0].get())) { - branchTaken = true; - branchAddress = instructionAddress + metadata.operands[1].imm; - } else { - branchTaken = false; - branchAddress = instructionAddress + 4; - } - return {branchTaken, branchAddress}; +/** Helper function for instructions with the format `cb rn, #imm`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns tuple of type [bool branch taken, uint64_t address]. 
*/ +template +std::tuple condBranch_cmpToZero( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + uint64_t instructionAddress, std::function func) { + bool branchTaken; + uint64_t branchAddress; + if (func(sourceValues[0].get())) { + branchTaken = true; + branchAddress = instructionAddress + metadata.operands[1].imm; + } else { + branchTaken = false; + branchAddress = instructionAddress + 4; } + return {branchTaken, branchAddress}; +} - /** Helper function for instructions with the format `cs - * rd, rn, rm, cc`. - * T represents the type of operands (e.g. for xd, T = uint64_t). - * Returns single value of type T. */ - template - static T cs_4ops(std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - std::function func) { - if (AuxFunc::conditionHolds(metadata.cc, operands[0].get())) { - return operands[1].get(); - } - return func(operands[2].get()); +/** Helper function for instructions with the format `cs + * rd, rn, rm, cc`. + * T represents the type of sourceValues (e.g. for xd, T = uint64_t). + * Returns single value of type T. */ +template +T cs_4ops(srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + std::function func) { + if (conditionHolds(metadata.cc, sourceValues[0].get())) { + return sourceValues[1].get(); } + return func(sourceValues[2].get()); +} - /** Helper function for instructions with the format `tb rn, #imm, - * label`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns tuple of type [bool branch taken, uint64_t address]. */ - template - static std::tuple tbnz_tbz( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - uint64_t instructionAddress, bool isNZ) { - bool branchTaken; - uint64_t branchAddress = instructionAddress; - if (operands[0].get() & - (static_cast(1) << metadata.operands[1].imm)) { - branchTaken = isNZ; - } else { - branchTaken = !isNZ; - } - branchAddress += branchTaken ? metadata.operands[2].imm : 4; - return {branchTaken, branchAddress}; +/** Helper function for instructions with the format `tb rn, #imm, + * label`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns tuple of type [bool branch taken, uint64_t address]. */ +template +std::tuple tbnz_tbz( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + uint64_t instructionAddress, bool isNZ) { + bool branchTaken; + uint64_t branchAddress = instructionAddress; + if (sourceValues[0].get() & + (static_cast(1) << metadata.operands[1].imm)) { + branchTaken = isNZ; + } else { + branchTaken = !isNZ; } -}; + branchAddress += branchTaken ? metadata.operands[2].imm : 4; + return {branchTaken, branchAddress}; +} + } // namespace aarch64 } // namespace arch } // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/aarch64/helpers/divide.hh b/src/include/simeng/arch/aarch64/helpers/divide.hh index f4e226a0b6..02dae07d9e 100644 --- a/src/include/simeng/arch/aarch64/helpers/divide.hh +++ b/src/include/simeng/arch/aarch64/helpers/divide.hh @@ -5,19 +5,18 @@ namespace simeng { namespace arch { namespace aarch64 { -class divideHelp { - public: - /** Helper function for instructions with the format `div rd, rn, rm`. - * T represents the type of operands (e.g. for xd, T = uint64_t). - * Returns single value of type T. 
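`tbnz_tbz` models the TBZ/TBNZ pair: test one bit of the source register and branch by the immediate offset when the test matches. A sketch of that decision with hypothetical addresses and operands:

```cpp
#include <cstdint>
#include <iostream>
#include <tuple>

// Decide whether a TBNZ (isNZ = true) or TBZ (isNZ = false) branch is taken,
// and compute the resulting target address, mirroring the helper's logic.
std::tuple<bool, uint64_t> tbBranch(uint64_t regValue, uint8_t bit,
                                    int64_t offset, uint64_t insnAddress,
                                    bool isNZ) {
  bool bitSet = regValue & (static_cast<uint64_t>(1) << bit);
  bool taken = bitSet ? isNZ : !isNZ;
  uint64_t target = insnAddress + (taken ? offset : 4);
  return {taken, target};
}

int main() {
  // A TBNZ on bit 3 with the register holding 0b1000: the bit is set, so taken.
  auto [taken, target] = tbBranch(0b1000, 3, 0x40, 0x400000, true);
  std::cout << taken << " 0x" << std::hex << target << "\n";  // 1 0x400040
  return 0;
}
```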
*/ - template - static T div_3ops(std::vector& operands) { - const T n = operands[0].get(); - const T m = operands[1].get(); - if (m == 0) return 0; - return (n / m); - } -}; + +/** Helper function for instructions with the format `div rd, rn, rm`. + * T represents the type of sourceValues (e.g. for xd, T = uint64_t). + * Returns single value of type T. */ +template +T div_3ops(srcValContainer& sourceValues) { + const T n = sourceValues[0].get(); + const T m = sourceValues[1].get(); + if (m == 0) return 0; + return (n / m); +} + } // namespace aarch64 } // namespace arch } // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/aarch64/helpers/float.hh b/src/include/simeng/arch/aarch64/helpers/float.hh index 05261173de..454f50070c 100644 --- a/src/include/simeng/arch/aarch64/helpers/float.hh +++ b/src/include/simeng/arch/aarch64/helpers/float.hh @@ -1,156 +1,199 @@ #pragma once +#include + #include "auxiliaryFunctions.hh" namespace simeng { namespace arch { namespace aarch64 { -class floatHelp { - public: - /** Helper function for instructions with the format `fabd rd, rn, rm`. - * T represents the type of operands (e.g. for sd T = float). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue fabd_3ops(std::vector& operands) { - const T n = operands[0].get(); - const T m = operands[1].get(); - return {std::fabs(n - m), 256}; - } - /** Helper function for instructions with the format `fabs rd, rn`. - * T represents the type of operands (e.g. for sd T = float). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue fabs_2ops(std::vector& operands) { - const T n = operands[0].get(); - return {std::fabs(n), 256}; - } +/** Helper function for instructions with the format `fabd rd, rn, rm`. + * T represents the type of sourceValues (e.g. for sd T = float). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue fabd_3ops(srcValContainer& sourceValues) { + const T n = sourceValues[0].get(); + const T m = sourceValues[1].get(); + return {std::fabs(n - m), 256}; +} - /** Helper function for instructions with the format `fccmp rn, rm, #nzcv, - * cc`. - * T represents the type of operands (e.g. for sn T = float). - * Returns single value of type uint8_t. */ - template - static uint8_t fccmp( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - if (AuxFunc::conditionHolds(metadata.cc, operands[0].get())) { - T a = operands[1].get(); - T b = operands[2].get(); - if (std::isnan(a) || std::isnan(b)) { - // TODO: Raise exception if NaNs are signalling or fcmpe - return AuxFunc::nzcv(false, false, true, true); - } else if (a == b) { - return AuxFunc::nzcv(false, true, true, false); - } else if (a < b) { - return AuxFunc::nzcv(true, false, false, false); - } else { - return AuxFunc::nzcv(false, false, true, false); - } - } - return static_cast(metadata.operands[2].imm); - } +/** Helper function for instructions with the format `fabs rd, rn`. + * T represents the type of sourceValues (e.g. for sd T = float). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue fabs_2ops(srcValContainer& sourceValues) { + const T n = sourceValues[0].get(); + return {std::fabs(n), 256}; +} - /** Helper function for instructions with the format `fcmp rn, `. - * T represents the type of operands (e.g. for sn T = float). - * Returns single value of type uint8_t. 
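[Editor's note] The `div_3ops` helper above encodes the AArch64 rule that SDIV/UDIV by zero produce 0 rather than a fault. A minimal standalone sketch of the same behaviour, with plain scalar arguments assumed in place of SimEng's operand container:

```cpp
// Standalone sketch (not SimEng API): integer division by zero yields 0,
// mirroring div_3ops above.
#include <cstdint>
#include <iostream>

template <typename T>
T divSketch(T n, T m) {
  if (m == 0) return 0;  // Architecturally defined result, no exception
  return n / m;
}

int main() {
  std::cout << divSketch<uint64_t>(42, 5) << "\n";  // 8
  std::cout << divSketch<int64_t>(42, 0) << "\n";   // 0
  return 0;
}
```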
*/ - template - static uint8_t fcmp(std::vector& operands, bool useImm) { - T a = operands[0].get(); - // Dont need to fetch imm as will always be 0.0 - T b = useImm ? 0 : operands[1].get(); +/** Helper function for instructions with the format `fccmp rn, rm, #nzcv, + * cc`. + * T represents the type of sourceValues (e.g. for sn T = float). + * Returns single value of type uint8_t. */ +template +uint8_t fccmp(srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + if (conditionHolds(metadata.cc, sourceValues[0].get())) { + T a = sourceValues[1].get(); + T b = sourceValues[2].get(); if (std::isnan(a) || std::isnan(b)) { // TODO: Raise exception if NaNs are signalling or fcmpe - return AuxFunc::nzcv(false, false, true, true); + return nzcv(false, false, true, true); } else if (a == b) { - return AuxFunc::nzcv(false, true, true, false); + return nzcv(false, true, true, false); } else if (a < b) { - return AuxFunc::nzcv(true, false, false, false); + return nzcv(true, false, false, false); + } else { + return nzcv(false, false, true, false); } - return AuxFunc::nzcv(false, false, true, false); } + return static_cast(metadata.operands[2].imm); +} - /** Helper function for instructions with the format `fmaxnm rd, rn, rm`. - * T represents the type of operands (e.g. for sd T = float). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue fmaxnm_3ops(std::vector& operands) { - const T n = operands[0].get(); - const T m = operands[1].get(); - return {std::fmax(n, m), 256}; +/** Helper function for instructions with the format `fcmp rn, `. + * T represents the type of sourceValues (e.g. for sn T = float). + * Returns single value of type uint8_t. */ +template +uint8_t fcmp(srcValContainer& sourceValues, bool useImm) { + T a = sourceValues[0].get(); + // Dont need to fetch imm as will always be 0.0 + T b = useImm ? 0 : sourceValues[1].get(); + if (std::isnan(a) || std::isnan(b)) { + // TODO: Raise exception if NaNs are signalling or fcmpe + return nzcv(false, false, true, true); + } else if (a == b) { + return nzcv(false, true, true, false); + } else if (a < b) { + return nzcv(true, false, false, false); } + return nzcv(false, false, true, false); +} - /** Helper function for instructions with the format `fmaxnm rd, rn, rm`. - * T represents the type of operands (e.g. for sd T = float). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue fminnm_3ops(std::vector& operands) { - const T n = operands[0].get(); - const T m = operands[1].get(); - return {std::fmin(n, m), 256}; - } +/** Helper function for instructions with the format `fmaxnm rd, rn, rm`. + * T represents the type of sourceValues (e.g. for sd T = float). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue fmaxnm_3ops(srcValContainer& sourceValues) { + const T n = sourceValues[0].get(); + const T m = sourceValues[1].get(); + return {std::fmax(n, m), 256}; +} - /** Helper function for NEON instructions with the format `fnmsub rd, rn, rm, - * ra`. - * T represents the type of operands (e.g. for sd T = float). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue fnmsub_4ops(std::vector& operands) { - T n = operands[0].get(); - T m = operands[1].get(); - T a = operands[2].get(); - return {std::fma(n, m, -a), 256}; - } +/** Helper function for instructions with the format `fmaxnm rd, rn, rm`. + * T represents the type of sourceValues (e.g. for sd T = float). 
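[Editor's note] The `fcmp`/`fccmp` helpers above map a floating-point comparison onto one of four NZCV outcomes (unordered, equal, less, greater). The sketch below works through those outcomes standalone; the bit packing N=bit3, Z=bit2, C=bit1, V=bit0 is an assumption for illustration, with SimEng's `nzcv()` helper defining the real layout.

```cpp
// Standalone sketch (not SimEng API): the four NZCV outcomes produced by the
// fcmp helper above. Bit packing N=bit3..V=bit0 is assumed for illustration.
#include <cmath>
#include <cstdint>
#include <iostream>

uint8_t packNZCV(bool n, bool z, bool c, bool v) {
  return (n << 3) | (z << 2) | (c << 1) | v;
}

template <typename T>
uint8_t fcmpSketch(T a, T b) {
  if (std::isnan(a) || std::isnan(b)) return packNZCV(0, 0, 1, 1);  // unordered
  if (a == b) return packNZCV(0, 1, 1, 0);                          // equal
  if (a < b) return packNZCV(1, 0, 0, 0);                           // less than
  return packNZCV(0, 0, 1, 0);                                      // greater than
}

int main() {
  std::cout << std::hex << +fcmpSketch(1.0, 1.0) << "\n";           // 6
  std::cout << std::hex << +fcmpSketch(0.5, 2.0) << "\n";           // 8
  std::cout << std::hex << +fcmpSketch(std::nan(""), 1.0) << "\n";  // 3
  return 0;
}
```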
+ * Returns correctly formatted RegisterValue. */ +template +RegisterValue fminnm_3ops(srcValContainer& sourceValues) { + const T n = sourceValues[0].get(); + const T m = sourceValues[1].get(); + return {std::fmin(n, m), 256}; +} - /** Helper function for NEON instructions with the format `fnmadd rd, rn, rm, - * ra`. - * T represents the type of operands (e.g. for sd T = float). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue fnmadd_4ops(std::vector& operands) { - T n = operands[0].get(); - T m = operands[1].get(); - T a = operands[2].get(); - return {std::fma(-n, m, -a), 256}; - } +/** Helper function for NEON instructions with the format `fnmsub rd, rn, rm, + * ra`. + * T represents the type of sourceValues (e.g. for sd T = float). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue fnmsub_4ops(srcValContainer& sourceValues) { + T n = sourceValues[0].get(); + T m = sourceValues[1].get(); + T a = sourceValues[2].get(); + return {std::fma(n, m, -a), 256}; +} - /** Helper function for NEON instructions with the format `frintp rd, rn`. - * T represents the type of operands (e.g. for dd T = double). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue frintpScalar_2ops(std::vector& operands) { - T n = operands[0].get(); - - // Merge always = false due to assumption that FPCR.nep bit = 0 - // (In SimEng the value of this register is not manually set) - T out = 0; - // Input of Infinity or 0 gives output of the same sign - if (n == 0.0 || n == -0.0 || n == INFINITY || n == -INFINITY) - out = n; - else - out = std::ceil(n); - - return {out, 256}; - } +/** Helper function for NEON instructions with the format `fnmadd rd, rn, rm, + * ra`. + * T represents the type of sourceValues (e.g. for sd T = float). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue fnmadd_4ops(srcValContainer& sourceValues) { + T n = sourceValues[0].get(); + T m = sourceValues[1].get(); + T a = sourceValues[2].get(); + return {std::fma(-n, m, -a), 256}; +} + +/** Helper function for NEON instructions with the format `frintp rd, rn`. + * T represents the type of sourceValues (e.g. for dd T = double). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue frintpScalar_2ops(srcValContainer& sourceValues) { + T n = sourceValues[0].get(); + + // Merge always = false due to assumption that FPCR.nep bit = 0 + // (In SimEng the value of this register is not manually set) + T out = 0; + // Input of Infinity or 0 gives output of the same sign + if (n == 0.0 || n == -0.0 || n == INFINITY || n == -INFINITY) + out = n; + else + out = std::ceil(n); - /** Helper function for NEON instructions with the format `scvtf rd, - * n`, #fbits. - * D represents the destination vector register type (e.g. for dd, D = - * double). - * N represents the source vector register type (e.g. for wn, N = int32_t). - * Returns correctly formated RegisterValue. */ - template - static RegisterValue scvtf_FixedPoint( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - N n = operands[0].get(); - const uint8_t fbits = metadata.operands[2].imm; - - D out = static_cast(n) / std::pow(2, fbits); - - return {out, 256}; + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `scvtf rd, + * n`, #fbits. + * D represents the destination vector register type (e.g. for dd, D = + * double). + * N represents the source vector register type (e.g. for wn, N = int32_t). 
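[Editor's note] `frintpScalar_2ops` above rounds towards plus infinity while passing zeroes and infinities through with their sign. A hedged standalone restatement of that rounding behaviour, using a plain double instead of SimEng's operand/RegisterValue types:

```cpp
// Standalone sketch (not SimEng API): round-towards-plus-infinity behaviour
// mirrored by frintpScalar_2ops above.
#include <cmath>
#include <iostream>

double frintpSketch(double n) {
  // Zeroes and infinities keep their sign; everything else rounds up.
  if (n == 0.0 || std::isinf(n)) return n;
  return std::ceil(n);
}

int main() {
  std::cout << frintpSketch(1.2) << "\n";   // 2
  std::cout << frintpSketch(-1.2) << "\n";  // -1 (towards +infinity)
  std::cout << frintpSketch(-0.0) << "\n";  // -0 (sign preserved)
  return 0;
}
```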
+ * Returns correctly formatted RegisterValue. */ +template +RegisterValue scvtf_FixedPoint( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + N n = sourceValues[0].get(); + const uint8_t fbits = metadata.operands[2].imm; + + D out = static_cast(n) / std::pow(2, fbits); + + return {out, 256}; +} + +/** Helper function for NEON instructions with the format fcvtzu rd, rn. + * D represents the destination register type (e.g. for Xd, D = uint64_t). + * N represents the source register type (e.g. for Sd, N = float). + * Returns single value of type D. */ +template +D fcvtzu_integer(srcValContainer& sourceValues) { + // Ensure types so that we know behaviour of inaccurate type conversions + static_assert((std::is_same() || std::is_same()) && + "N is not a valid type which should be float or double"); + static_assert((std::is_same() || std::is_same()) && + "D is not a valid type which should be int32_t or int64_t"); + + N input = sourceValues[0].get(); + D result = static_cast(0); + + // Check for nan and less than 0 + if (!std::isnan(input) && (input > static_cast(0))) { + if (std::isinf(input)) { + // Account for Infinity + result = std::numeric_limits::max(); + } else if (static_cast(input) >= + static_cast(std::numeric_limits::max())) { + // Cast to double to ensure no precision errors. Float can't store uint32 + // or uint64 max values accurately as not enough bits available. This + // causes unwanted comparison behaviour + // + // max() will be either 4294967295 or 18446744073709551615 + // Casting to float results in the following (incorrect) values 4294967296 + // (+1) or 18446744073709551616 (+1) + // + // Casting to double results in no erroneous conversion. + + // Account for the source value being larger than the + // destination register can support + result = std::numeric_limits::max(); + } else { + result = static_cast(std::trunc(input)); + } } -}; + + return result; +} + } // namespace aarch64 } // namespace arch } // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/aarch64/helpers/load.hh b/src/include/simeng/arch/aarch64/helpers/load.hh deleted file mode 100644 index 08f68d726a..0000000000 --- a/src/include/simeng/arch/aarch64/helpers/load.hh +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include "auxiliaryFunctions.hh" - -namespace simeng { -namespace arch { -namespace aarch64 { -class loadHelp { - public: - static void tempFunc() { return; } -}; -} // namespace aarch64 -} // namespace arch -} // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/aarch64/helpers/logical.hh b/src/include/simeng/arch/aarch64/helpers/logical.hh index 154bf2e59a..c2b36c74f9 100644 --- a/src/include/simeng/arch/aarch64/helpers/logical.hh +++ b/src/include/simeng/arch/aarch64/helpers/logical.hh @@ -5,105 +5,102 @@ namespace simeng { namespace arch { namespace aarch64 { -class logicalHelp { - public: - /** Helper function for instructions with the format `asrv rd, rn, rm`. - * T represents the type of operands (e.g. for xn, T = int64_t). - * Returns single value of type T. */ - template - static T asrv_3gpr(std::vector& operands) { - const T n = operands[0].get(); - const T m = operands[1].get::type>(); - return n >> (m % (sizeof(T) * 8)); - } - /** Helper function for instructions with the format `bic rd, rn, rm{, shift - * #amount}`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns tuple of [resulting value, nzcv]. 
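[Editor's note] The long comment inside `fcvtzu_integer` above explains why the saturation check is done in `double`: a `float` cannot hold `UINT32_MAX`/`UINT64_MAX` exactly, so comparing in `float` would misjudge the boundary by one ULP. The sketch below demonstrates that reasoning for a 32-bit destination; it is a standalone illustration with plain types, not the SimEng helper itself.

```cpp
// Standalone sketch (not SimEng API) of the saturation reasoning documented
// in fcvtzu_integer above, for a float -> uint32_t conversion.
#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>

uint32_t fcvtzuSketch(float input) {
  if (std::isnan(input) || input <= 0.0f) return 0;  // NaN and negatives -> 0
  if (std::isinf(input)) return std::numeric_limits<uint32_t>::max();
  // Compare in double: (float)UINT32_MAX rounds up to 4294967296.0f, which
  // would otherwise let a just-out-of-range input slip past the check.
  if (static_cast<double>(input) >=
      static_cast<double>(std::numeric_limits<uint32_t>::max()))
    return std::numeric_limits<uint32_t>::max();
  return static_cast<uint32_t>(std::truncf(input));  // truncate towards zero
}

int main() {
  std::cout << fcvtzuSketch(1.9f) << "\n";   // 1
  std::cout << fcvtzuSketch(-3.5f) << "\n";  // 0
  std::cout << fcvtzuSketch(5e12f) << "\n";  // 4294967295 (saturated)
  return 0;
}
```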
*/ - template - static std::tuple bicShift_3ops( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - bool calcNZCV) { - const T x = operands[0].get(); - const T y = - ~shiftValue(operands[1].get(), metadata.operands[2].shift.type, - metadata.operands[2].shift.value); - T result = x & y; - bool n = sizeof(T) == 8 ? (static_cast(result) < 0) - : (static_cast(result) < 0); - bool z = (result == 0); - uint8_t nzcv = calcNZCV ? AuxFunc::nzcv(n, z, false, false) : 0; - return {result, nzcv}; - } +/** Helper function for instructions with the format `asrv rd, rn, rm`. + * T represents the type of sourceValues (e.g. for xn, T = int64_t). + * Returns single value of type T. */ +template +T asrv_3gpr(srcValContainer& sourceValues) { + const T n = sourceValues[0].get(); + const T m = sourceValues[1].get::type>(); + return n >> (m % (sizeof(T) * 8)); +} - /** Helper function for instructions with the format ` rd, rn, - * #imm`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns tuple of [resulting value, nzcv]. */ - template - static std::tuple logicOp_imm( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, bool calcNZCV, - std::function func) { - const T n = operands[0].get(); - const T m = static_cast(metadata.operands[2].imm); - T result = func(n, m); - uint8_t nzcv = calcNZCV ? AuxFunc::nzcv(result >> ((sizeof(T) * 8) - 1), - result == 0, false, false) - : 0; - return {result, nzcv}; - } +/** Helper function for instructions with the format `bic rd, rn, rm{, shift + * #amount}`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns tuple of [resulting value, nzcv]. */ +template +std::tuple bicShift_3ops( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, bool calcNZCV) { + const T x = sourceValues[0].get(); + const T y = + ~shiftValue(sourceValues[1].get(), metadata.operands[2].shift.type, + metadata.operands[2].shift.value); + T result = x & y; + bool n = sizeof(T) == 8 ? (static_cast(result) < 0) + : (static_cast(result) < 0); + bool z = (result == 0); + uint8_t nzcv_ = calcNZCV ? nzcv(n, z, false, false) : 0; + return {result, nzcv_}; +} - /** Helper function for instructions with the format ` rd, rn, - * rm{, shift #amount}`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns tuple of [resulting value, nzcv]. */ - template - static std::tuple logicOpShift_3ops( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, bool calcNZCV, - std::function func) { - const T n = operands[0].get(); - const T m = - shiftValue(operands[1].get(), metadata.operands[2].shift.type, - metadata.operands[2].shift.value); - T result = func(n, m); - uint8_t nzcv = calcNZCV ? AuxFunc::nzcv(result >> ((sizeof(T) * 8) - 1), - result == 0, false, false) - : 0; - return {result, nzcv}; - } +/** Helper function for instructions with the format ` rd, rn, + * #imm`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns tuple of [resulting value, nzcv]. */ +template +std::tuple logicOp_imm( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, bool calcNZCV, + std::function func) { + const T n = sourceValues[0].get(); + const T m = static_cast(metadata.operands[2].imm); + T result = func(n, m); + uint8_t nzcv_ = calcNZCV ? 
nzcv(result >> ((sizeof(T) * 8) - 1), result == 0, + false, false) + : 0; + return {result, nzcv_}; +} - /** Helper function for instructions with the format `lsv rd, rn, rm`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns single value of type uint64_t. */ - template - static uint64_t logicalShiftLR_3ops(std::vector& operands, - bool isLSL) { - const T n = operands[0].get(); - const T m = operands[1].get() & ((sizeof(T) * 8) - 1); - uint64_t result = static_cast(isLSL ? n << m : n >> m); - return result; - } +/** Helper function for instructions with the format ` rd, rn, + * rm{, shift #amount}`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns tuple of [resulting value, nzcv]. */ +template +std::tuple logicOpShift_3ops( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, bool calcNZCV, + std::function func) { + const T n = sourceValues[0].get(); + const T m = + shiftValue(sourceValues[1].get(), metadata.operands[2].shift.type, + metadata.operands[2].shift.value); + T result = func(n, m); + uint8_t nzcv_ = calcNZCV ? nzcv(result >> ((sizeof(T) * 8) - 1), result == 0, + false, false) + : 0; + return {result, nzcv_}; +} - /** Helper function for instructions with the format `rorv rd, rn, rm`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns single value of type T. */ - template - static T rorv_3ops(std::vector& operands) { - const T n = operands[0].get(); - const T m = operands[1].get(); +/** Helper function for instructions with the format `lsv rd, rn, rm`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns single value of type uint64_t. */ +template +uint64_t logicalShiftLR_3ops(srcValContainer& sourceValues, bool isLSL) { + const T n = sourceValues[0].get(); + const T m = sourceValues[1].get() & ((sizeof(T) * 8) - 1); + uint64_t result = static_cast(isLSL ? n << m : n >> m); + return result; +} - const uint16_t data_size = sizeof(T) * 8; - T remainder = m % data_size; +/** Helper function for instructions with the format `rorv rd, rn, rm`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns single value of type T. */ +template +T rorv_3ops(srcValContainer& sourceValues) { + const T n = sourceValues[0].get(); + const T m = sourceValues[1].get(); + + const uint16_t data_size = sizeof(T) * 8; + T remainder = m % data_size; + + // Check if any rotation done at all + if (remainder == 0) return n; + return (n >> remainder) + (n << (data_size - remainder)); +} - // Check if any rotation done at all - if (remainder == 0) return n; - return (n >> remainder) + (n << (data_size - remainder)); - } -}; } // namespace aarch64 } // namespace arch } // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/aarch64/helpers/multiply.hh b/src/include/simeng/arch/aarch64/helpers/multiply.hh index d5466bac04..2c1464e2d5 100644 --- a/src/include/simeng/arch/aarch64/helpers/multiply.hh +++ b/src/include/simeng/arch/aarch64/helpers/multiply.hh @@ -5,54 +5,53 @@ namespace simeng { namespace arch { namespace aarch64 { -class multiplyHelp { - public: - /** Helper function for instructions with the format `madd rd, rn, rm, ra`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns single value of type T. 
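[Editor's note] `rorv_3ops` above rotates right by a variable amount taken modulo the register width. A standalone, hedged sketch of the same rotation for a 32-bit register (plain integer arguments assumed in place of the SimEng operand container):

```cpp
// Standalone sketch (not SimEng API): rotate-right-by-variable logic mirrored
// by rorv_3ops above; the shift amount is reduced modulo the register width.
#include <cstdint>
#include <iostream>

uint32_t rorvSketch(uint32_t n, uint32_t m) {
  const uint32_t width = 32;
  uint32_t amount = m % width;
  if (amount == 0) return n;  // No rotation requested
  return (n >> amount) | (n << (width - amount));
}

int main() {
  std::cout << std::hex << rorvSketch(0x000000F0u, 4) << "\n";   // f
  std::cout << std::hex << rorvSketch(0x00000001u, 1) << "\n";   // 80000000
  std::cout << std::hex << rorvSketch(0xDEADBEEFu, 32) << "\n";  // deadbeef
  return 0;
}
```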
*/ - template - static T madd_4ops(std::vector& operands) { - const T n = operands[0].get(); - const T m = operands[1].get(); - const T a = operands[2].get(); - return (a + (n * m)); - } - /** Helper function for instructions with the format `maddl xd, wn, wm, xa`. - * D represents the type of the destination register (either int64_t or - * uint64_t). - * N represents the type of the first source register (either - * int32_t or uint32_t). - * Returns single value of type D. */ - template - static D maddl_4ops(std::vector& operands) { - const D n = static_cast(operands[0].get()); - const D m = static_cast(operands[1].get()); - const D a = operands[2].get(); - return (a + (n * m)); - } +/** Helper function for instructions with the format `madd rd, rn, rm, ra`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns single value of type T. */ +template +T madd_4ops(srcValContainer& sourceValues) { + const T n = sourceValues[0].get(); + const T m = sourceValues[1].get(); + const T a = sourceValues[2].get(); + return (a + (n * m)); +} - /** Helper function for instructions with the format `mul rd, rn, rm`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns single value of type T. */ - template - static T mul_3ops(std::vector& operands) { - const T n = operands[0].get(); - const T m = operands[1].get(); - return (n * m); - } +/** Helper function for instructions with the format `maddl xd, wn, wm, xa`. + * D represents the type of the destination register (either int64_t or + * uint64_t). + * N represents the type of the first source register (either + * int32_t or uint32_t). + * Returns single value of type D. */ +template +D maddl_4ops(srcValContainer& sourceValues) { + const D n = static_cast(sourceValues[0].get()); + const D m = static_cast(sourceValues[1].get()); + const D a = sourceValues[2].get(); + return (a + (n * m)); +} + +/** Helper function for instructions with the format `mul rd, rn, rm`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns single value of type T. */ +template +T mul_3ops(srcValContainer& sourceValues) { + const T n = sourceValues[0].get(); + const T m = sourceValues[1].get(); + return (n * m); +} + +/** Helper function for instructions with the format `msub rd, rn, rm, ra`. + * T represents the type of sourceValues (e.g. for xn, T = uint64_t). + * Returns single value of type T. */ +template +T msub_4ops(srcValContainer& sourceValues) { + const T n = sourceValues[0].get(); + const T m = sourceValues[1].get(); + const T a = sourceValues[2].get(); + return (a - (n * m)); +} - /** Helper function for instructions with the format `msub rd, rn, rm, ra`. - * T represents the type of operands (e.g. for xn, T = uint64_t). - * Returns single value of type T. */ - template - static T msub_4ops(std::vector& operands) { - const T n = operands[0].get(); - const T m = operands[1].get(); - const T a = operands[2].get(); - return (a - (n * m)); - } -}; } // namespace aarch64 } // namespace arch } // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh index f3707232ed..cc9aa03461 100644 --- a/src/include/simeng/arch/aarch64/helpers/neon.hh +++ b/src/include/simeng/arch/aarch64/helpers/neon.hh @@ -5,891 +5,974 @@ namespace simeng { namespace arch { namespace aarch64 { -class neonHelp { - public: - /** Helper function for NEON instructions with the format `add vd, vn, vm`. 
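[Editor's note] `maddl_4ops` above implements the widening multiply-add (SMADDL/UMADDL): both 32-bit sources are widened before multiplying so the full 64-bit product is accumulated. A standalone sketch of the signed case, with plain scalar arguments assumed:

```cpp
// Standalone sketch (not SimEng API): widening multiply-add mirrored by
// maddl_4ops above. The sources are widened before the multiply so the full
// 64-bit product is kept.
#include <cstdint>
#include <iostream>

int64_t smaddlSketch(int32_t wn, int32_t wm, int64_t xa) {
  return xa + static_cast<int64_t>(wn) * static_cast<int64_t>(wm);
}

int main() {
  // 100000 * 100000 overflows int32_t, but the widened product is exact.
  std::cout << smaddlSketch(100000, 100000, 5) << "\n";  // 10000000005
  return 0;
}
```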
- * T represents the type of operands (e.g. for vn.2d, T = uint64_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted Register Value. */ - template - static RegisterValue vecAdd_3ops(std::vector& operands) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = static_cast(n[i] + m[i]); - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `addp vd, vn, vm`. - * T represents the type of operands (e.g. for vn.2d, T = uint64_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted Register Value. */ - template - static RegisterValue vecAddp_3ops(std::vector& operands) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - T out[16 / sizeof(T)] = {0}; - uint8_t offset = I / 2; - for (int i = 0; i < I; i++) { - if (i < offset) { - out[i] = static_cast(n[i * 2] + n[(i * 2) + 1]); - } else { - out[i] = - static_cast(m[(i - offset) * 2] + m[((i - offset) * 2) + 1]); - } - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `bic vd, vn, vm`. - * T represents the type of operands (e.g. for vn.2d, T = uint64_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted Register Value. */ - template - static RegisterValue vecBic_3ops(std::vector& operands) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = n[i] & ~m[i]; - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `bic vd, #imm{, lsl - * #shift}`. - * T represents the type of operands (e.g. for vn.2d, T = uint64_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted Register Value. */ - template - static RegisterValue vecBicShift_imm( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - const T* d = operands[0].getAsVector(); - T imm = ~shiftValue(static_cast(metadata.operands[1].imm), - metadata.operands[1].shift.type, - metadata.operands[1].shift.value); - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = d[i] & imm; - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `bi vd, vn, - * vm`. - * I represents the number of elements in the output array to be updated - * (e.g. for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecBitwiseInsert(std::vector& operands, - bool isBif) { - const uint64_t* d = operands[0].getAsVector(); - const uint64_t* n = operands[1].getAsVector(); - const uint64_t* m = operands[2].getAsVector(); - uint64_t out[2] = {0}; - for (int i = 0; i < (I / 8); i++) { - out[i] = isBif ? (d[i] & m[i]) | (n[i] & ~m[i]) - : (d[i] & ~m[i]) | (n[i] & m[i]); - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `bsl vd, vn, - * vm`. - * I represents the number of elements in the output array to be updated - * (e.g. for vd.8b I = 8). - * Returns correctly formatted RegisterValue. 
*/ - template - static RegisterValue vecBsl(std::vector& operands) { - const uint64_t* d = operands[0].getAsVector(); - const uint64_t* n = operands[1].getAsVector(); - const uint64_t* m = operands[2].getAsVector(); - uint64_t out[2] = {0}; - for (int i = 0; i < (I / 8); i++) { - out[i] = (d[i] & n[i]) | (~d[i] & m[i]); - } - return {out, 256}; - } - - /** Helper function for instructions with the format `cm vd, vn, `. - * T represents the type of operands (e.g. for vn.2d, T = uint64_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecCompare(std::vector& operands, - bool cmpToZero, - std::function func) { - const T* n = operands[0].getAsVector(); - const T* m; - if (!cmpToZero) m = operands[1].getAsVector(); - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = func(n[i], cmpToZero ? static_cast(0) : m[i]) - ? static_cast(-1) - : 0; - } - return {out, 256}; - } - - /** Helper function for instructions with the format `cnt vd, vn`. - * T represents the type of operands (e.g. for vn.2d, T = uint64_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecCountPerByte(std::vector& operands) { - const uint8_t* n = operands[0].getAsVector(); - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - for (int j = 0; j < (sizeof(T) * 8); j++) { - // Move queried bit to LSB and extract via an AND operator - out[i] += ((n[i] >> j) & 1); - } - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `dup , - * `. - * T represents the type of operands (e.g. for vd.2d, T = uint64_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecDup_gprOrIndex( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, bool useGpr) { - int index = useGpr ? 0 : metadata.operands[1].vector_index; - T element = - useGpr ? operands[0].get() : operands[0].getAsVector()[index]; - T out[16 / sizeof(T)] = {0}; - std::fill_n(std::begin(out), I, element); - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `ext vd, - * vn, vm, #index`. - * T represents the type of operands (e.g. for vn.2d, T = uint64_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecExtVecs_index( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - const uint64_t index = static_cast(metadata.operands[3].imm); - T out[16 / sizeof(T)] = {0}; - - for (int i = index; i < I; i++) { - out[i - index] = n[i]; - } - for (int i = 0; i < index; i++) { - out[I - index + i] = m[i]; - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `fabs vd, vn`. - * T represents the type of operands (e.g. for vn.2d, T = double). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. 
*/ - template - static RegisterValue vecFabs_2ops(std::vector& operands) { - const T* n = operands[0].getAsVector(); - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = std::fabs(n[i]); - } - return {out, 256}; - } - - /** Helper function for instructions with the format `fcm vd, vn, `. - * T represents operand type (e.g. vd.2d is double). - * C represents comparison type (e.g. for T=float, comparison type is - * uint32_t). - * I represents the number of elements in the output array to be - * updated (e.g. for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecFCompare(std::vector& operands, - bool cmpToZero, - std::function func) { - const T* n = operands[0].getAsVector(); - const T* m; - if (!cmpToZero) m = operands[1].getAsVector(); - C out[16 / sizeof(C)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = func(n[i], cmpToZero ? static_cast(0) : m[i]) - ? static_cast(-1) - : 0; - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `fcvtl{2} vd, vn`. - * D represents the dest. vector register type (e.g. vd.2d would be double). - * N represents the source vector register type (e.g. vd.4s would be float). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecFcvtl(std::vector& operands, - bool isFcvtl2) { - const N* n = operands[0].getAsVector(); - D out[16 / sizeof(D)] = {0}; - for (int i = (isFcvtl2 ? I : 0); i < (isFcvtl2 ? (I * 2) : I); i++) { - out[isFcvtl2 ? (i - I) : i] = static_cast(n[i]); - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `fcvtn{2} vd, vn`. - * D represents the dest. vector register type (e.g. vd.2s would be float). - * N represents the source vector register type (e.g. vd.2d would be double). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecFcvtn(std::vector& operands, - bool isFcvtn2) { - const N* n = operands[0].getAsVector(); - D out[16 / sizeof(D)] = {0}; - for (int i = (isFcvtn2 ? (I / 2) : 0); i < I; i++) { - out[i] = static_cast(n[isFcvtn2 ? (i - (I / 2)) : i]); - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `fcvtzs vd, vn`. - * D represents the dest. vector register type (e.g. vd.2s would be float). - * N represents the source vector register type (e.g. vd.2d would be double). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecFcvtzs(std::vector& operands) { - const N* n = operands[0].getAsVector(); - D out[16 / sizeof(D)] = {0}; - // TODO: Handle NaNs, denorms, and saturation - for (int i = 0; i < I; i++) { - out[i] = static_cast(std::trunc(n[i])); - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `fmla vd, - * vn, vm`. - * T represents the type of operands (e.g. for vn.2d, T = double). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. 
*/ - template - static RegisterValue vecFmla_3vecs(std::vector& operands) { - const T* d = operands[0].getAsVector(); - const T* n = operands[1].getAsVector(); - const T* m = operands[2].getAsVector(); - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = d[i] + n[i] * m[i]; - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `fmla vd, - * vn, vm[index]`. - * T represents the type of operands (e.g. for vn.2d, T = double). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecFmlaIndexed_3vecs( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - const T* d = operands[0].getAsVector(); - const T* n = operands[1].getAsVector(); - int index = metadata.operands[2].vector_index; - const T m = operands[2].getAsVector()[index]; - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = d[i] + n[i] * m; - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `fmls vd, - * vn, vm`. - * T represents the type of operands (e.g. for vn.2d, T = double). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecFmls_3vecs(std::vector& operands) { - const T* d = operands[0].getAsVector(); - const T* n = operands[1].getAsVector(); - const T* m = operands[2].getAsVector(); - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = d[i] - (n[i] * m[i]); - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `fmls vd, - * vn, vm[index]`. - * T represents the type of operands (e.g. for vn.2d, T = double). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecFmlsIndexed_3vecs( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - const T* d = operands[0].getAsVector(); - const T* n = operands[1].getAsVector(); - int index = metadata.operands[2].vector_index; - const T m = operands[2].getAsVector()[index]; - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = d[i] - n[i] * m; - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `fmul rd, - * rn, vm[index]`. - * T represents the type of operands (e.g. for vn.2d, T = double). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecFmulIndexed_vecs( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - int index = metadata.operands[2].vector_index; - const T* n = operands[0].getAsVector(); - const T m = operands[1].getAsVector()[index]; - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = n[i] * m; - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `fneg vd, vn`. - * T represents the type of operands (e.g. for vn.2d, T = double). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. 
*/ - template - static RegisterValue vecFneg_2ops(std::vector& operands) { - const T* n = operands[0].getAsVector(); - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = -n[i]; - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `fsqrt vd, vn`. - * T represents the type of operands (e.g. for vn.2d, T = double). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecFsqrt_2ops(std::vector& operands) { - const T* n = operands[0].getAsVector(); - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = ::sqrt(n[i]); - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `frsqrte vd, vn`. - * T represents the type of operands (e.g. for vn.2d, T = double). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecFrsqrte_2ops(std::vector& operands) { - const T* n = operands[0].getAsVector(); - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = 1.0f / sqrtf(n[i]); - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `frsqrts vd, vn, - * vm`. - * T represents the type of operands (e.g. for vn.2d, T = double). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecFrsqrts_3ops(std::vector& operands) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = (3.0f - n[i] * m[i]) / 2.0f; - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `ins vd[index], - * vn[index]`. - * T represents the type of operands (e.g. for vn.2d, T = uint64_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecIns_2Index( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - const T* d = operands[0].getAsVector(); - const T* n = operands[1].getAsVector(); - - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = d[i]; - } - out[metadata.operands[0].vector_index] = - n[metadata.operands[1].vector_index]; - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `ins vd[index], - * rn`. - * T represents the vector register type (e.g. vd.16b would be uint8_t). - * R represents the type of the GPR (e.g. wn would be uint32_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecInsIndex_gpr( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - const T* d = operands[0].getAsVector(); - const T n = operands[1].get(); - T out[16 / sizeof(T)] = {0}; - - for (int i = 0; i < I; i++) { + +/** Helper function for NEON instructions with the format `add vd, vn, vm`. + * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). 
+ * Returns correctly formatted Register Value. */ +template +RegisterValue vecAdd_3ops(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = static_cast(n[i] + m[i]); + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `addp vd, vn, vm`. + * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted Register Value. */ +template +RegisterValue vecAddp_3ops(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + T out[16 / sizeof(T)] = {0}; + uint8_t offset = I / 2; + for (int i = 0; i < I; i++) { + if (i < offset) { + out[i] = static_cast(n[i * 2] + n[(i * 2) + 1]); + } else { + out[i] = static_cast(m[(i - offset) * 2] + m[((i - offset) * 2) + 1]); + } + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `bic vd, vn, vm`. + * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted Register Value. */ +template +RegisterValue vecBic_3ops(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = n[i] & ~m[i]; + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `bic vd, #imm{, lsl + * #shift}`. + * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted Register Value. */ +template +RegisterValue vecBicShift_imm( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + const T* d = sourceValues[0].getAsVector(); + T imm = ~shiftValue(static_cast(metadata.operands[1].imm), + metadata.operands[1].shift.type, + metadata.operands[1].shift.value); + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = d[i] & imm; + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `bi vd, vn, + * vm`. + * I represents the number of elements in the output array to be updated + * (e.g. for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecBitwiseInsert(srcValContainer& sourceValues, bool isBif) { + const uint64_t* d = sourceValues[0].getAsVector(); + const uint64_t* n = sourceValues[1].getAsVector(); + const uint64_t* m = sourceValues[2].getAsVector(); + uint64_t out[2] = {0}; + for (int i = 0; i < (I / 8); i++) { + out[i] = + isBif ? (d[i] & m[i]) | (n[i] & ~m[i]) : (d[i] & ~m[i]) | (n[i] & m[i]); + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `bsl vd, vn, + * vm`. + * I represents the number of elements in the output array to be updated + * (e.g. for vd.8b I = 8). + * Returns correctly formatted RegisterValue. 
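[Editor's note] `vecAddp_3ops` above sums adjacent pairs from the concatenation of the two source vectors, with `vn` supplying the low half of the result and `vm` the high half. The sketch below restates that element ordering standalone, with `std::array` assumed in place of SimEng's vector register accessors:

```cpp
// Standalone sketch (not SimEng API): pairwise-add element ordering mirrored
// by vecAddp_3ops above.
#include <array>
#include <cstdint>
#include <iostream>

template <size_t I>
std::array<uint32_t, I> addpSketch(const std::array<uint32_t, I>& n,
                                   const std::array<uint32_t, I>& m) {
  std::array<uint32_t, I> out{};
  for (size_t i = 0; i < I / 2; i++) {
    out[i] = n[2 * i] + n[2 * i + 1];          // pairs from vn -> low half
    out[I / 2 + i] = m[2 * i] + m[2 * i + 1];  // pairs from vm -> high half
  }
  return out;
}

int main() {
  std::array<uint32_t, 4> n{1, 2, 3, 4}, m{10, 20, 30, 40};
  for (uint32_t v : addpSketch(n, m)) std::cout << v << " ";  // 3 7 30 70
  std::cout << "\n";
  return 0;
}
```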
*/ +template +RegisterValue vecBsl(srcValContainer& sourceValues) { + const uint64_t* d = sourceValues[0].getAsVector(); + const uint64_t* n = sourceValues[1].getAsVector(); + const uint64_t* m = sourceValues[2].getAsVector(); + uint64_t out[2] = {0}; + for (int i = 0; i < (I / 8); i++) { + out[i] = (d[i] & n[i]) | (~d[i] & m[i]); + } + return {out, 256}; +} + +/** Helper function for instructions with the format `cm vd, vn, `. + * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecCompare(srcValContainer& sourceValues, bool cmpToZero, + std::function func) { + const T* n = sourceValues[0].getAsVector(); + const T* m; + if (!cmpToZero) m = sourceValues[1].getAsVector(); + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = func(n[i], cmpToZero ? static_cast(0) : m[i]) + ? static_cast(-1) + : 0; + } + return {out, 256}; +} + +/** Helper function for instructions with the format `cnt vd, vn`. + * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecCountPerByte(srcValContainer& sourceValues) { + const uint8_t* n = sourceValues[0].getAsVector(); + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + for (size_t j = 0; j < (sizeof(T) * 8); j++) { + // Move queried bit to LSB and extract via an AND operator + out[i] += ((n[i] >> j) & 1); + } + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `dup , + * `. + * T represents the type of sourceValues (e.g. for vd.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecDup_gprOrIndex( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, bool useGpr) { + int index = useGpr ? 0 : metadata.operands[1].vector_index; + T element = useGpr ? sourceValues[0].get() + : sourceValues[0].getAsVector()[index]; + T out[16 / sizeof(T)] = {0}; + std::fill_n(std::begin(out), I, element); + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `ext vd, + * vn, vm, #index`. + * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecExtVecs_index( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + const uint64_t index = static_cast(metadata.operands[3].imm); + T out[16 / sizeof(T)] = {0}; + + for (uint64_t i = index; i < I; i++) { + out[i - index] = n[i]; + } + for (uint64_t i = 0; i < index; i++) { + out[I - index + i] = m[i]; + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `fabd vd.T, vn.T, + * vm.T`. + * T represents the type of sourceValues (e.g. for vn.2d, T = double). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. 
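[Editor's note] `vecExtVecs_index` above implements EXT: the two sources are treated as one concatenated vector and `I` elements are copied starting at `#index`. A standalone sketch of that extraction, with `std::array` assumed in place of SimEng's register types:

```cpp
// Standalone sketch (not SimEng API): the EXT extraction mirrored by
// vecExtVecs_index above.
#include <array>
#include <cstdint>
#include <iostream>

template <size_t I>
std::array<uint8_t, I> extSketch(const std::array<uint8_t, I>& n,
                                 const std::array<uint8_t, I>& m,
                                 size_t index) {
  std::array<uint8_t, I> out{};
  for (size_t i = index; i < I; i++) out[i - index] = n[i];      // tail of vn
  for (size_t i = 0; i < index; i++) out[I - index + i] = m[i];  // head of vm
  return out;
}

int main() {
  std::array<uint8_t, 4> n{0, 1, 2, 3}, m{4, 5, 6, 7};
  for (int v : extSketch(n, m, 1)) std::cout << v << " ";  // 1 2 3 4
  std::cout << "\n";
  return 0;
}
```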
*/ +template +RegisterValue vecFabd(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = std::fabs(n[i] - m[i]); + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `fabs vd, vn`. + * T represents the type of sourceValues (e.g. for vn.2d, T = double). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecFabs_2ops(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = std::fabs(n[i]); + } + return {out, 256}; +} + +/** Helper function for instructions with the format `fcm vd, vn, `. + * T represents operand type (e.g. vd.2d is double). + * C represents comparison type (e.g. for T=float, comparison type is + * uint32_t). + * I represents the number of elements in the output array to be + * updated (e.g. for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecFCompare(srcValContainer& sourceValues, bool cmpToZero, + std::function func) { + const T* n = sourceValues[0].getAsVector(); + const T* m; + if (!cmpToZero) m = sourceValues[1].getAsVector(); + C out[16 / sizeof(C)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = func(n[i], cmpToZero ? static_cast(0) : m[i]) + ? static_cast(-1) + : 0; + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `fcvtl{2} vd, vn`. + * D represents the dest. vector register type (e.g. vd.2d would be double). + * N represents the source vector register type (e.g. vd.4s would be float). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecFcvtl(srcValContainer& sourceValues, bool isFcvtl2) { + const N* n = sourceValues[0].getAsVector(); + D out[16 / sizeof(D)] = {0}; + for (int i = (isFcvtl2 ? I : 0); i < (isFcvtl2 ? (I * 2) : I); i++) { + out[isFcvtl2 ? (i - I) : i] = static_cast(n[i]); + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `fcvtn{2} vd, vn`. + * D represents the dest. vector register type (e.g. vd.2s would be float). + * N represents the source vector register type (e.g. vd.2d would be double). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecFcvtn(srcValContainer& sourceValues, bool isFcvtn2) { + const N* n = sourceValues[0].getAsVector(); + D out[16 / sizeof(D)] = {0}; + for (int i = (isFcvtn2 ? (I / 2) : 0); i < I; i++) { + out[i] = static_cast(n[isFcvtn2 ? (i - (I / 2)) : i]); + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `fcvtzs vd, vn`. + * D represents the dest. vector register type (e.g. vd.2s would be float). + * N represents the source vector register type (e.g. vd.2d would be double). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. 
*/ +template +RegisterValue vecFcvtzs(srcValContainer& sourceValues) { + const N* n = sourceValues[0].getAsVector(); + D out[16 / sizeof(D)] = {0}; + // TODO: Handle NaNs, denorms, and saturation + for (int i = 0; i < I; i++) { + out[i] = static_cast(std::trunc(n[i])); + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `fmla vd, + * vn, vm`. + * T represents the type of sourceValues (e.g. for vn.2d, T = double). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecFmla_3vecs(srcValContainer& sourceValues) { + const T* d = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + const T* m = sourceValues[2].getAsVector(); + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = d[i] + n[i] * m[i]; + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `fdiv vd, vn, vm`. + * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +std::enable_if_t, RegisterValue> vecFDiv( + srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + if (m[i] == 0) + out[i] = sizeof(T) == 8 ? std::nan("") : std::nanf(""); + else + out[i] = n[i] / m[i]; + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `fmla vd, + * vn, vm[index]`. + * T represents the type of sourceValues (e.g. for vn.2d, T = double). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecFmlaIndexed_3vecs( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + const T* d = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + int index = metadata.operands[2].vector_index; + const T m = sourceValues[2].getAsVector()[index]; + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = d[i] + n[i] * m; + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `fmls vd, + * vn, vm`. + * T represents the type of sourceValues (e.g. for vn.2d, T = double). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecFmls_3vecs(srcValContainer& sourceValues) { + const T* d = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + const T* m = sourceValues[2].getAsVector(); + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = d[i] - (n[i] * m[i]); + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `fmls vd, + * vn, vm[index]`. + * T represents the type of sourceValues (e.g. for vn.2d, T = double). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. 
*/ +template +RegisterValue vecFmlsIndexed_3vecs( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + const T* d = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + int index = metadata.operands[2].vector_index; + const T m = sourceValues[2].getAsVector()[index]; + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = d[i] - n[i] * m; + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `fmul rd, + * rn, vm[index]`. + * T represents the type of sourceValues (e.g. for vn.2d, T = double). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecFmulIndexed_vecs( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + int index = metadata.operands[2].vector_index; + const T* n = sourceValues[0].getAsVector(); + const T m = sourceValues[1].getAsVector()[index]; + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = n[i] * m; + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `fneg vd, vn`. + * T represents the type of sourceValues (e.g. for vn.2d, T = double). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecFneg_2ops(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = -n[i]; + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `fsqrt vd, vn`. + * T represents the type of sourceValues (e.g. for vn.2d, T = double). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecFsqrt_2ops(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = ::sqrt(n[i]); + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `frsqrte vd, vn`. + * T represents the type of sourceValues (e.g. for vn.2d, T = double). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecFrsqrte_2ops(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = 1.0f / sqrtf(n[i]); + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `frsqrts vd, vn, + * vm`. + * T represents the type of sourceValues (e.g. for vn.2d, T = double). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecFrsqrts_3ops(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = (3.0f - n[i] * m[i]) / 2.0f; + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `ins vd[index], + * vn[index]`. + * T represents the type of sourceValues (e.g. 
for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecIns_2Index( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + const T* d = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = d[i]; + } + out[metadata.operands[0].vector_index] = n[metadata.operands[1].vector_index]; + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `ins vd[index], + * rn`. + * T represents the vector register type (e.g. vd.16b would be uint8_t). + * R represents the type of the GPR (e.g. wn would be uint32_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecInsIndex_gpr( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + const T* d = sourceValues[0].getAsVector(); + const T n = sourceValues[1].get(); + T out[16 / sizeof(T)] = {0}; + + for (int i = 0; i < I; i++) { + out[i] = d[i]; + } + out[metadata.operands[0].vector_index] = n; + return {out, 256}; +} + +/** Helper function for NEON instructions with the format ` vd, + * vn`. + * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecLogicOp_2vecs(srcValContainer& sourceValues, + std::function func) { + const T* n = sourceValues[0].getAsVector(); + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = func(n[i]); + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format ` vd, + * vn, vm`. + * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecLogicOp_3vecs(srcValContainer& sourceValues, + std::function func) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = func(n[i], m[i]); + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `umaxp vd, vn, vm`. + * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecUMaxP(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + + // Concatenate the vectors + T temp[2 * I]; + memcpy(temp, n, sizeof(T) * I); + memcpy(temp + (sizeof(T) * I), m, sizeof(T) * I); + // Compare each adjacent pair of elements + T out[I]; + for (int i = 0; i < I; i++) { + out[i] = std::max(temp[2 * i], temp[2 * i + 1]); + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `uminp vd, vn, vm`. + * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). 
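+ * Illustrative example (assumed instantiation, since the template parameters + * are elided in this hunk): for `uminp v0.4s, v1.4s, v2.4s`, T = uint32_t and + * I = 4, giving out = {min(n[0],n[1]), min(n[2],n[3]), min(m[0],m[1]), + * min(m[2],m[3])}.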
+ * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecUMinP(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + + // Concatenate the vectors + T temp[2 * I]; + memcpy(temp, n, sizeof(T) * I); + memcpy(temp + (sizeof(T) * I), m, sizeof(T) * I); + + T out[I]; + for (int i = 0; i < I; i++) { + out[i] = std::min(temp[2 * i], temp[2 * i + 1]); + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `maxnmp rd, vn`. + * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecMaxnmp_2ops(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + bool isFP = std::is_floating_point::value; + + T out = n[0]; + for (int i = 1; i < I; i++) { + out = isFP ? std::fmax(n[i], out) : std::max(n[i], out); + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `sminv sd, vn`. + * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecMinv_2ops(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + bool isFP = std::is_floating_point::value; + + T out = n[0]; + for (int i = 1; i < I; i++) { + out = isFP ? std::fmin(n[i], out) : std::min(n[i], out); + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `movi vd, #imm`. + * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecMovi_imm( + const simeng::arch::aarch64::InstructionMetadata& metadata) { + bool isFP = std::is_floating_point::value; + const T imm = + isFP ? metadata.operands[1].fp : static_cast(metadata.operands[1].imm); + T out[16 / sizeof(T)] = {0}; + std::fill_n(std::begin(out), I, imm); + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `movi vd, #imm{, lsl + * #shift}`. + * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecMoviShift_imm( + const simeng::arch::aarch64::InstructionMetadata& metadata, bool negate) { + const T bits = shiftValue(static_cast(metadata.operands[1].imm), + metadata.operands[1].shift.type, + metadata.operands[1].shift.value); + T out[16 / sizeof(T)] = {0}; + std::fill_n(std::begin(out), I, negate ? ~bits : bits); + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `scvtf vd, + * vn`. + * D represents the destination vector register type (e.g. for vd.2d, D = + * double). + * N represents the source vector register type (e.g. for vn.2s N = int32_t). + * I represents the number of elements in the output array to be + * updated (e.g. for vd.8b I = 8). + * Returns correctly formated RegisterValue. 
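+ * Illustrative example (assumed template parameters <typename D, typename N, + * int I>, elided in this hunk): `scvtf v0.2d, v1.2d` would use + * vecScvtf_2vecs<double, int64_t, 2>, converting each lane with + * static_cast<double>(n[i]).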
*/ +template +RegisterValue vecScvtf_2vecs(srcValContainer& sourceValues, + std::function func) { + const N* n = sourceValues[0].getAsVector(); + D out[16 / sizeof(D)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = static_cast(n[i]); + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `shl vd, vn, #imm`. + * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecShlShift_vecImm( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + const T* n = sourceValues[0].getAsVector(); + int64_t shift = metadata.operands[2].imm; + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = static_cast(n[i] << shift); + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `shll{2} vd, vn, + * #imm`. + * D represents the destination register type (e.g. for vd.2d D = int64_t). + * N represents the source register type (e.g. for vd.4s D = int32_t). + * I represents the number of elements in the output array to be + * updated (e.g. for vd.8h the I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecShllShift_vecImm( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, bool isShll2) { + const N* n = sourceValues[0].getAsVector(); + uint64_t shift = metadata.operands[2].imm; + D out[16 / sizeof(D)] = {0}; + int index = isShll2 ? I : 0; + for (int i = 0; i < I; i++) { + out[i] = n[index] << shift; + index++; + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `shrn vd, vn, #imm`. + * Ta represents the type of source operand (e.g. for vn.2d, Ta = uint64_t). + * Tb represents the type of destination operand (e.g. for vd.2s, Tb = + * uint32_t). + * I represents the number of elements in the output array to be + * updated (e.g. for vd.8b I = 8). + * Returns correctly formatted RegisterValue. + */ +template +RegisterValue vecShrnShift_imm( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + bool shrn2 = false) { + const Ta* n = sourceValues[0].getAsVector(); + + uint64_t shift = metadata.operands[2].imm; + + Tb out[16 / sizeof(Tb)] = {0}; + int index = shrn2 ? I : 0; + for (int i = 0; i < I; i++) { + out[index + i] = static_cast(std::trunc(n[i] >> shift)); + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `sshr vd, vn, #imm`. + * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecSshrShift_imm( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + const T* n = sourceValues[0].getAsVector(); + uint64_t shift = metadata.operands[2].imm; + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I; i++) { + out[i] = static_cast(std::trunc(n[i] >> shift)); + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `addp rd, vn`. + * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). 
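+ * Illustrative example (assumed instantiation): `addp d0, v1.2d` would use + * T = uint64_t and I = 2, producing the scalar out = n[0] + n[1].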
+ * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecSumElems_2ops(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + T out = 0; + for (int i = 0; i < I; i++) { + out += n[i]; + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `xtn{2} vd, vn`. + * D represents the type of the dest. register (e.g. for vd.s, D = uint32_t). + * N represents the type of the source register (e.g. for vn.d, N = uint64_t). + * I represents the number of elements in the output vector to be + * updated (i.e. for vd.4s I = 4). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecXtn(srcValContainer& sourceValues, bool isXtn2) { + const D* d; + if (isXtn2) d = sourceValues[0].getAsVector(); + const N* n = sourceValues[isXtn2 ? 1 : 0].getAsVector(); + + D out[16 / sizeof(D)] = {0}; + int index = 0; + + for (int i = 0; i < I; i++) { + if (isXtn2 & (i < (I / 2))) { out[i] = d[i]; - } - out[metadata.operands[0].vector_index] = n; - return {out, 256}; - } - - /** Helper function for NEON instructions with the format ` vd, - * vn`. - * T represents the type of operands (e.g. for vn.2d, T = uint64_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecLogicOp_2vecs(std::vector& operands, - std::function func) { - const T* n = operands[0].getAsVector(); - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = func(n[i]); - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format ` vd, - * vn, vm`. - * T represents the type of operands (e.g. for vn.2d, T = uint64_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecLogicOp_3vecs(std::vector& operands, - std::function func) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = func(n[i], m[i]); - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `umaxp vd, vn, vm`. - * T represents the type of operands (e.g. for vn.2d, T = uint64_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecUMaxP(std::vector& operands) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - - T out[I]; - for (int i = 0; i < I; i++) { - out[i] = std::max(n[i], m[i]); - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `uminp vd, vn, vm`. - * T represents the type of operands (e.g. for vn.2d, T = uint64_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecUMinP(std::vector& operands) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - - T out[I]; - for (int i = 0; i < I; i++) { - out[i] = std::min(n[i], m[i]); - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `maxnmp rd, vn`. - * T represents the type of operands (e.g. for vn.2d, T = uint64_t). 
- * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecMaxnmp_2ops(std::vector& operands) { - const T* n = operands[0].getAsVector(); - bool isFP = std::is_floating_point::value; - - T out = n[0]; - for (int i = 1; i < I; i++) { - out = isFP ? std::fmax(n[i], out) : std::max(n[i], out); - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `sminv sd, vn`. - * T represents the type of operands (e.g. for vn.2d, T = uint64_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecMinv_2ops(std::vector& operands) { - const T* n = operands[0].getAsVector(); - bool isFP = std::is_floating_point::value; - - T out = n[0]; - for (int i = 1; i < I; i++) { - out = isFP ? std::fmin(n[i], out) : std::min(n[i], out); - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `movi vd, #imm`. - * T represents the type of operands (e.g. for vn.2d, T = uint64_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecMovi_imm( - const simeng::arch::aarch64::InstructionMetadata& metadata) { - bool isFP = std::is_floating_point::value; - const T imm = isFP ? metadata.operands[1].fp - : static_cast(metadata.operands[1].imm); - T out[16 / sizeof(T)] = {0}; - std::fill_n(std::begin(out), I, imm); - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `movi vd, #imm{, lsl - * #shift}`. - * T represents the type of operands (e.g. for vn.2d, T = uint64_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecMoviShift_imm( - const simeng::arch::aarch64::InstructionMetadata& metadata, bool negate) { - const T bits = shiftValue(static_cast(metadata.operands[1].imm), - metadata.operands[1].shift.type, - metadata.operands[1].shift.value); - T out[16 / sizeof(T)] = {0}; - std::fill_n(std::begin(out), I, negate ? ~bits : bits); - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `scvtf vd, - * vn`. - * D represents the destination vector register type (e.g. for vd.2d, D = - * double). - * N represents the source vector register type (e.g. for vn.2s N = int32_t). - * I represents the number of elements in the output array to be - * updated (e.g. for vd.8b I = 8). - * Returns correctly formated RegisterValue. */ - template - static RegisterValue vecScvtf_2vecs(std::vector& operands, - std::function func) { - const N* n = operands[0].getAsVector(); - D out[16 / sizeof(D)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = static_cast(n[i]); - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `shl vd, vn, #imm`. - * T represents the type of operands (e.g. for vn.2d, T = uint64_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. 
*/ - template - static RegisterValue vecShlShift_vecImm( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - const T* n = operands[0].getAsVector(); - int64_t shift = metadata.operands[2].imm; - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = static_cast(n[i] << shift); - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `shll{2} vd, vn, - * #imm`. - * D represents the destination register type (e.g. for vd.2d D = int64_t). - * N represents the source register type (e.g. for vd.4s D = int32_t). - * I represents the number of elements in the output array to be - * updated (e.g. for vd.8h the I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecShllShift_vecImm( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - bool isShll2) { - const N* n = operands[0].getAsVector(); - uint64_t shift = metadata.operands[2].imm; - D out[16 / sizeof(D)] = {0}; - int index = isShll2 ? I : 0; - for (int i = 0; i < I; i++) { - out[i] = n[index] << shift; + } else { + out[i] = static_cast(n[index]); index++; } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `sshr vd, vn, #imm`. - * T represents the type of operands (e.g. for vn.2d, T = uint64_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecSshrShift_imm( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - const T* n = operands[1].getAsVector(); - uint64_t shift = metadata.operands[2].imm; - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I; i++) { - out[i] = static_cast(std::trunc(n[i] >> shift)); - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `addp rd, vn`. - * T represents the type of operands (e.g. for vn.2d, T = uint64_t). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecSumElems_2ops(std::vector& operands) { - const T* n = operands[0].getAsVector(); - T out = 0; - for (int i = 0; i < I; i++) { - out += n[i]; - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `xtn{2} vd, vn`. - * D represents the type of the dest. register (e.g. for vd.s, D = uint32_t). - * N represents the type of the source register (e.g. for vn.d, N = uint64_t). - * I represents the number of elements in the output vector to be - * updated (i.e. for vd.4s I = 4). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecXtn(std::vector& operands, - bool isXtn2) { - const D* d; - if (isXtn2) d = operands[0].getAsVector(); - const N* n = operands[isXtn2 ? 1 : 0].getAsVector(); - - D out[16 / sizeof(D)] = {0}; - int index = 0; - - for (int i = 0; i < I; i++) { - if (isXtn2 & (i < (I / 2))) { - out[i] = d[i]; - } else { - out[i] = static_cast(n[index]); - index++; - } - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `tbl Vd.Ta, {Vn.16b, - * ... Vn+3.16b}, Vm.Ta`. - * I represents the number of elements in the output vector to be updated - * (i.e. for vd.8b I = 8, vd.16b I = 16). Only 8 or 16 is valid for TBL - * instructions. - * Returns correctly formatted RegisterValue. 
*/ - template - static RegisterValue vecTbl( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - // Vd and Vm are only valid in format 8b or 16b - assert(I == 8 || I == 16); - - // Vm contains the indices to fetch from table - const int8_t* Vm = - operands[metadata.operandCount - 2] - .getAsVector(); // final operand is vecMovi_imm - - // All operands except the first and last are the vector registers to - // construct the table from - const uint8_t n_table_regs = metadata.operandCount - 2; - - // Create table from vectors. All table operands must be of 16b format. - int tableSize = 16 * n_table_regs; - uint8_t table[tableSize]; - for (int i = 0; i < n_table_regs; i++) { - const int8_t* currentVector = operands[i].getAsVector(); - for (int j = 0; j < 16; j++) { - table[16 * i + j] = currentVector[j]; - } - } - - int8_t out[16 / sizeof(int8_t)] = {0}; - for (int i = 0; i < I; i++) { - unsigned int index = Vm[i]; - - // If an index is out of range for the table, the result for that lookup - // is 0 - if (index >= tableSize) { - out[i] = 0; - continue; - } - - out[i] = table[index]; - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `rev<16,32,64> Vd.T, - * Vn.T`. - * T represents the type of elements to be reversed (e.g. for Vn.d, T = - * uint64_t). - * V represents the variant: 16-bit, 32-bit, 64-bit. (e.g. for 64-bit each - * doubleword of the vector will be reversed). - * I represents the number of elements in the output array to be updated (e.g. - * for vd.8b I = 8). - * It is only valid for T to be a same or smaller width than V. - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue vecRev(std::vector& operands) { - const T* source = operands[0].getAsVector(); - int element_size = (sizeof(T) * 8); - int datasize = I * element_size; - int container_size = V; - int n_containers = datasize / container_size; - int elements_per_container = container_size / element_size; - - int element = 0; - int rev_element; - T out[16 / sizeof(T)] = {0}; - for (int c = 0; c < n_containers; c++) { - rev_element = element + elements_per_container - 1; - for (int e = 0; e < elements_per_container; e++) { - out[rev_element] = source[element]; - element++; - rev_element--; - } - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `trn1 Vd.T, Vn.T, - * Vm.T`. - * T represents the type of operands (e.g. for vn.d, T = uint64_t). - * I represents the number of operands (e.g. for vn.8b, I = 8). - * Returns formatted Register Value. */ - template - static RegisterValue vecTrn1(std::vector& operands) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I / 2; i++) { - out[2 * i] = n[2 * i]; - out[(2 * i) + 1] = m[2 * i]; - } - - return {out, 256}; } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `tbl Vd.Ta, {Vn.16b, + * ... Vn+3.16b}, Vm.Ta`. + * I represents the number of elements in the output vector to be updated + * (i.e. for vd.8b I = 8, vd.16b I = 16). Only 8 or 16 is valid for TBL + * instructions. + * Returns correctly formatted RegisterValue. 
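+ * Illustrative example (hypothetical values): with two table registers the + * table holds 32 bytes, so an index byte of 5 in Vm selects table[5], while + * any index of 32 or above yields 0 for that lane.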
*/ +template +RegisterValue vecTbl( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + // Vd and Vm are only valid in format 8b or 16b + assert(I == 8 || I == 16); + + // Vm contains the indices to fetch from table + const uint8_t* Vm = + sourceValues[metadata.operandCount - 2] + .getAsVector(); // final operand is vecMovi_imm + + // All sourceValues except the first and last are the vector registers to + // construct the table from + const uint8_t n_table_regs = metadata.operandCount - 2; + + // Create table from vectors. All table sourceValues must be of 16b format. + const uint16_t tableSize = 16 * n_table_regs; + std::vector table(tableSize, 0); + for (uint8_t i = 0; i < n_table_regs; i++) { + const uint8_t* currentVector = sourceValues[i].getAsVector(); + for (uint8_t j = 0; j < 16; j++) { + table[16 * i + j] = currentVector[j]; + } + } + + uint8_t out[16 / sizeof(uint8_t)] = {0}; + for (int i = 0; i < I; i++) { + uint8_t index = Vm[i]; + + // If an index is out of range for the table, the result for that lookup + // is 0 + if (index >= tableSize) { + out[i] = 0; + continue; + } + + out[i] = table[index]; + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `rev<16,32,64> Vd.T, + * Vn.T`. + * T represents the type of elements to be reversed (e.g. for Vn.d, T = + * uint64_t). + * V represents the variant: 16-bit, 32-bit, 64-bit. (e.g. for 64-bit each + * doubleword of the vector will be reversed). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). + * It is only valid for T to be a same or smaller width than V. + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecRev(srcValContainer& sourceValues) { + const T* source = sourceValues[0].getAsVector(); + int element_size = (sizeof(T) * 8); + int datasize = I * element_size; + int container_size = V; + int n_containers = datasize / container_size; + int elements_per_container = container_size / element_size; + + int element = 0; + int rev_element; + T out[16 / sizeof(T)] = {0}; + for (int c = 0; c < n_containers; c++) { + rev_element = element + elements_per_container - 1; + for (int e = 0; e < elements_per_container; e++) { + out[rev_element] = source[element]; + element++; + rev_element--; + } + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `trn1 Vd.T, Vn.T, + * Vm.T`. + * T represents the type of sourceValues (e.g. for vn.d, T = uint64_t). + * I represents the number of sourceValues (e.g. for vn.8b, I = 8). + * Returns formatted Register Value. */ +template +RegisterValue vecTrn1(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I / 2; i++) { + out[2 * i] = n[2 * i]; + out[(2 * i) + 1] = m[2 * i]; + } + + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `trn2 Vd.T, Vn.T, + * Vm.T`. + * T represents the type of sourceValues (e.g. for Vn.d, T = uint64_t). + * I represents the number of sourceValues (e.g. for Vn.8b, I = 8). + * Returns formatted Register Value. 
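+ * Illustrative example (assumed instantiation): for `trn2 v0.4s, v1.4s, + * v2.4s`, T = uint32_t and I = 4, giving out = {n[1], m[1], n[3], m[3]}.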
*/ +template +RegisterValue vecTrn2(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I / 2; i++) { + out[2 * i] = n[(2 * i) + 1]; + out[(2 * i) + 1] = m[(2 * i) + 1]; + } + + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `uzp<1,2> Vd.T, + * Vn.T, Vm.T`. + * T represents the type of sourceValues (e.g. for Vn.d, T = uint64_t). + * I represents the number of sourceValues (e.g. for Vn.8b, I = 8). + * Returns formatted Register Value. */ +template +RegisterValue vecUzp(srcValContainer& sourceValues, bool isUzp1) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + + T out[16 / sizeof(T)] = {0}; + for (int i = 0; i < I / 2; i++) { + int index = isUzp1 ? (2 * i) : (2 * i) + 1; + out[i] = n[index]; + out[(I / 2) + i] = m[index]; + } + + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `zip<1,2> vd.T, + * vn.T, vm.T`. + * T represents the type of sourceValues (e.g. for vn.d, T = uint64_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vn.8b, I = 8). + * Returns formatted Register Value. */ +template +RegisterValue vecZip(srcValContainer& sourceValues, bool isZip2) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + + T out[16 / sizeof(T)] = {0}; + int index = isZip2 ? (I / 2) : 0; + for (int i = 0; i < I / 2; i++) { + out[2 * i] = n[index]; + out[(2 * i) + 1] = m[index]; + index++; + } + + return {out, 256}; +} - /** Helper function for NEON instructions with the format `trn2 Vd.T, Vn.T, - * Vm.T`. - * T represents the type of operands (e.g. for Vn.d, T = uint64_t). - * I represents the number of operands (e.g. for Vn.8b, I = 8). - * Returns formatted Register Value. */ - template - static RegisterValue vecTrn2(std::vector& operands) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I / 2; i++) { - out[2 * i] = n[(2 * i) + 1]; - out[(2 * i) + 1] = m[(2 * i) + 1]; - } - - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `uzp<1,2> Vd.T, - * Vn.T, Vm.T`. - * T represents the type of operands (e.g. for Vn.d, T = uint64_t). - * I represents the number of operands (e.g. for Vn.8b, I = 8). - * Returns formatted Register Value. */ - template - static RegisterValue vecUzp(std::vector& operands, - bool isUzp1) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - - T out[16 / sizeof(T)] = {0}; - for (int i = 0; i < I / 2; i++) { - int index = isUzp1 ? 
(2 * i) : (2 * i) + 1; - out[i] = n[index]; - out[(I / 2) + i] = m[index]; - } - - return {out, 256}; - } -}; } // namespace aarch64 } // namespace arch } // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/aarch64/helpers/store.hh b/src/include/simeng/arch/aarch64/helpers/store.hh deleted file mode 100644 index 18d3d6f915..0000000000 --- a/src/include/simeng/arch/aarch64/helpers/store.hh +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include "auxiliaryFunctions.hh" - -namespace simeng { -namespace arch { -namespace aarch64 { -class storeHelp { - public: - static void tempFunc() { return; } -}; -} // namespace aarch64 -} // namespace arch -} // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index f53a466914..2c33ccfbe6 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -8,1671 +8,1763 @@ namespace simeng { namespace arch { namespace aarch64 { -class sveHelp { - public: - /** Helper function for SVE instructions with the format `add zd, zn, zm`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveAdd_3ops(std::vector& operands, - const uint16_t VL_bits) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - for (int i = 0; i < partition_num; i++) { - out[i] = n[i] + m[i]; - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `add zdn, pg/m, zdn, - * const`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveAddPredicated_const( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits) { - bool isFP = std::is_floating_point::value; - const uint64_t* p = operands[0].getAsVector(); - const T* d = operands[1].getAsVector(); - const auto con = isFP ? metadata.operands[3].fp : metadata.operands[3].imm; - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) - out[i] = d[i] + con; - else - out[i] = d[i]; - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `add zdn, pg/m, zdn, - * zm`. - * T represents the type of operands (e.g. for zdn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveAddPredicated_vecs( - std::vector& operands, const uint16_t VL_bits) { - const uint64_t* p = operands[0].getAsVector(); - const T* d = operands[1].getAsVector(); - const T* m = operands[2].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) - out[i] = d[i] + m[i]; - else - out[i] = d[i]; - } - return {out, 256}; - } - - /** Helper function for NEON instructions with the format `addv dd, pg, zn`. - * T represents the type of operands (e.g. for zn.s, T = uint32_t). 
- * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveAddvPredicated(std::vector& operands, - const uint16_t VL_bits) { - const uint64_t* p = operands[0].getAsVector(); - const T* n = operands[1].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - uint64_t out = 0; - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) - out += static_cast(n[i]); - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `adr zd, [zn, zm{, - * lsl #<1,2,3>}]`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveAdr_packedOffsets( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - const int mbytes = 1 << metadata.operands[2].shift.value; - for (int i = 0; i < partition_num; i++) { - out[i] = n[i] + (m[i] * mbytes); - } - return {out, 256}; - } - - /** Helper function for instructions with the format `cmp pd, pg/z, zn, `. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns tuple of type [pred result (array of 4 uint64_t), nzcv]. */ - template - static std::tuple, uint8_t> sveCmpPredicated_toPred( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits, bool cmpToImm, std::function func) { - const uint64_t* p = operands[0].getAsVector(); - const T* n = operands[1].getAsVector(); - const T* m; - T imm; - if (cmpToImm) - imm = static_cast(metadata.operands[3].imm); - else - m = operands[2].getAsVector(); - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - std::array out = {0, 0, 0, 0}; - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) { - if (cmpToImm) - out[i / (64 / sizeof(T))] |= (func(n[i], imm)) ? (shifted_active) : 0; - else - out[i / (64 / sizeof(T))] |= - (func(n[i], m[i])) ? (shifted_active) : 0; - } - } - // Byte count = sizeof(T) as destination predicate is predicate of T bytes. - return {out, AuxFunc::getNZCVfromPred(out, VL_bits, sizeof(T))}; - } - - /** Helper function for SVE instructions with the format `cnt rd{, - * pattern{, #imm}}`. - * T represents the type of operation (e.g. for CNTD, T = uint64_t). - * Returns single value of type uint64_t. */ - template - static uint64_t sveCnt_gpr( - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits) { - const uint8_t imm = static_cast(metadata.operands[1].imm); - - const uint16_t elems = - AuxFunc::sveGetPattern(metadata.operandStr, (sizeof(T) * 8), VL_bits); - return (uint64_t)(elems * imm); - } - - /** Helper function for SVE instructions with the format `cntp xd, pg, pn`. - * T represents the type of operands (e.g. for pn.d, T = uint64_t). - * Returns single value of type uint64_t. */ - template - static uint64_t sveCntp(std::vector& operands, +/** Helper function for SVE instructions with the format `add zd, zn, zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. 
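+ * Illustrative example (assumed instantiation; the template parameter is + * elided in this hunk): for `add z0.d, z1.d, z2.d` with VL_bits = 512, + * T = uint64_t and partition_num = 8, so out[i] = n[i] + m[i] for lanes 0..7.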
*/ +template +RegisterValue sveAdd_3ops(srcValContainer& sourceValues, const uint16_t VL_bits) { - const uint64_t* pg = operands[0].getAsVector(); - const uint64_t* pn = operands[1].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - uint64_t count = 0; - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (pg[i / (64 / sizeof(T))] & shifted_active) { - count += (pn[i / (64 / sizeof(T))] & shifted_active) ? 1 : 0; - } - } - return count; - } - - /** Helper function for SVE instructions with the format `fcm pd, - * pg/z, zn, zm`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns an array of 4 uint64_t elements. */ - template - static std::array sveComparePredicated_vecsToPred( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits, bool cmpToZero, std::function func) { - const uint64_t* p = operands[0].getAsVector(); - const T* n = operands[1].getAsVector(); - const T* m; - if (!cmpToZero) m = operands[2].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - std::array out = {0}; - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) { - out[i / (64 / sizeof(T))] |= - (func(n[i], cmpToZero ? 0.0 : m[i])) ? shifted_active : 0; - } - } - return out; + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + for (int i = 0; i < partition_num; i++) { + out[i] = n[i] + m[i]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `add zd, zn, #imm`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveAdd_imm( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); + const T imm = static_cast(metadata.operands[2].imm); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + for (int i = 0; i < partition_num; i++) { + out[i] = n[i] + imm; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `add zdn, pg/m, zdn, + * const`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveAddPredicated_const( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + bool isFP = std::is_floating_point::value; + const uint64_t* p = sourceValues[0].getAsVector(); + const T* d = sourceValues[1].getAsVector(); + const auto con = isFP ? metadata.operands[3].fp : metadata.operands[3].imm; + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) + out[i] = d[i] + con; + else + out[i] = d[i]; } + return {out, 256}; +} - /** Helper function for SVE instructions with the format `cpy zd, pg/z, #imm{, - * shift}`. - * T represents the type of operands (e.g. for zd.d, T = int64_t). 
- * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveCpy_imm( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits) { - const uint64_t* p = operands[0].getAsVector(); - const int16_t imm = metadata.operands[2].imm; - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) { - out[i] = imm; - } else { - out[i] = 0; - } - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `dec xdn{, - * pattern{, MUL #imm}}`. - * T represents the type of operation (e.g. for DECD, T = uint64_t). - * Returns single value of type uint64_t. */ - template - static int64_t sveDec_scalar( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits) { - const int64_t n = operands[0].get(); - const uint8_t imm = static_cast(metadata.operands[1].imm); - const uint16_t elems = - AuxFunc::sveGetPattern(metadata.operandStr, sizeof(T) * 8, VL_bits); - return (n - static_cast(elems * imm)); - } - - /** Helper function for SVE instructions with the format `dup zd, <#imm{, - * shift}, n>`. - * T represents the type of operands (e.g. for zd.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveDup_immOrScalar( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits, bool useImm) { - bool isFP = std::is_floating_point::value; - T imm; - if (useImm) - imm = isFP ? metadata.operands[1].fp - : static_cast(metadata.operands[1].imm); +/** Helper function for SVE instructions with the format `add zdn, pg/m, zdn, + * zm`. + * T represents the type of sourceValues (e.g. for zdn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveAddPredicated_vecs(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T* d = sourceValues[1].getAsVector(); + const T* m = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) + out[i] = d[i] + m[i]; else - imm = operands[0].get(); - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - for (int i = 0; i < partition_num; i++) { - out[i] = imm; - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `dup zd, zn[#imm]`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. 
*/ - template - static RegisterValue sveDup_vecIndexed( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits) { - const uint16_t index = - static_cast(metadata.operands[1].vector_index); - const T* n = operands[0].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - if (index < (VL_bits / (sizeof(T) * 8))) { - const T element = n[index]; - for (int i = 0; i < partition_num; i++) { - out[i] = element; - } - } - return {out, 256}; + out[i] = d[i]; } + return {out, 256}; +} - /** Helper function for SVE instructions with the format `fabs zd, - * pg/z, zn`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveFabsPredicated(std::vector& operands, - const uint16_t VL_bits) { - const T* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); - const T* n = operands[2].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - +/** Helper function for NEON instructions with the format `addv dd, pg, zn`. + * T represents the type of sourceValues (e.g. for zn.s, T = uint32_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveAddvPredicated(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + uint64_t out = 0; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) + out += static_cast(n[i]); + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `adr zd, [zn, zm{, + * lsl #<1,2,3>}]`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveAdr_packedOffsets( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + const int mbytes = 1 << metadata.operands[2].shift.value; + for (int i = 0; i < partition_num; i++) { + out[i] = n[i] + (m[i] * mbytes); + } + return {out, 256}; +} + +/** Helper function for instructions with the format `cmp pd, pg/z, zn, `. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns tuple of type [pred result (array of 4 uint64_t), nzcv]. 
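+ * Note on the predicate layout used below: element i of type T is governed + * by bit ((i % (64 / sizeof(T))) * sizeof(T)) of p[i / (64 / sizeof(T))], so + * for T = uint32_t lane 3 maps to bit 12 of p[0] (illustrative example).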
*/ +template +std::tuple, uint8_t> sveCmpPredicated_toPred( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits, bool cmpToImm, std::function func) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + const T* m; + T imm; + if (cmpToImm) + imm = static_cast(metadata.operands[3].imm); + else + m = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + std::array out = {0, 0, 0, 0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + if (cmpToImm) + out[i / (64 / sizeof(T))] |= (func(n[i], imm)) ? (shifted_active) : 0; + else + out[i / (64 / sizeof(T))] |= (func(n[i], m[i])) ? (shifted_active) : 0; + } + } + // Byte count = sizeof(T) as destination predicate is predicate of T bytes. + return {out, getNZCVfromPred(out, VL_bits, sizeof(T))}; +} + +/** Helper function for SVE instructions with the format `cnt rd{, + * pattern{, #imm}}`. + * T represents the type of operation (e.g. for CNTD, T = uint64_t). + * Returns single value of type uint64_t. */ +template +uint64_t sveCnt_gpr(const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const uint8_t imm = static_cast(metadata.operands[2].imm); + + const uint16_t elems = getElemsFromPattern( + metadata.operands[1].sysop.alias.svepredpat, (sizeof(T) * 8), VL_bits); + return (uint64_t)(elems * imm); +} + +/** Helper function for SVE instructions with the format `cntp xd, pg, pn`. + * T represents the type of sourceValues (e.g. for pn.d, T = uint64_t). + * Returns single value of type uint64_t. */ +template +uint64_t sveCntp(srcValContainer& sourceValues, const uint16_t VL_bits) { + const uint64_t* pg = sourceValues[0].getAsVector(); + const uint64_t* pn = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + uint64_t count = 0; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (pg[i / (64 / sizeof(T))] & shifted_active) { + count += (pn[i / (64 / sizeof(T))] & shifted_active) ? 1 : 0; + } + } + return count; +} + +/** Helper function for SVE instructions with the format `fcm pd, + * pg/z, zn, zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns an array of 4 uint64_t elements. */ +template +std::array sveComparePredicated_vecsToPred( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits, bool cmpToZero, std::function func) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + const T* m; + if (!cmpToZero) m = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + std::array out = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + out[i / (64 / sizeof(T))] |= + (func(n[i], cmpToZero ? 0.0 : m[i])) ? shifted_active : 0; + } + } + return out; +} + +/** Helper function for SVE instructions with the format `cpy zd, pg/z, #imm{, + * shift}`. + * T represents the type of sourceValues (e.g. for zd.d, T = int64_t). + * Returns correctly formatted RegisterValue. 
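+ * Illustrative example (assumed instantiation): for `cpy z0.s, p0/z, #7`, + * T = int32_t, active lanes receive 7 and inactive lanes are zeroed.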
*/ +template +RegisterValue sveCpy_imm( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const int16_t imm = metadata.operands[2].imm; + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + out[i] = imm; + } else { + out[i] = 0; + } + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `dec xdn{, + * pattern{, MUL #imm}}`. + * T represents the type of operation (e.g. for DECD, T = uint64_t). + * Returns single value of type uint64_t. */ +template +int64_t sveDec_scalar( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const int64_t n = sourceValues[0].get(); + const uint8_t imm = static_cast(metadata.operands[2].imm); + const uint16_t elems = getElemsFromPattern( + metadata.operands[1].sysop.alias.svepredpat, sizeof(T) * 8, VL_bits); + return (n - static_cast(elems * imm)); +} + +/** Helper function for SVE instructions with the format `dup zd, <#imm{, + * shift}, n>`. + * T represents the type of sourceValues (e.g. for zd.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveDup_immOrScalar( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits, bool useImm) { + bool isFP = std::is_floating_point::value; + T imm; + if (useImm) + imm = isFP ? metadata.operands[1].fp + : static_cast(metadata.operands[1].imm); + else + imm = sourceValues[0].get(); + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + out[i] = imm; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `dup zd, zn[#imm]`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveDup_vecIndexed( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const uint16_t index = + static_cast(metadata.operands[1].vector_index); + const T* n = sourceValues[0].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + if (index < (VL_bits / (sizeof(T) * 8))) { + const T element = n[index]; for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) { - out[i] = ::fabs(n[i]); - } else { - out[i] = d[i]; - } + out[i] = element; } - return {out, 256}; } + return {out, 256}; +} - /** Helper function for SVE instructions with the format `fadda rd, - * pg, rn, zm`. - * T represents the type of operands (e.g. for zm.d, T = uint64_t). - * Returns correctly formatted RegisterValue. 
*/ - template - static RegisterValue sveFaddaPredicated(std::vector& operands, - const uint16_t VL_bits) { - const uint64_t* p = operands[0].getAsVector(); - const T n = operands[1].get(); - const T* m = operands[2].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - out[0] = n; - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) { - out[0] += m[i]; +/** Helper function for SVE instructions with the format `fabs zd, + * pg/z, zn`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveFabsPredicated(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* d = sourceValues[0].getAsVector(); + const uint64_t* p = sourceValues[1].getAsVector(); + const T* n = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + out[i] = ::fabs(n[i]); + } else { + out[i] = d[i]; + } + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `fadda rd, + * pg, rn, zm`. + * T represents the type of sourceValues (e.g. for zm.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveFaddaPredicated(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T n = sourceValues[1].get(); + const T* m = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + out[0] = n; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + out[0] += m[i]; + } + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `fcadd zdn, pg/m, + * zdn, zm, #imm`. + * T represents the type of sourceValues (e.g. for zm.d, T = double). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveFcaddPredicated( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T* dn = sourceValues[1].getAsVector(); + const T* m = sourceValues[2].getAsVector(); + const uint32_t imm = metadata.operands[4].imm; + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < (partition_num / 2); i++) { + T acc_r = dn[2 * i]; + T acc_i = dn[2 * i + 1]; + T elt2_r = m[2 * i]; + T elt2_i = m[2 * i + 1]; + + uint64_t shifted_active1 = 1ull + << (((2 * i) % (64 / sizeof(T))) * sizeof(T)); + uint64_t shifted_active2 = + 1ull << (((2 * i + 1) % (64 / sizeof(T))) * sizeof(T)); + if (p[(2 * i) / (64 / sizeof(T))] & shifted_active1) { + if (imm == 90) { + elt2_i = 0.0 - elt2_i; } + acc_r = acc_r + elt2_i; } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `fcadd zdn, pg/m, - * zdn, zm, #imm`. - * T represents the type of operands (e.g. for zm.d, T = double). - * Returns correctly formatted RegisterValue. 
*/ - template - static RegisterValue sveFcaddPredicated( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits) { - const uint64_t* p = operands[0].getAsVector(); - const T* dn = operands[1].getAsVector(); - const T* m = operands[2].getAsVector(); - const uint32_t imm = metadata.operands[4].imm; - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - for (int i = 0; i < (partition_num / 2); i++) { - T acc_r = dn[2 * i]; - T acc_i = dn[2 * i + 1]; - T elt2_r = m[2 * i]; - T elt2_i = m[2 * i + 1]; - - uint64_t shifted_active1 = 1ull - << (((2 * i) % (64 / sizeof(T))) * sizeof(T)); - uint64_t shifted_active2 = - 1ull << (((2 * i + 1) % (64 / sizeof(T))) * sizeof(T)); - if (p[(2 * i) / (64 / sizeof(T))] & shifted_active1) { - if (imm == 90) { - elt2_i = 0.0 - elt2_i; - } - acc_r = acc_r + elt2_i; + if (p[(2 * i + 1) / (64 / sizeof(T))] & shifted_active2) { + if (imm == 270) { + elt2_r = 0.0 - elt2_r; } - if (p[(2 * i + 1) / (64 / sizeof(T))] & shifted_active2) { - if (imm == 270) { - elt2_r = 0.0 - elt2_r; - } - acc_i = acc_i + elt2_r; + acc_i = acc_i + elt2_r; + } + out[2 * i] = acc_r; + out[2 * i + 1] = acc_i; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `fcmla zda, pg/m, + * zn, zm, #imm`. + * T represents the type of sourceValues (e.g. for zm.d, T = double). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveFcmlaPredicated( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const T* da = sourceValues[0].getAsVector(); + const uint64_t* p = sourceValues[1].getAsVector(); + const T* n = sourceValues[2].getAsVector(); + const T* m = sourceValues[3].getAsVector(); + const uint32_t imm = metadata.operands[4].imm; + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + int sel_a = (imm == 0 || imm == 180) ? 0 : 1; + int sel_b = (imm == 0 || imm == 180) ? 1 : 0; + bool neg_i = (imm == 180 || imm == 270) ? true : false; + bool neg_r = (imm == 90 || imm == 180) ? true : false; + for (int i = 0; i < (partition_num / 2); i++) { + T addend_r = da[2 * i]; + T addend_i = da[2 * i + 1]; + T elt1_a = n[2 * i + sel_a]; + T elt2_a = m[2 * i + sel_a]; + T elt2_b = m[2 * i + sel_b]; + uint64_t shifted_active1 = 1ull + << (((2 * i) % (64 / sizeof(T))) * sizeof(T)); + uint64_t shifted_active2 = + 1ull << (((2 * i + 1) % (64 / sizeof(T))) * sizeof(T)); + if (p[(2 * i) / (64 / sizeof(T))] & shifted_active1) { + if (neg_r) { + elt2_a = 0.0 - elt2_a; } - out[2 * i] = acc_r; - out[2 * i + 1] = acc_i; + addend_r = addend_r + (elt1_a * elt2_a); } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `fcmla zda, pg/m, - * zn, zm, #imm`. - * T represents the type of operands (e.g. for zm.d, T = double). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveFcmlaPredicated( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits) { - const T* da = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); - const T* n = operands[2].getAsVector(); - const T* m = operands[3].getAsVector(); - const uint32_t imm = metadata.operands[4].imm; - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - int sel_a = (imm == 0 || imm == 180) ? 
0 : 1; - int sel_b = (imm == 0 || imm == 180) ? 1 : 0; - bool neg_i = (imm == 180 || imm == 270) ? true : false; - bool neg_r = (imm == 90 || imm == 180) ? true : false; - for (int i = 0; i < (partition_num / 2); i++) { - T addend_r = da[2 * i]; - T addend_i = da[2 * i + 1]; - T elt1_a = n[2 * i + sel_a]; - T elt2_a = m[2 * i + sel_a]; - T elt2_b = m[2 * i + sel_b]; - uint64_t shifted_active1 = 1ull - << (((2 * i) % (64 / sizeof(T))) * sizeof(T)); - uint64_t shifted_active2 = - 1ull << (((2 * i + 1) % (64 / sizeof(T))) * sizeof(T)); - if (p[(2 * i) / (64 / sizeof(T))] & shifted_active1) { - if (neg_r) { - elt2_a = 0.0 - elt2_a; - } - addend_r = addend_r + (elt1_a * elt2_a); + if (p[(2 * i + 1) / (64 / sizeof(T))] & shifted_active2) { + if (neg_i) { + elt2_b = 0.0 - elt2_b; } - if (p[(2 * i + 1) / (64 / sizeof(T))] & shifted_active2) { - if (neg_i) { - elt2_b = 0.0 - elt2_b; - } - addend_i = addend_i + (elt1_a * elt2_b); - } - out[2 * i] = addend_r; - out[2 * i + 1] = addend_i; + addend_i = addend_i + (elt1_a * elt2_b); + } + out[2 * i] = addend_r; + out[2 * i + 1] = addend_i; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `fcpy zd, pg/m, + * #const`. + * T represents the type of sourceValues (e.g. for zd.d, T = double). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveFcpy_imm( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const T* dn = sourceValues[0].getAsVector(); + const uint64_t* p = sourceValues[1].getAsVector(); + const T imm = metadata.operands[2].fp; + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + out[i] = imm; + } else { + out[i] = dn[i]; } - return {out, 256}; } + return {out, 256}; +} - /** Helper function for SVE instructions with the format `fcpy zd, pg/m, - * #const`. - * T represents the type of operands (e.g. for zd.d, T = double). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveFcpy_imm( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits) { - const T* dn = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); - const T imm = metadata.operands[2].fp; - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) { - out[i] = imm; - } else { - out[i] = dn[i]; - } - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `fcvt zd, - * pg/m, zn`. - * D represents the destination vector register type (e.g. zd.s would be - * int32_t). - * N represents the source vector register type (e.g. zn.d would be double). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveFcvtPredicated(std::vector& operands, - const uint16_t VL_bits) { - const D* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); - const N* n = operands[2].getAsVector(); - - // Stores size of largest type out of D and N - int lts = std::max(sizeof(D), sizeof(N)); - bool sourceLarger = (sizeof(D) < sizeof(N)) ? 
true : false; - bool sameDandN = (sizeof(D) == sizeof(N)) ? true : false; - - const uint16_t partition_num = VL_bits / (lts * 8); - D out[256 / sizeof(D)] = {0}; - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / lts)) * lts); - int indexOut = (sourceLarger) ? (2 * i) : i; - int indexN = (!sameDandN) && (!sourceLarger) ? (2 * i) : i; - - if (p[i / (64 / lts)] & shifted_active) { - if (n[indexN] > std::numeric_limits::max()) - out[indexOut] = std::numeric_limits::max(); - else if (n[indexN] < std::numeric_limits::lowest()) - out[indexOut] = std::numeric_limits::lowest(); - else - out[indexOut] = static_cast(n[indexN]); - } else { - out[indexOut] = d[indexOut]; - } - if (sourceLarger) out[indexOut + 1] = d[indexOut + 1]; - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `fcvtzs zd, - * pg/m, zn`. - * D represents the destination vector register type (e.g. zd.s would be - * int32_t). - * N represents the source vector register type (e.g. zn.d would be double). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveFcvtzsPredicated(std::vector& operands, - const uint16_t VL_bits) { - const D* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); - const N* n = operands[2].getAsVector(); - - // Stores size of largest type out of D and N - int lts = std::max(sizeof(D), sizeof(N)); - bool sameType = (sizeof(D) == sizeof(N)) ? true : false; - bool sourceLarger = (sizeof(D) < sizeof(N)) ? true : false; - - const uint16_t partition_num = VL_bits / (lts * 8); - D out[256 / sizeof(D)] = {0}; - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / lts)) * lts); - int indexOut = (sourceLarger) ? (2 * i) : i; - int indexN = ((!sourceLarger) & (!sameType)) ? (2 * i) : i; - - if (p[i / (64 / lts)] & shifted_active) { - if (n[indexN] > std::numeric_limits::max()) - out[indexOut] = std::numeric_limits::max(); - else if (n[indexN] < std::numeric_limits::lowest()) - out[indexOut] = std::numeric_limits::lowest(); - else - out[indexOut] = static_cast(std::trunc(n[indexN])); - // Can be set to 0xFFFFFFFF as will only occur when D=int32_t. - if (sourceLarger) out[indexOut + 1] = (n[indexN] < 0) ? 0xFFFFFFFFu : 0; - } else { - out[indexOut] = d[indexOut]; - if (sourceLarger) out[indexOut + 1] = d[indexOut + 1]; - } - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `fmad zd, pg/m, zn, - * zm`. - * T represents the type of operands (e.g. for zn.d, T = double). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveFmadPredicated_vecs( - std::vector& operands, const uint16_t VL_bits) { - const T* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); - const T* n = operands[2].getAsVector(); - const T* m = operands[3].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) - out[i] = m[i] + (d[i] * n[i]); - else - out[i] = d[i]; - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `fmls zd, pg/m, zn, - * zm`. - * T represents the type of operands (e.g. for zn.d, T = double). - * Returns correctly formatted RegisterValue. 
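The FCADD and FCMLA helpers earlier in this hunk encode the Arm rotation immediates as sign flips and element selectors, which is easiest to check against scalar complex arithmetic. A small standalone sketch of the intended semantics (using std::complex purely as a reference; illustrative, not part of this change):

```cpp
#include <complex>
#include <cstdio>

int main() {
  std::complex<double> acc(1.0, 2.0), n(3.0, 4.0), m(5.0, 6.0);

  // FCADD #90 adds i*m to the accumulator: real -= m.imag, imag += m.real.
  std::complex<double> fcadd90(acc.real() - m.imag(), acc.imag() + m.real());
  // FCADD #270 adds -i*m: real += m.imag, imag -= m.real.
  std::complex<double> fcadd270(acc.real() + m.imag(), acc.imag() - m.real());

  // FCMLA #0 accumulates n.real * m; FCMLA #90 then accumulates i * n.imag * m.
  // Issued as a pair they form a full complex multiply-accumulate: acc + n*m.
  std::complex<double> step0(acc.real() + n.real() * m.real(),
                             acc.imag() + n.real() * m.imag());
  std::complex<double> step90(step0.real() - n.imag() * m.imag(),
                              step0.imag() + n.imag() * m.real());

  std::printf("fcadd #90  -> (%g, %g)\n", fcadd90.real(), fcadd90.imag());
  std::printf("fcadd #270 -> (%g, %g)\n", fcadd270.real(), fcadd270.imag());
  std::printf("fcmla 0+90 -> (%g, %g), acc + n*m = (%g, %g)\n", step90.real(),
              step90.imag(), (acc + n * m).real(), (acc + n * m).imag());
  return 0;
}
```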
*/ - template - static RegisterValue sveFmlsPredicated_vecs( - std::vector& operands, const uint16_t VL_bits) { - const T* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); - const T* n = operands[2].getAsVector(); - const T* m = operands[3].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) - out[i] = d[i] + (-n[i] * m[i]); +/** Helper function for SVE instructions with the format `fcvt zd, + * pg/m, zn`. + * D represents the destination vector register type (e.g. zd.s would be + * int32_t). + * N represents the source vector register type (e.g. zn.d would be double). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveFcvtPredicated(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const D* d = sourceValues[0].getAsVector(); + const uint64_t* p = sourceValues[1].getAsVector(); + const N* n = sourceValues[2].getAsVector(); + + // Stores size of largest type out of D and N + int lts = std::max(sizeof(D), sizeof(N)); + bool sourceLarger = (sizeof(D) < sizeof(N)); + bool sameDandN = (sizeof(D) == sizeof(N)); + + const uint16_t partition_num = VL_bits / (lts * 8); + D out[256 / sizeof(D)] = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / lts)) * lts); + int indexOut = (sourceLarger) ? (2 * i) : i; + int indexN = (!sameDandN) && (!sourceLarger) ? (2 * i) : i; + + if (p[i / (64 / lts)] & shifted_active) { + if (n[indexN] > std::numeric_limits::max()) + out[indexOut] = std::numeric_limits::max(); + else if (n[indexN] < std::numeric_limits::lowest()) + out[indexOut] = std::numeric_limits::lowest(); else - out[i] = d[i]; - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `fmsb zd, pg/m, zn, - * zm`. - * T represents the type of operands (e.g. for zn.d, T = double). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveFmsbPredicated_vecs( - std::vector& operands, const uint16_t VL_bits) { - const T* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); - const T* n = operands[2].getAsVector(); - const T* m = operands[3].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) - out[i] = m[i] + (-d[i] * n[i]); + out[indexOut] = static_cast(n[indexN]); + } else { + out[indexOut] = d[indexOut]; + } + if (sourceLarger) out[indexOut + 1] = d[indexOut + 1]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `fcvtzs zd, + * pg/m, zn`. + * D represents the destination vector register type (e.g. zd.s would be + * int32_t). + * N represents the source vector register type (e.g. zn.d would be double). + * Returns correctly formatted RegisterValue. 
*/ +template +RegisterValue sveFcvtzsPredicated(srcValContainer& sourceValues, + const uint16_t VL_bits) { + static_assert((std::is_same() || std::is_same()) && + "N is not a valid type which should be float or double"); + static_assert((std::is_same() || std::is_same()) && + "D is not a valid type which should be int32_t or int64_t"); + + const D* d = sourceValues[0].getAsVector(); + const uint64_t* p = sourceValues[1].getAsVector(); + const N* n = sourceValues[2].getAsVector(); + + // Stores size of largest type out of D and N + int lts = std::max(sizeof(D), sizeof(N)); + bool sameType = (sizeof(D) == sizeof(N)); + bool sourceLarger = (sizeof(D) < sizeof(N)); + + const uint16_t partition_num = VL_bits / (lts * 8); + D out[256 / sizeof(D)] = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / lts)) * lts); + int indexOut = (sourceLarger) ? (2 * i) : i; + int indexN = ((!sourceLarger) & (!sameType)) ? (2 * i) : i; + + if (p[i / (64 / lts)] & shifted_active) { + if (static_cast(n[indexN]) >= + static_cast(std::numeric_limits::max())) + // Cast to double to reduce precision errors. Float can't store int32 + // or int64 max values accurately as not enough bits available. This + // causes unwanted comparison behaviour. Double also can't accurately + // represent int64.MaxValue. Non-strict comparison used to capture this + // case + // + // max() will be either 2147483647 or 9223372036854775807 + // Casting to float results in the following (incorrect) values + // 2147483648 (+1) or 9223372036854775808 (+1) + // + // Casting to double results in 2147483647 (+0) or incorrect + // 9223372036854775808(+1) + + out[indexOut] = std::numeric_limits::max(); + else if (n[indexN] < std::numeric_limits::lowest()) + out[indexOut] = std::numeric_limits::lowest(); else - out[i] = d[i]; - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `fmul zd, zn, zm`. - * T represents the type of operands (e.g. for zn.d, T = double). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveFmul_3ops(std::vector& operands, - const uint16_t VL_bits) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - for (int i = 0; i < partition_num; i++) { - out[i] = n[i] * m[i]; + out[indexOut] = static_cast(std::trunc(n[indexN])); + // Can be set to 0xFFFFFFFF as will only occur when D=int32_t. + if (sourceLarger) out[indexOut + 1] = (n[indexN] < 0) ? 0xFFFFFFFFu : 0; + } else { + out[indexOut] = d[indexOut]; + if (sourceLarger) out[indexOut + 1] = d[indexOut + 1]; } - return {out, 256}; } - - /** Helper function for SVE instructions with the format `fneg zd, pg/m, zn`. - * T represents the type of operands (e.g. for zn.d, T = double). - * Returns correctly formatted RegisterValue. 
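The long comment inside sveFcvtzsPredicated about casting to double is easier to follow with concrete numbers: the integer maxima are not exactly representable in the floating-point source types, so the nearest representable value rounds up by one, which is why the bound is taken through a double cast and compared non-strictly. A small check (illustrative, not part of this change):

```cpp
#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
  // INT32_MAX is not representable as float, and INT64_MAX is not
  // representable as double; both round up to the next power of two.
  std::printf("INT32_MAX         = %d\n", std::numeric_limits<int32_t>::max());
  std::printf("(float)INT32_MAX  = %.1f\n",
              static_cast<float>(std::numeric_limits<int32_t>::max()));
  std::printf("(double)INT32_MAX = %.1f\n",
              static_cast<double>(std::numeric_limits<int32_t>::max()));
  std::printf("(double)INT64_MAX = %.1f\n",
              static_cast<double>(std::numeric_limits<int64_t>::max()));
  // Because the cast rounds up, a strict '>' test would miss inputs equal to
  // the rounded bound; hence the non-strict '>=' comparison in the helper.
  return 0;
}
```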
*/ - template - static RegisterValue sveFnegPredicated(std::vector& operands, - const uint16_t VL_bits) { - const T* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); - const T* n = operands[2].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) - out[i] = -n[i]; + return {out, 256}; +} + +/** Helper function for SVE instructions with the format ` + * zd, pg/m, zn, zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Reversed represents whether the opcode is fdivr and thus the input + * sourceValues should be reversed. Returns correctly formatted RegisterValue. + */ +template +std::enable_if_t, RegisterValue> sveFDivPredicated( + srcValContainer& sourceValues, const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T* dn = sourceValues[1].getAsVector(); + const T* m = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + const T op1 = Reversed ? m[i] : dn[i]; + const T op2 = Reversed ? dn[i] : m[i]; + if (op2 == 0) + out[i] = sizeof(T) == 8 ? std::nan("") : std::nanf(""); else - out[i] = d[i]; - } - return {out, 256}; + out[i] = op1 / op2; + } else + out[i] = dn[i]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `fmad zd, pg/m, zn, + * zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = double). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveFmadPredicated_vecs(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* d = sourceValues[0].getAsVector(); + const uint64_t* p = sourceValues[1].getAsVector(); + const T* n = sourceValues[2].getAsVector(); + const T* m = sourceValues[3].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) + out[i] = m[i] + (d[i] * n[i]); + else + out[i] = d[i]; } + return {out, 256}; +} - /** Helper function for SVE instructions with the format `fnmls zd, pg/m, zn, - * zm`. - * T represents the type of operands (e.g. for zn.d, T = double). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveFnmlsPredicated(std::vector& operands, - const uint16_t VL_bits) { - const T* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); - const T* n = operands[2].getAsVector(); - const T* m = operands[3].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) - out[i] = -d[i] + (n[i] * m[i]); - else - out[i] = d[i]; - } - return {out, 256}; +/** Helper function for SVE instructions with the format `fmls zd, pg/m, zn, + * zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = double). + * Returns correctly formatted RegisterValue. 
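The new sveFDivPredicated helper folds FDIV and FDIVR into one loop via the Reversed template parameter, which only swaps which operand is the dividend. A scalar sketch of that selection (illustrative, not part of this change; the NaN-on-zero-divisor behaviour mirrors the helper above rather than a general IEEE rule):

```cpp
#include <cmath>
#include <cstdio>

// Reversed = false models FDIV (dest / operand); true models FDIVR
// (operand / dest).
template <bool Reversed>
double predicatedDiv(double dn, double m) {
  const double op1 = Reversed ? m : dn;
  const double op2 = Reversed ? dn : m;
  return op2 == 0.0 ? std::nan("") : op1 / op2;
}

int main() {
  std::printf("fdiv  8,2 -> %g\n", predicatedDiv<false>(8.0, 2.0));  // 4
  std::printf("fdivr 8,2 -> %g\n", predicatedDiv<true>(8.0, 2.0));   // 0.25
  return 0;
}
```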
*/ +template +RegisterValue sveFmlsPredicated_vecs(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* d = sourceValues[0].getAsVector(); + const uint64_t* p = sourceValues[1].getAsVector(); + const T* n = sourceValues[2].getAsVector(); + const T* m = sourceValues[3].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) + out[i] = d[i] + (-n[i] * m[i]); + else + out[i] = d[i]; } + return {out, 256}; +} - /** Helper function for SVE instructions with the format `fnmsb zdn, pg/m, zm, - * za`. - * T represents the type of operands (e.g. for zdn.d, T = double). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveFnmsbPredicated(std::vector& operands, - const uint16_t VL_bits) { - const T* n = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); - const T* m = operands[2].getAsVector(); - const T* a = operands[3].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; +/** Helper function for SVE instructions with the format `fmsb zd, pg/m, zn, + * zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = double). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveFmsbPredicated_vecs(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* d = sourceValues[0].getAsVector(); + const uint64_t* p = sourceValues[1].getAsVector(); + const T* n = sourceValues[2].getAsVector(); + const T* m = sourceValues[3].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) + out[i] = m[i] + (-d[i] * n[i]); + else + out[i] = d[i]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `fmul zd, zn, zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = double). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveFmul_3ops(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + for (int i = 0; i < partition_num; i++) { + out[i] = n[i] * m[i]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `fneg zd, pg/m, zn`. + * T represents the type of sourceValues (e.g. for zn.d, T = double). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveFnegPredicated(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* d = sourceValues[0].getAsVector(); + const uint64_t* p = sourceValues[1].getAsVector(); + const T* n = sourceValues[2].getAsVector(); - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) - out[i] = -a[i] + n[i] * m[i]; - else - out[i] = n[i]; - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `frintn zd, pg/m, - * zn`. - * D represents the destination vector register type (e.g. zd.s would be - * int32_t). 
- * N represents the source vector register type (e.g. zn.d would be - * double). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveFrintnPredicated(std::vector& operands, - const uint16_t VL_bits) { - const D* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); - const N* n = operands[2].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(N) * 8); - D out[256 / sizeof(D)] = {0}; + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(N))) * sizeof(N)); - if (p[i / (64 / sizeof(N))] & shifted_active) { - out[i] = AuxFunc::roundToNearestTiesToEven(n[i]); + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) + out[i] = -n[i]; + else + out[i] = d[i]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `fnmls zd, pg/m, zn, + * zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = double). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveFnmlsPredicated(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* d = sourceValues[0].getAsVector(); + const uint64_t* p = sourceValues[1].getAsVector(); + const T* n = sourceValues[2].getAsVector(); + const T* m = sourceValues[3].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) + out[i] = -d[i] + (n[i] * m[i]); + else + out[i] = d[i]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `fnmsb zdn, pg/m, zm, + * za`. + * T represents the type of sourceValues (e.g. for zdn.d, T = double). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveFnmsbPredicated(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); + const uint64_t* p = sourceValues[1].getAsVector(); + const T* m = sourceValues[2].getAsVector(); + const T* a = sourceValues[3].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) + out[i] = -a[i] + n[i] * m[i]; + else + out[i] = n[i]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `frintn zd, pg/m, + * zn`. + * T represents the vector type (e.g. zd.s would be float). + * Returns correctly formatted RegisterValue. 
*/ +template +std::enable_if_t, RegisterValue> +sveFrintnPredicated(srcValContainer& sourceValues, const uint16_t VL_bits) { + const T* d = sourceValues[0].getAsVector(); + const uint64_t* p = sourceValues[1].getAsVector(); + const T* n = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + // Get truncation + T trunc = std::trunc(n[i]); + // On tie, round to nearest even + if (std::fabs(n[i] - trunc) == static_cast(0.5)) { + T addand = (trunc > static_cast(0.0)) ? static_cast(1) + : static_cast(-1); + // If odd, add the addand + out[i] = (std::fmod(trunc, static_cast(2.0)) == static_cast(0.0)) + ? trunc + : (trunc + addand); } else { - out[i] = d[i]; + // Else, round to nearest + out[i] = std::round(n[i]); } - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `fsqrt zd, - * pg/m, zn`. - * T represents the type of operands (e.g. for zn.d, T = double). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveFsqrtPredicated_2vecs( - std::vector& operands, const uint16_t VL_bits) { - const T* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); - const T* n = operands[2].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) - out[i] = ::sqrt(n[i]); - else - out[i] = d[i]; - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `inc - * xdn{, pattern{, MUL #imm}}`. - * T represents the type of operation (e.g. for INCB, T = int8_t). - * Returns single value of type int64_t. */ - template - static int64_t sveInc_gprImm( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits) { - const int64_t n = operands[0].get(); - const uint8_t imm = static_cast(metadata.operands[1].imm); - const uint16_t elems = - AuxFunc::sveGetPattern(metadata.operandStr, sizeof(T) * 8, VL_bits); - int64_t out = n + (elems * imm); - return out; - } - - /** Helper function for SVE instructions with the format `inc - * zdn{, pattern{, #imm}}`. - * T represents the type of operands (e.g. for zdn.d, T = int64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveInc_imm( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits) { - const T* n = operands[0].getAsVector(); - const uint8_t imm = static_cast(metadata.operands[1].imm); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - typename std::make_signed::type out[256 / sizeof(T)] = {0}; - const uint16_t elems = - AuxFunc::sveGetPattern(metadata.operandStr, sizeof(T) * 8, VL_bits); - - for (int i = 0; i < partition_num; i++) { - out[i] = n[i] + (elems * imm); - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `incp xdn, pm`. - * T represents the type of operands (e.g. for pm.d, T = uint64_t). - * Returns single value of type uint64_t. 
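The hand-rolled tie handling in sveFrintnPredicated exists because FRINTN requires round-to-nearest, ties-to-even, whereas std::round breaks ties away from zero. A quick comparison that shows the difference on halfway cases (illustrative, not part of this change; std::nearbyint honours the current rounding mode, which defaults to ties-to-even):

```cpp
#include <cfenv>
#include <cmath>
#include <cstdio>

int main() {
  std::fesetround(FE_TONEAREST);  // the default mode: round to nearest, ties to even
  const double xs[] = {0.5, 1.5, 2.5, -0.5, -2.5};
  for (double x : xs) {
    // std::round: ties away from zero. std::nearbyint: ties to even.
    std::printf("x=%5.1f  round=%5.1f  nearbyint=%5.1f\n", x, std::round(x),
                std::nearbyint(x));
  }
  return 0;
}
```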
*/ - template - static uint64_t sveIncp_gpr(std::vector& operands, - const uint16_t VL_bits) { - const uint64_t dn = operands[0].get(); - const uint64_t* p = operands[1].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - uint64_t count = 0; - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) { - count++; + } else { + out[i] = d[i]; + } + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `fsqrt zd, + * pg/m, zn`. + * T represents the type of sourceValues (e.g. for zn.d, T = double). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveFsqrtPredicated_2vecs(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* d = sourceValues[0].getAsVector(); + const uint64_t* p = sourceValues[1].getAsVector(); + const T* n = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) + out[i] = ::sqrt(n[i]); + else + out[i] = d[i]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `inc + * xdn{, pattern{, MUL #imm}}`. + * T represents the type of operation (e.g. for INCB, T = int8_t). + * Returns single value of type int64_t. */ +template +int64_t sveInc_gprImm( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const int64_t n = sourceValues[0].get(); + + const uint8_t imm = static_cast(metadata.operands[2].imm); + const uint16_t elems = getElemsFromPattern( + metadata.operands[1].sysop.alias.svepredpat, sizeof(T) * 8, VL_bits); + int64_t out = n + (elems * imm); + return out; +} + +/** Helper function for SVE instructions with the format `inc + * zdn{, pattern{, #imm}}`. + * T represents the type of sourceValues (e.g. for zdn.d, T = int64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveInc_imm( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); + + const uint8_t imm = static_cast(metadata.operands[2].imm); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + typename std::make_signed::type out[256 / sizeof(T)] = {0}; + const uint16_t elems = getElemsFromPattern( + metadata.operands[1].sysop.alias.svepredpat, sizeof(T) * 8, VL_bits); + + for (int i = 0; i < partition_num; i++) { + out[i] = n[i] + (elems * imm); + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `incp xdn, pm`. + * T represents the type of sourceValues (e.g. for pm.d, T = uint64_t). + * Returns single value of type uint64_t. 
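The element-count helpers above (sveInc_gprImm, sveInc_imm, and later svePtrue and sveUqdec) all defer to getElemsFromPattern for the SVE predicate-constraint patterns. The sketch below is a hypothetical restatement of the architectural pattern rules, keyed by pattern name rather than the Capstone enum the real lookup consumes, purely to make the counts concrete; it is not SimEng's implementation:

```cpp
#include <cstdint>
#include <cstdio>
#include <string>

// Assumed architectural behaviour: ALL = every lane, POW2 = largest power of
// two that fits, VLn = n lanes if they fit (else 0), MUL4/MUL3 = largest
// multiple of 4/3 that fits, anything unrecognised = 0.
uint16_t elemsFromPattern(const std::string& pattern, uint8_t esizeBits,
                          uint16_t vlBits) {
  const uint16_t elems = vlBits / esizeBits;  // lanes available at this size
  if (pattern == "all") return elems;
  if (pattern == "pow2") {
    uint16_t p = 1;
    while (p * 2 <= elems) p *= 2;
    return p;
  }
  if (pattern.rfind("vl", 0) == 0) {
    uint16_t n = static_cast<uint16_t>(std::stoi(pattern.substr(2)));
    return n <= elems ? n : 0;
  }
  if (pattern == "mul4") return elems - (elems % 4);
  if (pattern == "mul3") return elems - (elems % 3);
  return 0;
}

int main() {
  // With VL = 512 bits and 64-bit elements there are 8 lanes:
  std::printf("all=%u pow2=%u vl7=%u vl16=%u mul3=%u\n",
              elemsFromPattern("all", 64, 512), elemsFromPattern("pow2", 64, 512),
              elemsFromPattern("vl7", 64, 512), elemsFromPattern("vl16", 64, 512),
              elemsFromPattern("mul3", 64, 512));  // 8 8 7 0 6
  return 0;
}
```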
*/ +template +uint64_t sveIncp_gpr(srcValContainer& sourceValues, const uint16_t VL_bits) { + const uint64_t dn = sourceValues[0].get(); + const uint64_t* p = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + uint64_t count = 0; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + count++; + } + } + return dn + count; +} + +/** Helper function for SVE instructions with the format `index zd, <#imm, + * rn>, <#imm, rm>`. + * D represents the vector register type (e.g. zd.b would be int8_t). + * N represents the GPR type (e.g. for xn, xm, D = int64). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveIndex( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits, bool op1isImm, bool op2isImm) { + const int op2Index = op1isImm ? 0 : 1; + const auto n = op1isImm ? static_cast(metadata.operands[1].imm) + : static_cast(sourceValues[0].get()); + const auto m = op2isImm ? static_cast(metadata.operands[2].imm) + : static_cast(sourceValues[op2Index].get()); + + const uint16_t partition_num = VL_bits / (sizeof(D) * 8); + D out[256 / sizeof(D)] = {0}; + + for (int i = 0; i < partition_num; i++) { + out[i] = static_cast(n + (i * m)); + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format ` + * pd, pg/z, pn, pm`. + * T represents the type of sourceValues (e.g. for pn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +std::array sveLogicOp_preds( + srcValContainer& sourceValues, const uint16_t VL_bits, + std::function func) { + const uint64_t* p = sourceValues[0].getAsVector(); + const uint64_t* n = sourceValues[1].getAsVector(); + const uint64_t* m = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + std::array out = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + out[i / (64 / sizeof(T))] |= + (func(n[i / (64 / sizeof(T))], m[i / (64 / sizeof(T))]) & + shifted_active); + } + } + return out; +} + +/** Helper function for SVE instructions with the format ` + * zd, pg/m, zn, zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveLogicOpPredicated_3vecs(srcValContainer& sourceValues, + const uint16_t VL_bits, + std::function func) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T* dn = sourceValues[1].getAsVector(); + const T* m = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) + out[i] = func(dn[i], m[i]); + else + out[i] = dn[i]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format ` + * zd, zn, zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. 
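sveLogicOp_preds and sveLogicOpPredicated_3vecs above keep a single generic loop and take the bitwise operation itself as a std::function, so the decode switch can specialise AND, ORR, EOR, BIC and friends with a lambda. A self-contained illustration of that design (not the SimEng call site, which also threads predicates and RegisterValues through):

```cpp
#include <cstdint>
#include <cstdio>
#include <functional>

// One shared loop, specialised per instruction by the supplied operation.
void applyLogicOp(const uint64_t* n, const uint64_t* m, uint64_t* out,
                  int elems, std::function<uint64_t(uint64_t, uint64_t)> op) {
  for (int i = 0; i < elems; i++) out[i] = op(n[i], m[i]);
}

int main() {
  uint64_t n[2] = {0b1100, 0b1010}, m[2] = {0b1010, 0b0110}, out[2];
  applyLogicOp(n, m, out, 2, [](uint64_t a, uint64_t b) { return a & b; });  // AND
  std::printf("and: %llu %llu\n", (unsigned long long)out[0], (unsigned long long)out[1]);
  applyLogicOp(n, m, out, 2, [](uint64_t a, uint64_t b) { return a ^ b; });  // EOR
  std::printf("eor: %llu %llu\n", (unsigned long long)out[0], (unsigned long long)out[1]);
  return 0;
}
```

The trade-off is a small std::function call per element in exchange for one loop per addressing form rather than one per opcode.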
*/ +template +RegisterValue sveLogicOpUnPredicated_3vecs(srcValContainer& sourceValues, + const uint16_t VL_bits, + std::function func) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + for (int i = 0; i < partition_num; i++) { + out[i] = func(n[i], m[i]); + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `lsl sz, zn, #imm`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveLsl_imm( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); + const T imm = static_cast(metadata.operands[2].imm); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + typename std::make_signed::type out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + out[i] = (n[i] << imm); + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `max zdn, zdn, + * #imm`. + * T represents the type of sourceValues (e.g. for zdn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveMax_vecImm( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); + T imm = static_cast(metadata.operands[2].imm); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + out[i] = std::max(n[i], imm); + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `max zdn, pg/m, zdn, + * zm`. + * T represents the type of sourceValues (e.g. for zdn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveMaxPredicated_vecs(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + const T* m = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + out[i] = std::max(n[i], m[i]); + } else + out[i] = n[i]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `fmla zd, pg/m, zn, + * zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveMlaPredicated_vecs(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* d = sourceValues[0].getAsVector(); + const uint64_t* p = sourceValues[1].getAsVector(); + const T* n = sourceValues[2].getAsVector(); + const T* m = sourceValues[3].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) + out[i] = d[i] + (n[i] * m[i]); + else + out[i] = d[i]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `fmla zda, zn, + * zm[index]`. 
+ * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveMlaIndexed_vecs( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const T* d = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + const T* m = sourceValues[2].getAsVector(); + const size_t index = static_cast(metadata.operands[2].vector_index); + + const uint16_t elemsPer128 = 128 / (sizeof(T) * 8); + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (size_t i = 0; i < partition_num; i += elemsPer128) { + const T zm_elem = m[i + index]; + for (size_t j = 0; j < elemsPer128; j++) { + out[i + j] = d[i + j] + (n[i + j] * zm_elem); + } + } + + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `movprfx zd, + * pg/z, zn`. + * T represents the type of sourceValues (e.g. for zd.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveMovprfxPredicated_destToZero(srcValContainer& sourceValues, + const uint16_t VL_bits) { + // TODO: Adopt hint logic of the MOVPRFX instruction + const uint64_t* p = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + out[i] = n[i]; + } else { + out[i] = 0; + } + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `movprfx zd, + * pg/m, zn`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveMovprfxPredicated_destUnchanged(srcValContainer& sourceValues, + const uint16_t VL_bits) { + // TODO: Adopt hint logic of the MOVPRFX instruction + const T* d = sourceValues[0].getAsVector(); + const uint64_t* p = sourceValues[1].getAsVector(); + const T* n = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + out[i] = n[i]; + } else { + out[i] = d[i]; + } + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `mul zdn, pg/m, zdn, + * `. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveMulPredicated( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits, bool useImm) { + bool isFP = std::is_floating_point::value; + const uint64_t* p = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + const T* m; + T imm; + if (useImm) + imm = isFP ? metadata.operands[3].fp : metadata.operands[3].imm; + else + m = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + out[i] = n[i] * (useImm ? 
imm : m[i]); + } else + out[i] = n[i]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `mulh zdn, pg/m, zdn, + * zm`. + * T represents the type of sourceValues (e.g. for zn.s, T = int32_t). + * TT represents the type twice the length of T (e.g. for T = int8_t, TT = + * int16_T). + * Returns correctly formatted RegisterValue. */ +// TODO : Support for int64_t mulh operations. +template +RegisterValue sveMulhPredicated(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + const T* m = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + bool isNeg = false; + T a = n[i]; + T b = m[i]; + if (a < 0) { + isNeg = !isNeg; + a = 0 - a; } - } - return dn + count; - } - - /** Helper function for SVE instructions with the format `index zd, <#imm, - * rn>, <#imm, rm>`. - * D represents the vector register type (e.g. zd.b would be int8_t). - * N represents the GPR type (e.g. for xn, xm, D = int64). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveIndex( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits, bool op1isImm, bool op2isImm) { - const int op2Index = op1isImm ? 0 : 1; - const auto n = op1isImm ? static_cast(metadata.operands[1].imm) - : static_cast(operands[0].get()); - const auto m = op2isImm ? static_cast(metadata.operands[2].imm) - : static_cast(operands[op2Index].get()); - - const uint16_t partition_num = VL_bits / (sizeof(D) * 8); - D out[256 / sizeof(D)] = {0}; - - for (int i = 0; i < partition_num; i++) { - out[i] = static_cast(n + (i * m)); - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format ` - * pd, pg/z, pn, pm`. - * T represents the type of operands (e.g. for pn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static std::array sveLogicOp_preds( - std::vector& operands, const uint16_t VL_bits, - std::function func) { - const uint64_t* p = operands[0].getAsVector(); - const uint64_t* n = operands[1].getAsVector(); - const uint64_t* m = operands[2].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - std::array out = {0}; - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) { - out[i / (64 / sizeof(T))] |= - (func(n[i / (64 / sizeof(T))], m[i / (64 / sizeof(T))]) & - shifted_active); + if (b < 0) { + isNeg = !isNeg; + b = 0 - b; } - } - return out; - } - - /** Helper function for SVE instructions with the format ` - * zd, pg/m, zn, zm`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. 
*/ - template - static RegisterValue sveLogicOpPredicated_3vecs( - std::vector& operands, const uint16_t VL_bits, - std::function func) { - const uint64_t* p = operands[0].getAsVector(); - const T* dn = operands[1].getAsVector(); - const T* m = operands[2].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - for (int i = 0; i < partition_num; i++) { + TT tmp = (static_cast(a) * static_cast(b)); + if (isNeg) tmp = 0 - tmp; + + out[i] = static_cast(tmp >> (sizeof(T) * 8)); + } else + out[i] = n[i]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `orr zd, zn, + * zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveOrr_3vecs(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + out[i] = n[i] | m[i]; + } + return {out, 256}; +} + +/** Helper function for SVE2 instructions with the format `psel pd, pn, + * pm.t[wa, #imm]`. + * T represents the type of sourceValues (e.g. for pm.d, T = + * uint64_t). Returns an array of 4 uint64_t elements. */ +template +std::array svePsel( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const uint64_t* pn = sourceValues[0].getAsVector(); + const uint64_t* pm = sourceValues[1].getAsVector(); + const uint32_t wa = sourceValues[2].get(); + const uint32_t imm = + static_cast(metadata.operands[2].pred.imm_index); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + + uint32_t index = (wa + imm) % partition_num; + uint64_t shifted_active = 1ull << ((index % (64 / sizeof(T))) * sizeof(T)); + + std::array out = {0, 0, 0, 0}; + if (pm[index / (64 / sizeof(T))] & shifted_active) { + out = {pn[0], pn[1], pn[2], pn[3]}; + } + + return out; +} + +/** Helper function for SVE instructions with the format `ptrue pd{, + * pattern}. + * T represents the type of sourceValues (e.g. for pd.d, T = uint64_t). + * Returns an array of 4 uint64_t elements. */ +template +std::array svePtrue( + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + std::array out = {0, 0, 0, 0}; + + // Get pattern + const uint16_t count = getElemsFromPattern( + metadata.operands[1].sysop.alias.svepredpat, sizeof(T) * 8, VL_bits); + // Exit early if count == 0 + if (count == 0) return out; + + for (int i = 0; i < partition_num; i++) { + if (i < count) { uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) - out[i] = func(dn[i], m[i]); - else - out[i] = dn[i]; + out[i / (64 / sizeof(T))] |= shifted_active; } - return {out, 256}; } + return out; +} - /** Helper function for SVE instructions with the format `lsl sz, zn, #imm`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. 
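The MULH path above widens each pair of lanes to the double-width type TT, multiplies, and keeps only the top sizeof(T)*8 bits; the explicit sign juggling lets the same loop serve signed inputs, and the absence of a standard 128-bit type is why the int64_t variant remains a TODO. A scalar check of the high-half idea for 32-bit lanes (illustrative, not part of this change):

```cpp
#include <cstdint>
#include <cstdio>

// High half of a signed 32x32 -> 64-bit product.
int32_t mulh32(int32_t a, int32_t b) {
  int64_t wide = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  return static_cast<int32_t>(wide >> 32);
}

int main() {
  std::printf("%d\n", mulh32(0x40000000, 4));   // 1: the product is exactly 2^32
  std::printf("%d\n", mulh32(-0x40000000, 4));  // -1: sign is preserved
  std::printf("%d\n", mulh32(100000, 100000));  // 2: 1e10 >> 32
  return 0;
}
```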
*/ - template - static RegisterValue sveLsl_imm( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits) { - const T* n = operands[0].getAsVector(); - const T imm = static_cast(metadata.operands[2].imm); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - typename std::make_signed::type out[256 / sizeof(T)] = {0}; - - for (int i = 0; i < partition_num; i++) { - out[i] = (n[i] << imm); - } - return {out, 256}; - } +/** Helper function for SVE instructions with the format `punpk pd.h, + * pn.b`. + * If `isHI` = false, then PUNPKLO is performed. + * Returns an array of 4 uint64_t elements. */ +std::array svePunpk(srcValContainer& sourceValues, + const uint16_t VL_bits, bool isHi) { + const uint64_t* n = sourceValues[0].getAsVector(); - /** Helper function for SVE instructions with the format `max zdn, zdn, - * #imm`. - * T represents the type of operands (e.g. for zdn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveMax_vecImm( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits) { - const T* n = operands[0].getAsVector(); - T imm = static_cast(metadata.operands[2].imm); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - for (int i = 0; i < partition_num; i++) { - out[i] = std::max(n[i], imm); - } - return {out, 256}; - } + const uint16_t partition_num = VL_bits / 8; + std::array out = {0, 0, 0, 0}; + uint16_t index = isHi ? (partition_num / 2) : 0; - /** Helper function for SVE instructions with the format `max zdn, zdn, - * #imm`. - * T represents the type of operands (e.g. for zdn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveMaxPredicated_vecs( - std::vector& operands, const uint16_t VL_bits) { - const T* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); - const T* n = operands[2].getAsVector(); - const T* m = operands[3].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) { - out[i] = std::max(n[i], m[i]); - } else - out[i] = d[i]; - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `fmla zd, pg/m, zn, - * zm`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveMlaPredicated_vecs( - std::vector& operands, const uint16_t VL_bits) { - const T* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); - const T* n = operands[2].getAsVector(); - const T* m = operands[3].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) - out[i] = d[i] + (n[i] * m[i]); - else - out[i] = d[i]; - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `fmla zda, zn, - * zm[index]`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. 
*/ - template - static RegisterValue sveMlaIndexed_vecs( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits) { - const T* d = operands[0].getAsVector(); - const T* n = operands[1].getAsVector(); - const T* m = operands[2].getAsVector(); - const size_t index = static_cast(metadata.operands[2].vector_index); - - const uint16_t elemsPer128 = 128 / (sizeof(T) * 8); - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - for (size_t i = 0; i < partition_num; i += elemsPer128) { - const T zm_elem = m[i + index]; - for (size_t j = 0; j < elemsPer128; j++) { - out[i + j] = d[i + j] + (n[i + j] * zm_elem); - } + for (int i = 0; i < partition_num / 2; i++) { + if (n[index / 64] & 1ull << index % 64) { + out[i / 32] |= 1ull << ((i * 2) % 64); } - - return {out, 256}; + index++; } + return out; +} - /** Helper function for SVE instructions with the format `movprfx zd, - * pg/z, zn`. - * T represents the type of operands (e.g. for zd.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveMovprfxPredicated_destToZero( - std::vector& operands, const uint16_t VL_bits) { - // TODO: Adopt hint logic of the MOVPRFX instruction - const uint64_t* p = operands[0].getAsVector(); - const T* n = operands[1].getAsVector(); +/** Helper function for SVE instructions with the format `rev pd, pn`. + * T represents the type of sourceValues (e.g. for pd.d, T = uint64_t). + * Returns an array of 4 uint64_t elements. */ +template +std::array sveRev_predicates(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint64_t* n = sourceValues[0].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + std::array out = {0, 0, 0, 0}; + uint16_t index = partition_num - 1; + + for (int i = 0; i < partition_num; i++) { + uint64_t rev_shifted_active = 1ull + << ((index % (64 / sizeof(T))) * sizeof(T)); + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + out[index / (64 / (sizeof(T)))] |= + ((n[i / (64 / (sizeof(T)))] & shifted_active) == shifted_active) + ? rev_shifted_active + : 0; + index--; + } + return out; +} + +/** Helper function for SVE instructions with the format `rev zd, zn`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveRev_vecs(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + uint16_t index = partition_num - 1; - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) { - out[i] = n[i]; - } else { - out[i] = 0; - } - } - return {out, 256}; + for (int i = 0; i < partition_num; i++) { + out[i] = n[index]; + index--; } + return {out, 256}; +} - /** Helper function for SVE instructions with the format `movprfx zd, - * pg/m, zn`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. 
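For the indexed form handled by sveMlaIndexed_vecs, the immediate selects an element within each 128-bit segment of zm, and that value is broadcast across its own segment only, which is why the loop strides by elemsPer128. A scalar model of that segment handling (illustrative, not part of this change):

```cpp
#include <cstdio>

int main() {
  const int elemsPer128 = 4;  // float elements per 128-bit segment
  const int elems = 8;        // e.g. a 256-bit vector
  float d[8] = {0}, n[8], m[8], out[8];
  for (int i = 0; i < elems; i++) { n[i] = 1.0f; m[i] = static_cast<float>(i); }

  const int index = 1;        // zm[index] within each segment
  for (int i = 0; i < elems; i += elemsPer128) {
    const float zm_elem = m[i + index];  // m[1] for segment 0, m[5] for segment 1
    for (int j = 0; j < elemsPer128; j++)
      out[i + j] = d[i + j] + n[i + j] * zm_elem;
  }
  for (int i = 0; i < elems; i++) std::printf("%g ", out[i]);  // 1 1 1 1 5 5 5 5
  std::printf("\n");
  return 0;
}
```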
*/ - template - static RegisterValue sveMovprfxPredicated_destUnchanged( - std::vector& operands, const uint16_t VL_bits) { - // TODO: Adopt hint logic of the MOVPRFX instruction - const T* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); - const T* n = operands[2].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) { - out[i] = n[i]; - } else { - out[i] = d[i]; - } - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `mul zdn, pg/m, zdn, - * `. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveMulPredicated( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits, bool useImm) { - bool isFP = std::is_floating_point::value; - const uint64_t* p = operands[0].getAsVector(); - const T* n = operands[1].getAsVector(); - const T* m; - T imm; - if (useImm) - imm = isFP ? metadata.operands[3].fp : metadata.operands[3].imm; +/** Helper function for SVE instructions with the format `sel zd, pg, zn, + * zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveSel_zpzz(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + const T* m = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) + out[i] = n[i]; else - m = operands[2].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) { - out[i] = n[i] * (useImm ? imm : m[i]); - } else - out[i] = n[i]; - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `mulh zdn, pg/m, zdn, - * zm`. - * T represents the type of operands (e.g. for zn.s, T = int32_t). - * TT represents the type twice the length of T (e.g. for T = int8_t, TT = - * int16_T). - * Returns correctly formatted RegisterValue. */ - // TODO : Support for int64_t mulh operations. 
- template - static RegisterValue sveMulhPredicated(std::vector& operands, - const uint16_t VL_bits) { - const uint64_t* p = operands[0].getAsVector(); - const T* n = operands[1].getAsVector(); - const T* m = operands[2].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) { - bool isNeg = false; - T a = n[i]; - T b = m[i]; - if (a < 0) { - isNeg = !isNeg; - a = 0 - a; - } - if (b < 0) { - isNeg = !isNeg; - b = 0 - b; - } - TT tmp = (static_cast(a) * static_cast(b)); - if (isNeg) tmp = 0 - tmp; - - out[i] = static_cast(tmp >> (sizeof(T) * 8)); - } else - out[i] = n[i]; - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `orr zd, zn, - * zm`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveOrr_3vecs(std::vector& operands, - const uint16_t VL_bits) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - for (int i = 0; i < partition_num; i++) { - out[i] = n[i] | m[i]; - } - return {out, 256}; - } - - /** Helper function for SVE2 instructions with the format `psel pd, pn, - * pm.t[wa, #imm]`. - * T represents the type of operands (e.g. for pm.d, T = - * uint64_t). Returns an array of 4 uint64_t elements. */ - template - static std::array svePsel( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata) { - const uint64_t* pn = operands[0].getAsVector(); - const uint64_t* pm = operands[1].getAsVector(); - const uint32_t wa = operands[2].get(); - const uint32_t imm = metadata.operands[2].sme_index.disp; - - uint32_t index = wa + imm; + out[i] = m[i]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `sminv rd, pg, zn`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveSminv(srcValContainer& sourceValues, const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out = std::numeric_limits::max(); + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) out = std::min(out, n[i]); + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `Sub zd, zn, + * zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveSub_3vecs(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + out[i] = n[i] - m[i]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `Sub zdn, pg/m, zdn, + * zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. 
*/ +template +RegisterValue sveSubrPredicated_3vecs(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T* dn = sourceValues[1].getAsVector(); + const T* m = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + out[i] = m[i] - dn[i]; + } else { + out[i] = dn[i]; + } + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `Sub zdn, pg/m, zdn, + * #imm`. + * T represents the type of sourceValues (e.g. for zdn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveSubPredicated_imm( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + bool isFP = std::is_floating_point::value; + const uint64_t* p = sourceValues[0].getAsVector(); + const T* dn = sourceValues[1].getAsVector(); + const auto imm = isFP ? metadata.operands[3].fp : metadata.operands[3].imm; + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + out[i] = dn[i] - imm; + } else { + out[i] = dn[i]; + } + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `sxt zd, pg, + * zn`. + * T represents the type of vector registers (e.g. for zd.d, T = int64_t). + * C represents the type of the cast required - is linked to instruction + * variant used (i.e. sxtw requires int32_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveSxtPredicated(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* d = sourceValues[0].getAsVector(); + const uint64_t* p = sourceValues[1].getAsVector(); + const T* n = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + // Cast to C to get 'least significant sub-element' + // Then cast back to T to sign-extend this 'sub-element' + out[i] = static_cast(static_cast(n[i])); + } else { + out[i] = d[i]; + } + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `trn1 zd, zn, zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveTrn1_3vecs(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < (partition_num / 2); i++) { + out[2 * i] = n[(2 * i)]; + out[(2 * i) + 1] = m[(2 * i)]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `trn2 zd, zn, zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. 
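+ * For example, with four elements per vector, trn2 interleaves the
+ * odd-indexed elements to give {n[1], m[1], n[3], m[3]}; trn1 (above)
+ * interleaves the even-indexed elements to give {n[0], m[0], n[2], m[2]}.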
*/ +template +RegisterValue sveTrn2_3vecs(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < (partition_num / 2); i++) { + out[2 * i] = n[(2 * i) + 1]; + out[(2 * i) + 1] = m[(2 * i) + 1]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `unpk>hi,lo> zd, + * zn`. + * D represents the type of the destination register (e.g. int32_t for + * zd.s). + * N represents the type of the source register (e.g. int8_t for zn.b). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveUnpk_vecs(srcValContainer& sourceValues, + const uint16_t VL_bits, bool isHi) { + const N* n = sourceValues[0].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(D) * 8); + D out[256 / sizeof(D)] = {0}; + + for (int i = 0; i < partition_num; i++) { + int index = isHi ? (partition_num + i) : i; + out[i] = static_cast(n[index]); + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `uqdec + * d{, pattern{, MUL #imm}}`. + * D represents the type of dest. register(e.g. uint32_t for wd). + * N represents the type of the operation (e.g. for UQDECH, N = 16u). + * Returns single value of type uint64_t. */ +template +uint64_t sveUqdec(srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const D d = sourceValues[0].get(); + + const uint8_t imm = metadata.operands[2].imm; + const uint16_t count = getElemsFromPattern( + metadata.operands[1].sysop.alias.svepredpat, N, VL_bits); + + // The range of possible values does not fit in the range of any integral + // type, so a double is used as an intermediate value. The end result must + // be saturated to fit in uint64_t. + auto intermediate = double(d) - (imm * count); + if (intermediate < 0) { + return (uint64_t)0; + } + return (uint64_t)(d - (imm * count)); +} + +/** Helper function for SVE instructions with the format `uzp<1,2> zd, zn, + * zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveUzp_vecs(srcValContainer& sourceValues, const uint16_t VL_bits, + bool isUzp1) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num / 2; i++) { + // UZP1 concatenates even elements. UZP2 concatenates odd. + int index = isUzp1 ? (2 * i) : (2 * i) + 1; + out[i] = n[index]; + } + for (int i = 0; i < partition_num / 2; i++) { + int index = isUzp1 ? (2 * i) : (2 * i) + 1; + out[partition_num / 2 + i] = m[index]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `whilelo pd, + * n, m`. + * T represents the type of sourceValues n and m (e.g. for wn, T = uint32_t). + * P represents the type of operand p (e.g. for pd.b, P = uint8_t). + * Returns tuple of type [pred results (array of 4 uint64_t), nzcv]. 
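+ * For example, with VL_bits = 128 and P = uint32_t (pd.s) there are four
+ * lanes; for n = 1 and m = 3 only lanes 0 and 1 satisfy (n + i) < m, so only
+ * the two lowest-numbered predicate elements are set active.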
*/ +template +std::tuple, uint8_t> sveWhilelo( + srcValContainer& sourceValues, const uint16_t VL_bits, bool calcNZCV) { + const T n = sourceValues[0].get(); + const T m = sourceValues[1].get(); + + const uint16_t partition_num = VL_bits / (sizeof(P) * 8); + std::array out = {0, 0, 0, 0}; + uint16_t index = 0; + + for (int i = 0; i < partition_num; i++) { + // Determine whether lane should be active and shift to align with + // element in predicate register. + uint64_t shifted_active = + (n + i) < m ? 1ull << ((i % (64 / (sizeof(P))) * (sizeof(P)))) : 0; + out[index / (64 / (sizeof(P)))] = + out[index / (64 / (sizeof(P)))] | shifted_active; + index++; + } + // Byte count = sizeof(P) as destination predicate is predicate of P + // bytes. + uint8_t nzcv = calcNZCV ? getNZCVfromPred(out, VL_bits, sizeof(P)) : 0; + return {out, nzcv}; +} + +/** Helper function for SVE instructions with the format `zip<1,2> pd, pn, + * pm`. + * T represents the type of sourceValues (e.g. for pn.d, T = uint64_t). + * Returns an array of 4 uint64_t elements. */ +template +std::array sveZip_preds(srcValContainer& sourceValues, + const uint16_t VL_bits, bool isZip2) { + const uint64_t* n = sourceValues[0].getAsVector(); + const uint64_t* m = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + std::array out = {0, 0, 0, 0}; + + bool interleave = false; + int index = isZip2 ? (partition_num / 2) : 0; + for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((index % (64 / sizeof(T))) * sizeof(T)); - - std::array out = {0, 0, 0, 0}; - if (pm[index / (64 / sizeof(T))] & shifted_active) { - out = {pn[0], pn[1], pn[2], pn[3]}; - } - - return out; - } - - /** Helper function for SVE instructions with the format `ptrue pd{, - * pattern}. - * T represents the type of operands (e.g. for pd.d, T = uint64_t). - * Returns an array of 4 uint64_t elements. */ - template - static std::array svePtrue( - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits) { - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - std::array out = {0, 0, 0, 0}; - - // Get pattern - const uint16_t count = - AuxFunc::sveGetPattern(metadata.operandStr, sizeof(T) * 8, VL_bits); - // Exit early if count == 0 - if (count == 0) return out; - - for (int i = 0; i < partition_num; i++) { - if (i < count) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - out[i / (64 / sizeof(T))] |= shifted_active; - } - } - return out; - } - - /** Helper function for SVE instructions with the format `punpk pd.h, - * pn.b`. - * If `isHI` = false, then PUNPKLO is performed. - * Returns an array of 4 uint64_t elements. */ - static std::array svePunpk(std::vector& operands, - const uint16_t VL_bits, bool isHi) { - const uint64_t* n = operands[0].getAsVector(); - - const uint16_t partition_num = VL_bits / 8; - std::array out = {0, 0, 0, 0}; - uint16_t index = isHi ? (partition_num / 2) : 0; - - for (int i = 0; i < partition_num / 2; i++) { - if (n[index / 64] & 1ull << index % 64) { - out[i / 32] |= 1ull << ((i * 2) % 64); - } + if (interleave) { + out[i / (64 / sizeof(T))] |= + ((m[index / (64 / sizeof(T))] & shifted_active) == shifted_active) + ? static_cast(1ull + << ((i % (64 / sizeof(T))) * sizeof(T))) + : 0; index++; - } - return out; - } - - /** Helper function for SVE instructions with the format `rev pd, pn`. - * T represents the type of operands (e.g. for pd.d, T = uint64_t). - * Returns an array of 4 uint64_t elements. 
*/ - template - static std::array sveRev_predicates( - std::vector& operands, const uint16_t VL_bits) { - const uint64_t* n = operands[0].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - std::array out = {0, 0, 0, 0}; - uint16_t index = partition_num - 1; - - for (int i = 0; i < partition_num; i++) { - uint64_t rev_shifted_active = 1ull - << ((index % (64 / sizeof(T))) * sizeof(T)); - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - out[index / (64 / (sizeof(T)))] |= - ((n[i / (64 / (sizeof(T)))] & shifted_active) == shifted_active) - ? rev_shifted_active + } else { + out[i / (64 / sizeof(T))] |= + ((n[index / (64 / sizeof(T))] & shifted_active) == shifted_active) + ? static_cast(1ull + << ((i % (64 / sizeof(T))) * sizeof(T))) : 0; - index--; - } - return out; - } - - /** Helper function for SVE instructions with the format `rev zd, zn`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveRev_vecs(std::vector& operands, - const uint16_t VL_bits) { - const T* n = operands[0].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - uint16_t index = partition_num - 1; - - for (int i = 0; i < partition_num; i++) { - out[i] = n[index]; - index--; - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `sel zd, pg, zn, - * zm`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveSel_zpzz(std::vector& operands, - const uint16_t VL_bits) { - const uint64_t* p = operands[0].getAsVector(); - const T* n = operands[1].getAsVector(); - const T* m = operands[2].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) - out[i] = n[i]; - else - out[i] = m[i]; - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `sminv rd, pg, zn`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveSminv(std::vector& operands, - const uint16_t VL_bits) { - const uint64_t* p = operands[0].getAsVector(); - const T* n = operands[1].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out = std::numeric_limits::max(); - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) out = std::min(out, n[i]); - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `Sub zd, zn, - * zm`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveSub_3vecs(std::vector& operands, - const uint16_t VL_bits) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - for (int i = 0; i < partition_num; i++) { - out[i] = n[i] - m[i]; - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `Sub zdn, pg/m, zdn, - * zm`. 
- * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveSubrPredicated_3vecs( - std::vector& operands, const uint16_t VL_bits) { - const uint64_t* p = operands[0].getAsVector(); - const T* dn = operands[1].getAsVector(); - const T* m = operands[2].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) { - out[i] = m[i] - dn[i]; - } else { - out[i] = dn[i]; - } - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `Sub zdn, pg/m, zdn, - * #imm`. - * T represents the type of operands (e.g. for zdn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveSubPredicated_imm( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits) { - bool isFP = std::is_floating_point::value; - const uint64_t* p = operands[0].getAsVector(); - const T* dn = operands[1].getAsVector(); - const auto imm = isFP ? metadata.operands[3].fp : metadata.operands[3].imm; - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) { - out[i] = dn[i] - imm; - } else { - out[i] = dn[i]; - } - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `sxt zd, pg, - * zn`. - * T represents the type of vector registers (e.g. for zd.d, T = int64_t). - * C represents the type of the cast required - is linked to instruction - * variant used (i.e. sxtw requires int32_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveSxtPredicated(std::vector& operands, - const uint16_t VL_bits) { - const T* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); - const T* n = operands[2].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (p[i / (64 / sizeof(T))] & shifted_active) { - // Cast to C to get 'least significant sub-element' - // Then cast back to T to sign-extend this 'sub-element' - out[i] = static_cast(static_cast(n[i])); - } else { - out[i] = d[i]; - } - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `trn1 zd, zn, zm`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveTrn1_3vecs(std::vector& operands, - const uint16_t VL_bits) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - for (int i = 0; i < (partition_num / 2); i++) { - out[2 * i] = n[(2 * i)]; - out[(2 * i) + 1] = m[(2 * i)]; } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `trn2 zd, zn, zm`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. 
*/ - template - static RegisterValue sveTrn2_3vecs(std::vector& operands, - const uint16_t VL_bits) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - for (int i = 0; i < (partition_num / 2); i++) { - out[2 * i] = n[(2 * i) + 1]; - out[(2 * i) + 1] = m[(2 * i) + 1]; - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `unpk>hi,lo> zd, - * zn`. - * D represents the type of the destination register (e.g. int32_t for - * zd.s). - * N represents the type of the source register (e.g. int8_t for zn.b). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveUnpk_vecs(std::vector& operands, - const uint16_t VL_bits, bool isHi) { - const N* n = operands[0].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(D) * 8); - D out[256 / sizeof(D)] = {0}; - - for (int i = 0; i < partition_num; i++) { - int index = isHi ? (partition_num + i) : i; - out[i] = static_cast(n[index]); - } - return {out, 256}; - } - - /** Helper function for SVE instructions with the format `uqdec - * d{, pattern{, MUL #imm}}`. - * D represents the type of dest. register(e.g. uint32_t for wd). - * N represents the type of the operation (e.g. for UQDECH, N = 16u). - * Returns single value of type uint64_t. */ - template - static uint64_t sveUqdec( - std::vector& operands, - const simeng::arch::aarch64::InstructionMetadata& metadata, - const uint16_t VL_bits) { - const D d = operands[0].get(); - const uint8_t imm = metadata.operands[1].imm; - const uint16_t count = - AuxFunc::sveGetPattern(metadata.operandStr, N, VL_bits); - - // The range of possible values does not fit in the range of any integral - // type, so a double is used as an intermediate value. The end result must - // be saturated to fit in uint64_t. - auto intermediate = double(d) - (imm * count); - if (intermediate < 0) { - return (uint64_t)0; - } - return (uint64_t)(d - (imm * count)); - } - - /** Helper function for SVE instructions with the format `uzp<1,2> zd, zn, - * zm`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveUzp_vecs(std::vector& operands, - const uint16_t VL_bits, bool isUzp1) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - for (int i = 0; i < partition_num / 2; i++) { - // UZP1 concatenates even elements. UZP2 concatenates odd. - int index = isUzp1 ? (2 * i) : (2 * i) + 1; + interleave = !interleave; + } + return out; +} + +/** Helper function for SVE instructions with the format `zip<1,2> zd, zn, + * zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveZip_vecs(srcValContainer& sourceValues, const uint16_t VL_bits, + bool isZip2) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + bool interleave = false; + int index = isZip2 ? (partition_num / 2) : 0; + for (int i = 0; i < partition_num; i++) { + if (interleave) { + out[i] = m[index]; + index++; + } else { out[i] = n[index]; } - for (int i = 0; i < partition_num / 2; i++) { - int index = isUzp1 ? 
(2 * i) : (2 * i) + 1; - out[partition_num / 2 + i] = m[index]; + interleave = !interleave; + } + return {out, 256}; +} + +/** Helper function for SVE instructions store instructions to merge + * consecutive active elements into blocks to be written. + * T represents the size of the vector elements (e.g. for zn.d, T = uint64_t). + * C represents the size of the memory elements (e.g. for st1w, C = uint32_t). + * Return a vector of RegisterValues. */ +template +std::vector sve_merge_store_data(const T* d, const uint64_t* p, + uint16_t vl_bits) { + std::vector outputData; + + uint16_t numVecElems = (vl_bits / (8 * sizeof(T))); + // Determine how many predicate elements are present per uint64_t. + uint16_t predsPer64 = (64 / sizeof(T)); + + // Determine size of array based on the size of the memory access (This is + // the C specifier in sve instructions) + std::array mData; + uint16_t mdSize = 0; + + for (uint16_t x = 0; x < numVecElems; x++) { + // Determine mask to get predication for active element. + uint64_t shiftedActive = 1ull << ((x % predsPer64) * sizeof(T)); + if (p[x / predsPer64] & shiftedActive) { + mData[mdSize] = static_cast(d[x]); + mdSize++; + } else if (mdSize) { + outputData.push_back( + RegisterValue((char*)mData.data(), mdSize * sizeof(C))); + mdSize = 0; } - return {out, 256}; } - - /** Helper function for SVE instructions with the format `whilelo pd, - * n, m`. - * T represents the type of operands n and m (e.g. for wn, T = uint32_t). - * P represents the type of operand p (e.g. for pd.b, P = uint8_t). - * Returns tuple of type [pred results (array of 4 uint64_t), nzcv]. */ - template - static std::tuple, uint8_t> sveWhilelo( - std::vector& operands, const uint16_t VL_bits, - bool calcNZCV) { - const T n = operands[0].get(); - const T m = operands[1].get(); - - const uint16_t partition_num = VL_bits / (sizeof(P) * 8); - std::array out = {0, 0, 0, 0}; - uint16_t index = 0; - - for (int i = 0; i < partition_num; i++) { - // Determine whether lane should be active and shift to align with - // element in predicate register. - uint64_t shifted_active = - (n + i) < m ? 1ull << ((i % (64 / (sizeof(P))) * (sizeof(P)))) : 0; - out[index / (64 / (sizeof(P)))] = - out[index / (64 / (sizeof(P)))] | shifted_active; - index++; - } - // Byte count = sizeof(P) as destination predicate is predicate of P - // bytes. - uint8_t nzcv = - calcNZCV ? AuxFunc::getNZCVfromPred(out, VL_bits, sizeof(P)) : 0; - return {out, nzcv}; - } - - /** Helper function for SVE instructions with the format `zip<1,2> pd, pn, - * pm`. - * T represents the type of operands (e.g. for pn.d, T = uint64_t). - * Returns an array of 4 uint64_t elements. */ - template - static std::array sveZip_preds( - std::vector& operands, const uint16_t VL_bits, - bool isZip2) { - const uint64_t* n = operands[0].getAsVector(); - const uint64_t* m = operands[1].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - std::array out = {0, 0, 0, 0}; - - bool interleave = false; - int index = isZip2 ? (partition_num / 2) : 0; - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull - << ((index % (64 / sizeof(T))) * sizeof(T)); - if (interleave) { - out[i / (64 / sizeof(T))] |= - ((m[index / (64 / sizeof(T))] & shifted_active) == shifted_active) - ? static_cast(1ull - << ((i % (64 / sizeof(T))) * sizeof(T))) - : 0; - index++; - } else { - out[i / (64 / sizeof(T))] |= - ((n[index / (64 / sizeof(T))] & shifted_active) == shifted_active) - ? 
static_cast(1ull - << ((i % (64 / sizeof(T))) * sizeof(T))) - : 0; - } - interleave = !interleave; - } - return out; + if (mdSize) { + outputData.push_back( + RegisterValue((char*)mData.data(), mdSize * sizeof(C))); } + return outputData; +} - /** Helper function for SVE instructions with the format `zip<1,2> zd, zn, - * zm`. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Returns correctly formatted RegisterValue. */ - template - static RegisterValue sveZip_vecs(std::vector& operands, - const uint16_t VL_bits, bool isZip2) { - const T* n = operands[0].getAsVector(); - const T* m = operands[1].getAsVector(); - - const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out[256 / sizeof(T)] = {0}; - - bool interleave = false; - int index = isZip2 ? (partition_num / 2) : 0; - for (int i = 0; i < partition_num; i++) { - if (interleave) { - out[i] = m[index]; - index++; - } else { - out[i] = n[index]; - } - interleave = !interleave; - } - return {out, 256}; - } - - /** Helper function for SVE instructions store instructions to merge - * consecutive active elements into blocks to be written. - * T represents the type of operands (e.g. for zn.d, T = uint64_t). - * Return a vector of RegisterValues. */ - template - static std::vector sve_merge_store_data(const T* d, - const uint64_t* p, - uint16_t vl_bits) { - std::vector outputData; - - uint16_t numVecElems = (vl_bits / (8 * sizeof(T))); - // Determine how many predicate elements are present per uint64_t. - uint16_t predsPer64 = (64 / sizeof(T)); - - // Determine size of array based on the size of the stored element (This is - // the T specifier in sve instructions) - std::array mData; - uint16_t mdSize = 0; - - for (uint16_t x = 0; x < numVecElems; x++) { - // Determine mask to get predication for active element. - uint64_t shiftedActive = 1ull << ((x % predsPer64) * sizeof(T)); - if (p[x / predsPer64] & shiftedActive) { - mData[mdSize] = d[x]; - mdSize++; - } else if (mdSize) { - outputData.push_back( - RegisterValue((char*)mData.data(), mdSize * sizeof(T))); - mdSize = 0; - } - } - if (mdSize) { - outputData.push_back( - RegisterValue((char*)mData.data(), mdSize * sizeof(T))); - } - return outputData; - } -}; } // namespace aarch64 } // namespace arch } // namespace simeng diff --git a/src/include/simeng/arch/aarch64/operandContainer.hh b/src/include/simeng/arch/aarch64/operandContainer.hh new file mode 100644 index 0000000000..c73b8881da --- /dev/null +++ b/src/include/simeng/arch/aarch64/operandContainer.hh @@ -0,0 +1,94 @@ +#pragma once + +#include +#include +#include +#include + +namespace simeng { +namespace arch { +namespace aarch64 { + +/** The maximum number of source registers a non-SME instruction can have. */ +const uint8_t MAX_SOURCE_REGISTERS = 6; + +/** The maximum number of destination registers a non-SME instruction can have. + */ +const uint8_t MAX_DESTINATION_REGISTERS = 5; + +/** The maximum number of source/destination operands an SME instruction can + * have in addition to any ZA operands. */ +const uint8_t ADDITIONAL_SME_REGISTERS = 11; + +/** Simple class to allow AArch64 instructions to use std::array for operands in + * most cases, but for SME instructions a std::vector can be utilised to allow + * for the increased number of operands used. */ +template +class operandContainer { + public: + /** Make enough space for an SME operand in container - stop using fixed size + * array and instead use vector. 
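+   * A rough usage sketch (variable names here are illustrative only, not
+   * part of this patch):
+   *   operandContainer<RegisterValue> values;  // starts on the std::array member
+   *   values.addSMEOperand(numZARows);         // switches storage to std::vector
+   *   values.resize(finalRegisterCount);       // later trims any unused slots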
*/ + constexpr void addSMEOperand(const uint16_t numSMERows) { + if (std::holds_alternative>(var_)) { + // Get values in array + auto arr = std::get>(var_); + // Place into vector + var_ = std::vector{arr.begin(), arr.end()}; + // Re-size vector to accomodate SME instruction - make sure to keep all + // current operands and make space for any additional operands that can be + // present with SME instructions + std::get>(var_).resize( + arr.size() + ADDITIONAL_SME_REGISTERS + numSMERows); + } else { + // std::vector already in use; only need to allocate enough room for + // additional SME operand. + this->resize(this->size() + numSMERows); + } + } + + /** Resize the vector to be the same size as `numRegs`. Primarily used to + * ensure any unused vector indexes introduced in addSMEOperand() are removed. + */ + constexpr void resize(uint16_t numRegs) { + assert(std::holds_alternative>(var_) && + "resize can only be called when the active member is std::vector " + "(i.e. after a call to addSMEOperand() has been made)"); + std::get>(var_).resize(numRegs); + } + + /** Get the size of the currently active data structure. */ + [[nodiscard]] constexpr size_t size() const { + return std::visit([](auto&& arg) -> size_t { return arg.size(); }, var_); + } + + /** Implementation of the [] operator to apply to the currently active variant + * member. */ + [[nodiscard]] constexpr const T& operator[](size_t idx) const { + return std::visit([=](auto&& arg) -> const T& { return (arg[idx]); }, var_); + } + + /** Implementation of the [] operator to apply to the currently active variant + * member. */ + [[nodiscard]] constexpr T& operator[](size_t idx) { + return std::visit([=](auto&& arg) -> T& { return (arg[idx]); }, var_); + } + + /** Retrieve the underlying pointer of the active variant member. */ + [[nodiscard]] constexpr const T* data() const noexcept { + return std::visit([](auto&& arg) -> const T* { return arg.data(); }, var_); + } + + /** Retrieve the underlying pointer of the active variant member. */ + [[nodiscard]] constexpr T* data() noexcept { + return std::visit([](auto&& arg) -> T* { return arg.data(); }, var_); + } + + private: + /** Variant containing a fixed size array (used by default) and a vector, the + * latter of which can be utilised by calling addSMEOperand(). */ + std::variant, std::vector> var_; +}; + +} // namespace aarch64 +} // namespace arch +} // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/riscv/ArchInfo.hh b/src/include/simeng/arch/riscv/ArchInfo.hh new file mode 100644 index 0000000000..2d3ea0e238 --- /dev/null +++ b/src/include/simeng/arch/riscv/ArchInfo.hh @@ -0,0 +1,98 @@ +#pragma once + +#include "simeng/arch/ArchInfo.hh" + +namespace simeng { +namespace arch { +namespace riscv { + +// A temporary enum to hold system register addresses +// TODO this should be removed upon relevant capstone updates +typedef enum riscv_sysreg { + RISCV_SYSREG_FFLAGS = 0x001, + RISCV_SYSREG_FRM = 0x002, + RISCV_SYSREG_FCSR = 0x003, + + RISCV_SYSREG_CYCLE = 0xC00, + RISCV_SYSREG_TIME = 0xC01, + RISCV_SYSREG_INSTRET = 0xC02, + +} riscv_sysreg; + +// A struct of RISC-V specific constants +namespace constantsPool { +const uint8_t addressAlignMask = 0x3; +const uint8_t addressAlignMaskCompressed = 0x1; +const uint8_t minInstWidthBytes = 4; +const uint8_t minInstWidthBytesCompressed = 2; +} // namespace constantsPool + +/** A class to hold and generate riscv specific architecture configuration + * options. 
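+ * (A hedged construction sketch, mirroring the default arguments used
+ * elsewhere in this patch: `ArchInfo info(config::SimInfo::getConfig());`.)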
*/ +class ArchInfo : public simeng::arch::ArchInfo { + public: + ArchInfo(ryml::ConstNodeRef config) + : sysRegisterEnums_( + {riscv_sysreg::RISCV_SYSREG_FFLAGS, riscv_sysreg::RISCV_SYSREG_FRM, + riscv_sysreg::RISCV_SYSREG_FCSR, riscv_sysreg::RISCV_SYSREG_CYCLE, + riscv_sysreg::RISCV_SYSREG_TIME, + riscv_sysreg::RISCV_SYSREG_INSTRET}), + archRegStruct_({{8, 32}, + {8, 32}, + {8, static_cast(sysRegisterEnums_.size())}}) { + // Generate the config-defined physical register structure and quantities + ryml::ConstNodeRef regConfig = config["Register-Set"]; + uint16_t gpCount = regConfig["GeneralPurpose-Count"].as(); + uint16_t fpCount = regConfig["FloatingPoint-Count"].as(); + physRegStruct_ = {{8, gpCount}, + {8, fpCount}, + {8, static_cast(sysRegisterEnums_.size())}}; + physRegQuantities_ = {gpCount, fpCount, + static_cast(sysRegisterEnums_.size())}; + } + + /** Get the set of system register enums currently supported. */ + const std::vector& getSysRegEnums() const override { + return sysRegisterEnums_; + } + + /** Get the structure of the architecture register fileset(s). */ + const std::vector& getArchRegStruct() + const override { + return archRegStruct_; + } + + /** Get the structure of the physical register fileset(s) as defined in the + * simulation configuration. */ + const std::vector& getPhysRegStruct() + const override { + return physRegStruct_; + } + + /** Get the quantities of the physical register in each fileset as defined in + * the simulation configuration. */ + const std::vector& getPhysRegQuantities() const override { + return physRegQuantities_; + } + + private: + /** The vector of all system register Capstone enum values used in the + * associated Architecture class. */ + const std::vector sysRegisterEnums_; + + /** The structure of the architectural register filesets within the + * implemented aarch64 architecture. */ + std::vector archRegStruct_; + + /** The structure of the physical register filesets within the + * implemented aarch64 architecture. */ + std::vector physRegStruct_; + + /** The quantities of the physical register within each filesets of the + * implemented aarch64 architecture. */ + std::vector physRegQuantities_; +}; + +} // namespace riscv +} // namespace arch +} // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/riscv/Architecture.hh b/src/include/simeng/arch/riscv/Architecture.hh index d77184b93a..062654560a 100644 --- a/src/include/simeng/arch/riscv/Architecture.hh +++ b/src/include/simeng/arch/riscv/Architecture.hh @@ -6,7 +6,6 @@ #include "simeng/arch/Architecture.hh" #include "simeng/arch/riscv/ExceptionHandler.hh" #include "simeng/arch/riscv/Instruction.hh" -#include "simeng/kernel/Linux.hh" using csh = size_t; @@ -17,31 +16,28 @@ namespace riscv { /* A basic RISC-V implementation of the `Architecture` interface. */ class Architecture : public arch::Architecture { public: - Architecture(kernel::Linux& kernel, YAML::Node config); + Architecture(kernel::Linux& kernel, + ryml::ConstNodeRef config = config::SimInfo::getConfig()); + ~Architecture(); + /** Pre-decode instruction memory into a macro-op of `Instruction` - * instances. Returns the number of bytes consumed to produce it (always 4), - * and writes into the supplied macro-op vector. */ - uint8_t predecode(const void* ptr, uint8_t bytesAvailable, + * instances. Returns the number of bytes consumed to produce it (0 if + * failure), and writes into the supplied macro-op vector. 
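+   * As a hedged calling sketch (buffer names assumed, not part of this
+   * patch): a fetch stage would pass a pointer into fetched instruction
+   * memory plus the bytes remaining, e.g.
+   * `predecode(buffer, bytesLeft, pc, macroOp, disasm)`, and treat a return
+   * value of 0 as an undecodable word.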
*/ + uint8_t predecode(const uint8_t* ptr, uint16_t bytesAvailable, uint64_t instructionAddress, MacroOp& output, std::string& disasm) const override; - /** Returns an RISC-V register file structure description. */ - std::vector getRegisterFileStructures() const override; - /** Returns a zero-indexed register tag for a system register encoding. */ int32_t getSystemRegisterTag(uint16_t reg) const override; - /** Returns the number of system registers that have a mapping. */ - uint16_t getNumSystemRegisters() const override; - /** Create an exception handler for the exception generated by `instruction`, * providing the core model object and a reference to process memory. * Returns a smart pointer to an `ExceptionHandler` which may be ticked until * the exception is resolved, and results then obtained. */ std::shared_ptr handleException( const std::shared_ptr& instruction, const Core& core, - MemoryInterface& memory) const override; + memory::MemoryInterface& memory) const override; /** Retrieve the initial process state. */ ProcessStateChange getInitialState() const override; @@ -49,52 +45,40 @@ class Architecture : public arch::Architecture { /** Returns the maximum size of a valid instruction in bytes. */ uint8_t getMaxInstructionSize() const override; + /** Returns the minimum size of a valid instruction in bytes. */ + uint8_t getMinInstructionSize() const override; + /** Updates System registers of any system-based timers. */ void updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations) const override; - /** Returns the physical register structure as defined within the config file - */ - std::vector getConfigPhysicalRegisterStructure( - YAML::Node config) const override; - - /** Returns the physical register quantities as defined within the config file - */ - std::vector getConfigPhysicalRegisterQuantities( - YAML::Node config) const override; - private: - /** Retrieve an executionInfo object for the requested instruction. If a + /** Retrieve an ExecutionInfo object for the requested instruction. If a * opcode-based override has been defined for the latency and/or * port information, return that instead of the group-defined execution * information. */ - executionInfo getExecutionInfo(Instruction& insn) const; + ExecutionInfo getExecutionInfo(const Instruction& insn) const; /** A decoding cache, mapping an instruction word to a previously decoded * instruction. Instructions are added to the cache as they're decoded, to * reduce the overhead of future decoding. */ - static std::unordered_map decodeCache; + mutable std::unordered_map decodeCache_; + + mutable std::unordered_map disasmCache; + /** A decoding metadata cache, mapping an instruction word to a previously * decoded instruction metadata bundle. Metadata is added to the cache as it's * decoded, to reduce the overhead of future decoding. */ - static std::forward_list metadataCache; - - /** A mapping from system register encoding to a zero-indexed tag. */ - std::unordered_map systemRegisterMap_; - - /** A map to hold the relationship between aarch64 instruction groups and - * user-defined execution information. */ - std::unordered_map groupExecutionInfo_; + mutable std::forward_list metadataCache_; - /** A map to hold the relationship between aarch64 instruction opcode and - * user-defined execution information. */ - std::unordered_map opcodeExecutionInfo_; + /** System Register of Processor Cycle Counter. 
*/ + simeng::Register cycleSystemReg_; - /** A Capstone decoding library handle, for decoding instructions. */ - csh capstoneHandle; + /** A mask used to determine if an address has the correct byte alignment */ + uint8_t addressAlignmentMask_; - /** A reference to a Linux kernel object to forward syscalls to. */ - kernel::Linux& linux_; + /** Minimum number of bytes that can represent an instruction */ + uint8_t minInsnLength_; }; } // namespace riscv diff --git a/src/include/simeng/arch/riscv/ExceptionHandler.hh b/src/include/simeng/arch/riscv/ExceptionHandler.hh index 02d29c93bb..f28e6f3929 100644 --- a/src/include/simeng/arch/riscv/ExceptionHandler.hh +++ b/src/include/simeng/arch/riscv/ExceptionHandler.hh @@ -16,7 +16,7 @@ class ExceptionHandler : public simeng::arch::ExceptionHandler { /** Create an exception handler with references to the instruction that caused * the exception, along with the core model object and process memory. */ ExceptionHandler(const std::shared_ptr& instruction, - const Core& core, MemoryInterface& memory, + const Core& core, memory::MemoryInterface& memory, kernel::Linux& linux); /** Progress handling of the exception, by calling and returning the result of @@ -59,7 +59,7 @@ class ExceptionHandler : public simeng::arch::ExceptionHandler { bool firstCall = true); /** A data buffer used for reading data from memory. */ - std::vector dataBuffer; + std::vector dataBuffer_; /** Performs a readlinkat syscall using the path supplied. */ void readLinkAt(span path); @@ -75,10 +75,10 @@ class ExceptionHandler : public simeng::arch::ExceptionHandler { const Instruction& instruction_; /** The core model object. */ - const Core& core; + const Core& core_; /** The process memory. */ - MemoryInterface& memory_; + memory::MemoryInterface& memory_; /** The Linux kernel to forward syscalls to. */ kernel::Linux& linux_; @@ -96,6 +96,16 @@ class ExceptionHandler : public simeng::arch::ExceptionHandler { static constexpr Register R3 = {RegisterType::GENERAL, 13}; static constexpr Register R4 = {RegisterType::GENERAL, 14}; static constexpr Register R5 = {RegisterType::GENERAL, 15}; + + /** Let the following ExceptionHandlerTest derived classes be a friend of this + * class to allow proper testing of `readStringThen()`, `readBufferThen()` and + * `printException()` functions. */ + friend class RiscVExceptionHandlerTest_readStringThen_Test; + friend class RiscVExceptionHandlerTest_readStringThen_maxLen0_Test; + friend class RiscVExceptionHandlerTest_readStringThen_maxLenReached_Test; + friend class RiscVExceptionHandlerTest_readBufferThen_Test; + friend class RiscVExceptionHandlerTest_readBufferThen_length0_Test; + friend class RiscVExceptionHandlerTest_printException_Test; }; } // namespace riscv diff --git a/src/include/simeng/arch/riscv/Instruction.hh b/src/include/simeng/arch/riscv/Instruction.hh index 8ee390f94c..62b6c0b8b5 100644 --- a/src/include/simeng/arch/riscv/Instruction.hh +++ b/src/include/simeng/arch/riscv/Instruction.hh @@ -1,11 +1,13 @@ #pragma once #include +#include +#include #include -#include "simeng/BranchPredictor.hh" #include "simeng/Instruction.hh" #include "simeng/arch/riscv/InstructionGroups.hh" +#include "simeng/branchpredictors/BranchPredictor.hh" namespace simeng { namespace arch { @@ -21,35 +23,61 @@ const uint8_t GENERAL = 0; const uint8_t FLOAT = 1; /** The system registers. */ const uint8_t SYSTEM = 2; -} // namespace RegisterType - -/** A struct holding user-defined execution information for a aarch64 - * instruction. 
*/ -struct executionInfo { - /** The latency for the instruction. */ - uint16_t latency = 1; - - /** The execution throughput for the instruction. */ - uint16_t stallCycles = 1; - /** The ports that support the instruction. */ - std::vector ports = {}; -}; +/** A special register value representing the zero register. */ +const Register ZERO_REGISTER = {GENERAL, (uint16_t)0}; +} // namespace RegisterType /** The various exceptions that can be raised by an individual instruction. */ enum class InstructionException { None = 0, EncodingUnallocated, - EncodingNotYetImplemented, ExecutionNotYetImplemented, + AliasNotYetImplemented, MisalignedPC, DataAbort, SupervisorCall, HypervisorCall, SecureMonitorCall, - NoAvailablePort + NoAvailablePort, + IllegalInstruction, + PipelineFlush +}; + +// RISC-V Instruction Identifier Masks +enum class InsnType : uint16_t { + /** Is this a store operation? */ + isStore = 1 << 0, + /** Is this a load operation? */ + isLoad = 1 << 1, + /** Is this a branch operation? */ + isBranch = 1 << 2, + /** Is this a multiply operation? */ + isMultiply = 1 << 3, + /** Is this a divide operation? */ + isDivide = 1 << 4, + /** Is this a shift operation? */ + isShift = 1 << 5, + /** Is this an atomic instruction? */ + isAtomic = 1 << 6, + /** Is this a logical instruction? */ + isLogical = 1 << 7, + /** Is this a compare instruction? */ + isCompare = 1 << 8, + /** Is this a floating point operation? */ + isFloat = 1 << 9, + /** Is this a floating point <-> integer convert operation? */ + isConvert = 1 << 10, }; +/** The maximum number of source registers any supported RISC-V instruction + * can have. */ +const uint8_t MAX_SOURCE_REGISTERS = 3; + +/** The maximum number of destination registers any supported RISC-V + * instruction can have. */ +const uint8_t MAX_DESTINATION_REGISTERS = 1; + /** A basic RISC-V implementation of the `Instruction` interface. */ class Instruction : public simeng::Instruction { public: @@ -58,58 +86,45 @@ class Instruction : public simeng::Instruction { Instruction(const Architecture& architecture, const InstructionMetadata& metadata); - /** Construct an instruction instance by decoding a provided instruction word. - */ - Instruction(const Architecture& architecture, - const InstructionMetadata& metadata, uint8_t latency, - uint8_t stallCycles); - /** Construct an instruction instance that raises an exception. */ Instruction(const Architecture& architecture, const InstructionMetadata& metadata, InstructionException exception); - /** Retrieve the identifier for the first exception that occurred during - * processing this instruction. */ - virtual InstructionException getException() const; - /** Retrieve the source registers this instruction reads. */ - const span getOperandRegisters() const override; + const span getSourceRegisters() const override; + + /** Retrieve the data contained in the source registers this instruction + * reads.*/ + const span getSourceOperands() const override; /** Retrieve the destination registers this instruction will write to. * A register value of -1 signifies a Zero Register read, and should not be * renamed. */ const span getDestinationRegisters() const override; - /** Check whether the operand at index `i` has had a value supplied. */ - bool isOperandReady(int index) const override; - /** Override the specified source register with a renamed physical register. 
*/ - void renameSource(uint8_t i, Register renamed) override; + void renameSource(uint16_t i, Register renamed) override; /** Override the specified destination register with a renamed physical * register. */ - void renameDestination(uint8_t i, Register renamed) override; + void renameDestination(uint16_t i, Register renamed) override; /** Provide a value for the operand at the specified index. */ - virtual void supplyOperand(uint8_t i, const RegisterValue& value) override; - - /** Check whether all operand values have been supplied, and the instruction - * is ready to execute. */ - bool canExecute() const override; + void supplyOperand(uint16_t i, const RegisterValue& value) override; - /** Execute the instruction. */ - void execute() override; + /** Check whether the operand at index `i` has had a value supplied. */ + bool isOperandReady(int index) const override; /** Retrieve register results. */ const span getResults() const override; /** Generate memory addresses this instruction wishes to access. */ - span generateAddresses() override; + span generateAddresses() override; /** Retrieve previously generated memory addresses. */ - span getGeneratedAddresses() const override; + span getGeneratedAddresses() const override; /** Provide data from a requested memory address. */ void supplyData(uint64_t address, const RegisterValue& data) override; @@ -117,11 +132,6 @@ class Instruction : public simeng::Instruction { /** Retrieve supplied memory data. */ span getData() const override; - /** Early misprediction check; see if it's possible to determine whether the - * next instruction address was mispredicted without executing the - * instruction. */ - std::tuple checkEarlyBranchMisprediction() const override; - /** Retrieve branch type. */ BranchType getBranchType() const override; @@ -142,116 +152,100 @@ class Instruction : public simeng::Instruction { /** Is this a branch operation? */ bool isBranch() const override; - /** Is this an atomic instruction? */ - bool isAtomic() const; - /** Retrieve the instruction group this instruction belongs to. */ uint16_t getGroup() const override; - /** Set this instruction's execution information including it's execution - * latency and throughput, and the set of ports which support it. */ - void setExecutionInfo(const executionInfo& info); + /** Check whether all operand values have been supplied, and the instruction + * is ready to execute. */ + bool canExecute() const override; + + /** Execute the instruction. */ + void execute() override; /** Get this instruction's supported set of ports. */ const std::vector& getSupportedPorts() override; + /** Set this instruction's execution information including it's execution + * latency and throughput, and the set of ports which support it. */ + void setExecutionInfo(const ExecutionInfo& info) override; + /** Retrieve the instruction's metadata. */ const InstructionMetadata& getMetadata() const; - /** A special register value representing the zero register. If passed to - * `setSourceRegisters`/`setDestinationRegisters`, the value will be - * automatically supplied as zero. */ - static const Register ZERO_REGISTER; + /** Retrieve the instruction's associated architecture. */ + const Architecture& getArchitecture() const; + + /** Retrieve the identifier for the first exception that occurred during + * processing this instruction. */ + InstructionException getException() const; private: - /** The maximum number of source registers any supported RISC-V instruction - * can have. 
*/ - static const uint8_t MAX_SOURCE_REGISTERS = 2; - /** The maximum number of destination registers any supported RISC-V - * instruction can have. */ - static const uint8_t MAX_DESTINATION_REGISTERS = 1; + /** Process the instruction's metadata to determine source/destination + * registers. */ + void decode(); + + /** Update the instruction's identifier with an additional field. */ + constexpr void setInstructionType(InsnType identifier) { + instructionIdentifier_ |= + static_cast>(identifier); + } + + /** Tests whether this instruction has the given identifier set. */ + constexpr bool isInstruction(InsnType identifier) const { + return (instructionIdentifier_ & + static_cast>(identifier)); + } + + /** For instructions with a valid rm field, extract the rm value and change + * the CPP rounding mode accordingly, then call the function "operation" + * before reverting the CPP rounding mode to its initial value. "Operation" + * should contain the entire execution logic of the instruction */ + void setStaticRoundingModeThen(std::function operation); + + /** Generate an ExecutionNotYetImplemented exception. */ + void executionNYI(); /** A reference to the ISA instance this instruction belongs to. */ const Architecture& architecture_; /** A reference to the decoding metadata for this instruction. */ - const InstructionMetadata& metadata; + const InstructionMetadata& metadata_; /** An array of source registers. */ - std::array sourceRegisters; + std::array sourceRegisters_; + /** The number of source registers this instruction reads from. */ - uint8_t sourceRegisterCount = 0; + uint8_t sourceRegisterCount_ = 0; /** An array of destination registers. */ - std::array destinationRegisters; + std::array destinationRegisters_; + /** The number of destination registers this instruction writes to. */ - uint8_t destinationRegisterCount = 0; + uint8_t destinationRegisterCount_ = 0; /** An array of provided operand values. Each entry corresponds to a * `sourceRegisters` entry. */ - std::array operands; + std::array sourceValues_; + + /** The immediate source operand for which there is only ever one. Remains 0 + * if unused. */ + int64_t sourceImm_ = 0; /** An array of generated output results. Each entry corresponds to a * `destinationRegisters` entry. */ - std::array results; + std::array results_; /** The current exception state of this instruction. */ InstructionException exception_ = InstructionException::None; - // Decoding - /** Process the instruction's metadata to determine source/destination - * registers. */ - void decode(); - - /** Invalidate instructions that are currently not yet implemented. This - prevents errors during speculated branches with unknown destinations; - non-executable assertions. memory is decoded into valid but not implemented - instructions tripping assertions. - TODO remove once all extensions are supported*/ - void invalidateIfNotImplemented(); - - // Scheduling - /** The number of operands that have not yet had values supplied. Used to - * determine execution readiness. */ - short operandsPending = 0; - - // Execution - /** Generate an ExecutionNotYetImplemented exception. */ - void executionNYI(); - - // Metadata - /** Is this a store operation? */ - bool isStore_ = false; - /** Is this a load operation? */ - bool isLoad_ = false; - /** Is this a branch operation? */ - bool isBranch_ = false; - /** Is this a multiply operation? */ - bool isMultiply_ = false; - /** Is this a divide operation? */ - bool isDivide_ = false; - /** Is this a shift operation? 
*/ - bool isShift_ = false; - /** Is this an atomic instruction? */ - bool isAtomic_ = false; - /** Is this a logical instruction? */ - bool isLogical_ = false; - /** Is this a compare instruction? */ - bool isCompare_ = false; - - // Memory - /** Set the accessed memory addresses, and create a corresponding memory data - * vector. */ - void setMemoryAddresses(const std::vector& addresses); - - /** The memory addresses this instruction accesses, as a vector of {offset, - * width} pairs. */ - std::vector memoryAddresses; + /** The number of source operands that have not yet had values supplied. Used + * to determine execution readiness. */ + uint16_t sourceOperandsPending_ = 0; - /** A vector of memory values, that were either loaded memory, or are prepared - * for sending to memory (according to instruction type). Each entry - * corresponds to a `memoryAddresses` entry. */ - std::vector memoryData; + /** Used to denote what type of instruction this is. Utilises the constants in + * the `InsnType` namespace allowing each bit to represent a unique + * identifier such as `isLoad` or `isMultiply` etc. */ + uint16_t instructionIdentifier_ = 0; }; } // namespace riscv diff --git a/src/include/simeng/arch/riscv/InstructionGroups.hh b/src/include/simeng/arch/riscv/InstructionGroups.hh index b26efbbcd8..098d534bdc 100644 --- a/src/include/simeng/arch/riscv/InstructionGroups.hh +++ b/src/include/simeng/arch/riscv/InstructionGroups.hh @@ -13,26 +13,51 @@ const uint16_t INT_SIMPLE_CMP = 3; const uint16_t INT_SIMPLE_LOGICAL = 4; const uint16_t INT_SIMPLE_SHIFT = 5; const uint16_t INT_MUL = 6; -const uint16_t INT_DIV = 7; +const uint16_t INT_DIV_OR_SQRT = 7; const uint16_t LOAD_INT = 8; const uint16_t STORE_INT = 9; -const uint16_t LOAD = 10; -const uint16_t STORE = 11; -const uint16_t BRANCH = 12; +const uint16_t FLOAT = 10; +const uint16_t FLOAT_SIMPLE = 11; +const uint16_t FLOAT_SIMPLE_ARTH = 12; +const uint16_t FLOAT_SIMPLE_CMP = 13; +const uint16_t FLOAT_SIMPLE_LOGICAL = 14; +const uint16_t FLOAT_SIMPLE_CVT = 15; +const uint16_t FLOAT_MUL = 16; +const uint16_t FLOAT_DIV_OR_SQRT = 17; +const uint16_t LOAD_FLOAT = 18; +const uint16_t STORE_FLOAT = 19; +const uint16_t LOAD = 20; +const uint16_t STORE = 21; +const uint16_t BRANCH = 22; +const uint16_t ALL = 23; +const uint16_t NONE = 24; } // namespace InstructionGroups -static constexpr uint8_t NUM_GROUPS = 13; +static constexpr uint8_t NUM_GROUPS = 25; -const std::unordered_map> groupInheritance = { +const std::unordered_map> groupInheritance_ = { + {InstructionGroups::ALL, + {InstructionGroups::INT, InstructionGroups::FLOAT, InstructionGroups::LOAD, + InstructionGroups::STORE, InstructionGroups::BRANCH}}, {InstructionGroups::INT, {InstructionGroups::INT_SIMPLE, InstructionGroups::INT_MUL, - InstructionGroups::INT_DIV}}, + InstructionGroups::INT_DIV_OR_SQRT}}, {InstructionGroups::INT_SIMPLE, {InstructionGroups::INT_SIMPLE_ARTH, InstructionGroups::INT_SIMPLE_CMP, InstructionGroups::INT_SIMPLE_LOGICAL, InstructionGroups::INT_SIMPLE_SHIFT}}, - {InstructionGroups::LOAD, {InstructionGroups::LOAD_INT}}, - {InstructionGroups::STORE, {InstructionGroups::STORE_INT}}}; + {InstructionGroups::LOAD, + {InstructionGroups::LOAD_INT, InstructionGroups::LOAD_FLOAT}}, + {InstructionGroups::STORE, + {InstructionGroups::STORE_INT, InstructionGroups::STORE_FLOAT}}, + {InstructionGroups::FLOAT, + {InstructionGroups::FLOAT_SIMPLE, InstructionGroups::FLOAT_MUL, + InstructionGroups::FLOAT_DIV_OR_SQRT}}, + {InstructionGroups::FLOAT_SIMPLE, + 
{InstructionGroups::FLOAT_SIMPLE_ARTH, + InstructionGroups::FLOAT_SIMPLE_LOGICAL, + InstructionGroups::FLOAT_SIMPLE_CMP, + InstructionGroups::FLOAT_SIMPLE_CVT}}}; } // namespace riscv } // namespace arch diff --git a/src/include/simeng/AlwaysNotTakenPredictor.hh b/src/include/simeng/branchpredictors/AlwaysNotTakenPredictor.hh similarity index 56% rename from src/include/simeng/AlwaysNotTakenPredictor.hh rename to src/include/simeng/branchpredictors/AlwaysNotTakenPredictor.hh index 7ec8027d4b..382a495420 100644 --- a/src/include/simeng/AlwaysNotTakenPredictor.hh +++ b/src/include/simeng/branchpredictors/AlwaysNotTakenPredictor.hh @@ -1,11 +1,11 @@ #pragma once -#include "simeng/BranchPredictor.hh" +#include "simeng/branchpredictors/BranchPredictor.hh" namespace simeng { -/** An "Always Not Taken" branch predictor; predicts all branches as not taken. - */ +/** An "Always Not Taken" branch predictor; predicts all branches as not + * taken. */ class AlwaysNotTakenPredictor : public BranchPredictor { public: /** Generate a branch prediction for the specified instruction address; will @@ -13,14 +13,18 @@ class AlwaysNotTakenPredictor : public BranchPredictor { BranchPrediction predict(uint64_t address, BranchType type, int64_t knownOffset) override; - /** Provide branch results to update the prediction model for the specified - * instruction address. As this model is static, this does nothing. */ - void update(uint64_t address, bool taken, uint64_t targetAddress, - BranchType type) override; + /** Updates appropriate predictor model objects based on the address, type and + * outcome of the branch instruction. Update must be called on + * branches in program order. To check this, instructionId is also passed + * to this function. */ + void update(uint64_t address, bool isTaken, uint64_t targetAddress, + BranchType type, uint64_t instructionId) override; /** Provide flush logic for branch prediction scheme. As there's no flush * logic for an always taken predictor, this does nothing. */ void flush(uint64_t address) override; + + private: }; } // namespace simeng diff --git a/src/include/simeng/branchpredictors/BranchPrediction.hh b/src/include/simeng/branchpredictors/BranchPrediction.hh new file mode 100644 index 0000000000..aac7de52ea --- /dev/null +++ b/src/include/simeng/branchpredictors/BranchPrediction.hh @@ -0,0 +1,37 @@ +#pragma once + +#include + +namespace simeng { + +/** The types of branches recognised. */ +enum class BranchType { + Conditional = 0, + LoopClosing, + Return, + SubroutineCall, + Unconditional, + Unknown +}; + +/** A branch result prediction for an instruction. */ +struct BranchPrediction { + /** Whether the branch will be taken. */ + bool isTaken; + + /** The branch instruction's target address. If `isTaken == false`, the value + * will be ignored. */ + uint64_t target; + + /** Check for equality of two branch predictions . */ + bool operator==(const BranchPrediction& other) { + return ((isTaken == other.isTaken) && (target == other.target)); + } + + /** Check for inequality of two branch predictions . 
*/ + bool operator!=(const BranchPrediction& other) { + return ((isTaken != other.isTaken) || (target != other.target)); + } +}; + +} // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/branchpredictors/BranchPredictor.hh b/src/include/simeng/branchpredictors/BranchPredictor.hh new file mode 100644 index 0000000000..7779fe0703 --- /dev/null +++ b/src/include/simeng/branchpredictors/BranchPredictor.hh @@ -0,0 +1,97 @@ +#pragma once + +#include +#include + +#include "simeng/Instruction.hh" +#include "simeng/branchpredictors/BranchPrediction.hh" +#include "simeng/pipeline/PipelineBuffer.hh" + +namespace simeng { + +/** An abstract branch predictor interface. */ +class BranchPredictor { + public: + virtual ~BranchPredictor(){}; + + /** Generate a branch prediction for the supplied instruction address, a + * branch type, and a known branch offset. Returns a branch direction and + * branch target address. */ + virtual BranchPrediction predict(uint64_t address, BranchType type, + int64_t knownOffset) = 0; + + /** Updates appropriate predictor model objects based on the address, type and + * outcome of the branch instruction. Update must be called on + * branches in program order. To check this, instructionId is also passed + * to this function. */ + virtual void update(uint64_t address, bool isTaken, uint64_t targetAddress, + BranchType type, uint64_t instructionId) = 0; + + /** Provides flushing behaviour for the implemented branch prediction schemes + * via the instruction address. Branches must be flushed in reverse + * program order (though, if a block of n instructions is being flushed at + * once, the exact order that the individual instructions within this block + * are flushed does not matter so long as they are all flushed). */ + virtual void flush(uint64_t address) = 0; + + /** + * Overloaded function for flushing branch instructions from a + * PipelineBuffer. Accepts a reference to a PipelineBuffer of microOps. + * Iterates over the entries of the PipelineBuffer and, if they are a + * branch instruction, flushes them. + */ + void flushBranchesInBufferFromSelf( + pipeline::PipelineBuffer>& buffer) { + for (size_t slot = 0; slot < buffer.getWidth(); slot++) { + auto& uop = buffer.getTailSlots()[slot]; + if (uop != nullptr && uop->isBranch()) { + flush(uop->getInstructionAddress()); + } + + uop = buffer.getHeadSlots()[slot]; + if (uop != nullptr && uop->isBranch()) { + flush(uop->getInstructionAddress()); + } + } + } + + /** + * Overloaded function for flushing branch instructions from a + * PipelineBuffer. Accepts a reference to a PipelineBuffer macroOps. + * Iterates over the entries of the PipelineBuffer and, if they are a + * branch instruction, flushes them. + */ + void flushBranchesInBufferFromSelf( + pipeline::PipelineBuffer>>& + buffer) { + for (size_t slot = 0; slot < buffer.getWidth(); slot++) { + auto& macroOp = buffer.getTailSlots()[slot]; + for (size_t uop = 0; uop < macroOp.size(); uop++) { + if (macroOp[uop]->isBranch()) { + flush(macroOp[uop]->getInstructionAddress()); + } + } + macroOp = buffer.getHeadSlots()[slot]; + for (size_t uop = 0; uop < macroOp.size(); uop++) { + if (macroOp[uop]->isBranch()) { + flush(macroOp[uop]->getInstructionAddress()); + } + } + } + } + + /** lastUpdatedInstructionId_ is used only in debug mode. Clang throws a + * warning (which becomes an error with our cmake flags) for unused + * variables. 
If the [[maybe_unused]] attribute is added to avoid this, + * then gcc throws a warning (which becomes an error) because it ignores + * this attribute. Therefore, to avoid the above catch 22, this variable is + * hidden behind an ifdef such that it is declared only in debug mode; when + * it is used. */ +#ifndef NDEBUG + /** The Id of the last instruction that update was called on -- used to + * ensure that update is called in program order. */ + uint64_t lastUpdatedInstructionId_ = 0; +#endif +}; + +} // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/branchpredictors/GenericPredictor.hh b/src/include/simeng/branchpredictors/GenericPredictor.hh new file mode 100644 index 0000000000..ae1aff6d05 --- /dev/null +++ b/src/include/simeng/branchpredictors/GenericPredictor.hh @@ -0,0 +1,89 @@ +#pragma once + +#include +#include +#include +#include + +#include "simeng/branchpredictors/BranchPredictor.hh" +#include "simeng/config/SimInfo.hh" + +namespace simeng { + +/** A generic branch predictor implementing well known/text book branch + * predictor logic. The following predictors have been included: + * + * - Static predictor based on pre-allocated branch type. + * + * - A Branch Target Buffer (BTB) with a local and global indexing scheme and a + * 2-bit saturating counter. + * + * - A Return Address Stack (RAS) is also in use. + */ + +class GenericPredictor : public BranchPredictor { + public: + /** Initialise predictor models. */ + GenericPredictor(ryml::ConstNodeRef config = config::SimInfo::getConfig()); + ~GenericPredictor(); + + /** Generate a branch prediction for the supplied instruction address, a + * branch type, and a known branch offset. Returns a branch direction and + * branch target address. */ + BranchPrediction predict(uint64_t address, BranchType type, + int64_t knownOffset) override; + + /** Updates appropriate predictor model objects based on the address, type and + * outcome of the branch instruction. Update must be called on + * branches in program order. To check this, instructionId is also passed + * to this function. */ + void update(uint64_t address, bool isTaken, uint64_t targetAddress, + BranchType type, uint64_t instructionId) override; + + /** Provides flushing behaviour for the implemented branch prediction schemes + * via the instruction address. Branches must be flushed in reverse + * program order (though, if a block of n instructions is being flushed at + * once, the exact order that the individual instructions within this block + * are flushed does not matter so long as they are all flushed). */ + void flush(uint64_t address) override; + + private: + /** The bitlength of the BTB index; BTB will have 2^bits entries. */ + uint8_t btbBits_; + + /** A 2^bits length vector of pairs containing a satCntBits_-bit saturating + * counter and a branch target. */ + std::vector> btb_; + + /** Fetch Target Queue containing the direction prediction and previous global + * history state of branches that are currently unresolved */ + std::deque> ftq_; + + /** The number of bits used to form the saturating counter in a BTB entry. */ + uint8_t satCntBits_; + + /** An n-bit history of previous branch directions where n is equal to + * globalHistoryLength_. Each bit represents a branch taken (1) or not + * taken (0), with the most recent branch being the least-significant-bit */ + uint64_t globalHistory_ = 0; + + /** The number of previous branch directions recorded globally. 
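+ * As an illustrative sketch only (the maintenance of the history lives in
+ * the corresponding .cc file and may differ), a resolved outcome would
+ * typically be shifted in and truncated to this length, e.g.
+ *   globalHistory_ = ((globalHistory_ << 1) | isTaken) & globalHistoryMask_;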
*/ + uint16_t globalHistoryLength_; + + /** A bit mask for truncating the global history to the correct size. + * Stored as a member variable to avoid duplicative calculation */ + uint64_t globalHistoryMask_; + + /** A return address stack. */ + std::deque ras_; + + /** RAS history with instruction address as the keys. A non-zero value + * represents the target prediction for a return instruction and a 0 entry for + * a branch-and-link instruction. */ + std::map rasHistory_; + + /** The size of the RAS. */ + uint16_t rasSize_; +}; + +} // namespace simeng diff --git a/src/include/simeng/branchpredictors/PerceptronPredictor.hh b/src/include/simeng/branchpredictors/PerceptronPredictor.hh new file mode 100644 index 0000000000..d9e05bca52 --- /dev/null +++ b/src/include/simeng/branchpredictors/PerceptronPredictor.hh @@ -0,0 +1,105 @@ +#pragma once + +#include +#include +#include +#include + +#include "simeng/branchpredictors/BranchPredictor.hh" +#include "simeng/config/SimInfo.hh" + +namespace simeng { + +/** A Perceptron branch predictor implementing the branch predictor described in + * Jimenez and Lin ("Dynamic branch prediction with perceptrons", IEEE High- + * Performance Computer Architecture Symposium Proceedings (2001), 197-206 -- + * https://www.cs.utexas.edu/~lin/papers/hpca01.pdf). + * The following predictors have been included: + * + * - Static predictor based on pre-allocated branch type. + * + * - A Branch Target Buffer (BTB) with a local and global indexing scheme and a + * perceptron. + * + * - A Return Address Stack (RAS) is also in use. + */ + +class PerceptronPredictor : public BranchPredictor { + public: + /** Initialise predictor models. */ + PerceptronPredictor(ryml::ConstNodeRef config = config::SimInfo::getConfig()); + ~PerceptronPredictor(); + + /** Generate a branch prediction for the supplied instruction address, a + * branch type, and a known branch offset. Returns a branch direction and + * branch target address. */ + BranchPrediction predict(uint64_t address, BranchType type, + int64_t knownOffset) override; + + /** Updates appropriate predictor model objects based on the address, type and + * outcome of the branch instruction. Update must be called on + * branches in program order. To check this, instructionId is also passed + * to this function. */ + void update(uint64_t address, bool isTaken, uint64_t targetAddress, + BranchType type, uint64_t instructionId) override; + + /** Provides flushing behaviour for the implemented branch prediction schemes + * via the instruction address. Branches must be flushed in reverse + * program order (though, if a block of n instructions is being flushed at + * once, the exact order that the individual instructions within this block + * are flushed does not matter so long as they are all flushed). */ + void flush(uint64_t address) override; + + private: + /** Returns the dot product of a perceptron and a history vector. Used to + * determine a direction prediction */ + int64_t getDotProduct(const std::vector& perceptron, + uint64_t history); + + /** The length in bits of the BTB index; BTB will have 2^bits entries. */ + uint64_t btbBits_; + + /** A 2^bits length vector of pairs containing a perceptron with + * globalHistoryLength_ + 1 inputs, and a branch target. 
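+ * As a hedged illustration only (the real arithmetic lives in
+ * getDotProduct() and may differ), the classic perceptron output is
+ *   y = w[0] + sum_i (h_i ? +w[i + 1] : -w[i + 1]),
+ * with the branch predicted taken when y is non-negative.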
+ * The perceptrons are used to provide a branch direction prediction by + * taking a dot product with the global history, as described + * in Jiminez and Lin */ + std::vector, uint64_t>> btb_; + + /** Fetch Target Queue containing the dot product of the perceptron and the + * global history; and the global history, both at the time of prediction, + * for each of the branch instructions that are currently unresolved. The dot + * product represents the confidence of the perceptrons direction + * prediction and is needed for a correct update when the branch + * instruction is resolved. */ + std::deque> ftq_; + + /** An n-bit history of previous branch directions where n is equal to + * globalHistoryLength_. Each bit represents a branch taken (1) or not + * taken (0), with the most recent branch being the least-significant-bit */ + uint64_t globalHistory_ = 0; + + /** The number of previous branch directions recorded globally. */ + uint64_t globalHistoryLength_; + + /** A bit mask for truncating the global history to the correct size. + * Stored as a member variable to avoid duplicative calculation */ + uint64_t globalHistoryMask_; + + /** The magnitude of the dot product of the perceptron and the global history, + * below which the perceptron's weight must be updated */ + uint64_t trainingThreshold_; + + /** A return address stack. */ + std::deque ras_; + + /** RAS history with instruction address as the keys. A non-zero value + * represents the target prediction for a return instruction and a 0 entry for + * a branch-and-link instruction. */ + std::map rasHistory_; + + /** The size of the RAS. */ + uint64_t rasSize_; +}; + +} // namespace simeng diff --git a/src/include/simeng/config/ExpectationNode.hh b/src/include/simeng/config/ExpectationNode.hh new file mode 100644 index 0000000000..187d3ed37a --- /dev/null +++ b/src/include/simeng/config/ExpectationNode.hh @@ -0,0 +1,562 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "simeng/config/yaml/ryml.hh" + +namespace simeng { +namespace config { + +/** An enum containing all supported data types that can be expected of a + * config option. + * NOTE: The index of the ExpectedType enum matches that of + * ExpectationNode::DataTypeVariant. + */ +enum class ExpectedType { + Bool, + Double, + Float, + Integer8, + Integer16, + Integer32, + Integer64, + String, + UInteger8, + UInteger16, + UInteger32, + UInteger64, + Valueless +}; + +/** The string used to represent a wildcard node as described in the comment + * above class ExpectationNode. */ +const std::string wildcard = "*"; + +/** A struct to hold whether the validation was valid and an accompanying + * outcome message. */ +struct ValidationResult { + bool valid; + std::string message; +}; + +/** A class to hold the expectations of a specific config option. Each instance + * is considered to be one node of a tree-like structure which maps onto the + * hierarchical YAML structure of the passed/generated config file. Each node + * can contain any number of children, each of which is another instance of the + * `ExpectationNode` class for another config option. The expectation placed on + * each config option can be defined as a type, a set of values to which it must + * belong, and a set of bounds it must lie between. A default value is also + * expected for the sake of default construction and generation of default + * config files. 
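+ * For instance, an expectation on a hypothetical "Fetch-Block-Size" option
+ * (the name, default, and bounds here are illustrative only) might be built
+ * as
+ *   auto n = ExpectationNode::createExpectation<uint16_t>(32, "Fetch-Block-Size");
+ *   n.setValueBounds<uint16_t>(4, 256);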
The values of such expectations are held within a + * `std::variant` which can hold one of the expected data types equivalent to + * that held in the `ExpectedType` enum. + * + * The notion of a wildcard node is implemented to allow for expectations to + * be placed on a set of nodes. Rather than specifing the specific node name + * string, a wildcard string variable can be used to denote any possible node + * name. An example is given below: + * + * |--["child_0"]-["value_node_A"] + * ["parent_node"]--|--["child_1"]-["value_node_B"] + * |--["child_2"]-["value_node_C"] + * + * can be represented by + * + * |--["value_node_A"] + * ["parent_node"]-[wildcard]--|--["value_node_B"] + * |--["value_node_C"] + */ +class ExpectationNode { + public: + /** NOTE: The index of the ExpectedType enum matches that of + * ExpectationNode::DataTypeVariant. + */ + using DataTypeVariant = + std::variant; + + /** A templated struct to store a boolean value denoting whether a passed + * typename T belongs to one types represented in ExpectedType. */ + template + struct is_expected_type + : std::integral_constant::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value> {}; + + /** A templated function to allow for the creation of an `ExpectationNode` + * instance. The instance created is one with a value and a key. A default + * value, key, type, and a bool denoting whether the node is optional are + * provided to the underlying constructor. */ + template + static ExpectationNode createExpectation(T defaultValue, std::string key, + bool optional = false) { + // Ensure templated type is of an expected type + static_assert(is_expected_type::value && + "[SimEng:ExpectationNode] Unexpected type given to " + "ExpectationNode::createExpectation"); + DataTypeVariant defValVariant = defaultValue; + ExpectedType type = static_cast(defValVariant.index()); + ExpectationNode node = ExpectationNode(key, type, optional); + node.setDefaultValue(defValVariant); + return node; + } + + /** A templated function to allow for the creation of an `ExpectationNode` + * instance with a key but no associated value. A key and a bool denoting + * whether the node is optional are provided to the underlying constructor. */ + static ExpectationNode createExpectation(std::string key, + bool optional = false) { + ExpectationNode node = + ExpectationNode(key, ExpectedType::Valueless, optional); + return node; + } + + /** Default constructor. Used primarily to provide a root node for populated + * ExpectationNode instances to be added to. */ + ExpectationNode(){}; + + ~ExpectationNode(){}; + + /** A getter function to retrieve the key of a node. */ + std::string getKey() const { return nodeKey_; } + + /** A setter function to create the hierarchy key of this node, prefixed by + * the key sent from a parent node. */ + void setHierarchyKey(std::string hKey) { + // If a non-blank key is passed, prefix this instances hierarchyKey_ with it + if (hKey != "") hierarchyKey_ = hKey + ":" + nodeKey_; + // Don't consider "INVALID" node keys when constructing a hierarchyKey_ + else if (nodeKey_ != "INVALID") + hierarchyKey_ = nodeKey_; + } + + /** A getter function to retrieve the held default value of a node. 
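+ * For example (hypothetical node), a uint16_t default supplied via
+ * createExpectation<uint16_t>(...) is read back with
+ *   uint16_t def = node.getDefault<uint16_t>();
+ * requesting a different type triggers the error handling in getByType().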
*/ + template + T getDefault() const { + return getByType(defaultValue_); + } + + /** A getter function to retrieve the value type of a node. */ + ExpectedType getType() const { return type_; } + + /** A getter function to retrieve the child ExpectationNode instances of this + * node. */ + const std::vector& getChildren() const { + return nodeChildren_; + } + + /** A getter function to retrieve whether the expectations should be applied + * to a sequence of config values. */ + bool isSequence() const { return isSequence_; } + + /** A getter function to retrieve whether the node is a wildcard. */ + bool isWildcard() const { return isWildcard_; } + + /** Setter function to set the expected bounds for this node's associated + * config option. */ + template + void setValueBounds(T lower, T upper) { + // Value type check + DataTypeVariant valCheck = lower; + if (valCheck.index() != static_cast(type_)) { + std::cerr + << "[SimEng:ExpectationNode] The data type of the passed " + "value bounds used in setValueBounds() does not match that held " + "within the ExpectationNode with key " + << hierarchyKey_ << ". Passed bounds are of type " + << typeToString(valCheck.index()) + << " and the expected type of this node is " + << typeToString(static_cast(type_)) << "." << std::endl; + exit(1); + } + // Ensure an expectation set hasn't already been defined for this node + if (definedSet_) { + std::cerr + << "[SimEng:ExpectationNode] Invalid call of setValueBounds() for " + "the ExpectationNode with key " + << hierarchyKey_ << " as a value set has already been defined." + << std::endl; + exit(1); + } + + definedBounds_ = true; + expectedBounds_.first = lower; + expectedBounds_.second = upper; + } + + /** Setter function to set the expected set of values for this node's + * associated config option. */ + template + void setValueSet(std::vector set) { + // Value type check + if (set.size()) { + T val = set[0]; + DataTypeVariant valCheck = val; + if (valCheck.index() != static_cast(type_)) { + std::cerr << "[SimEng:ExpectationNode] The data type of the passed " + "vector used in setValueSet() does not match that held " + "within the ExpectationNode with key " + << hierarchyKey_ << ". Passed vector elements are of type " + << typeToString(valCheck.index()) + << " and the expected type of this node is " + << typeToString(static_cast(type_)) << "." + << std::endl; + exit(1); + } + } + // Ensure expectation bounds haven't already been defined for this node + if (definedBounds_) { + std::cerr << "[SimEng:ExpectationNode] Invalid call of setValueSet() for " + "the ExpectationNode with key " + << hierarchyKey_ + << " as value bounds have already been defined." << std::endl; + exit(1); + } + + definedSet_ = true; + for (const T& s : set) { + DataTypeVariant dtv = s; + expectedSet_.push_back(dtv); + } + } + + /** A setter function which denotes this node's expectations should be applied + * to a sequence of config values. */ + void setAsSequence() { isSequence_ = true; } + + /** Add a child node to the vector of children within this node. 
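+ * A purely illustrative example of building a hierarchy (keys are
+ * hypothetical, not the shipped expectation tree):
+ *   ExpectationNode root;
+ *   root.addChild(ExpectationNode::createExpectation("Ports"));
+ *   root["Ports"].addChild(ExpectationNode::createExpectation(wildcard));
+ *   root["Ports"][wildcard].addChild(
+ *       ExpectationNode::createExpectation<std::string>("0", "Portname"));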
*/ + void addChild(ExpectationNode child) { + // Ensure that if the new child is a wildcard node, one does not already + // exist in this instance's children + if (child.getKey() == wildcard) { + if (hasWildcard_) { + std::cerr + << "[SimEng:ExpectationNode] Attempted to add multiple wildcard " + "nodes to the same ExpectationNode instance of key " + << hierarchyKey_ << std::endl; + exit(1); + } + hasWildcard_ = true; + } + nodeChildren_.push_back(child); + + // Set hierarchy key of child + nodeChildren_.back().setHierarchyKey(hierarchyKey_); + } + + /** An intermediary function which sets the expectations that the passed + * config option should be checked against. */ + ValidationResult validateConfigNode(ryml::NodeRef node) const { + // If the node is a wildcard, then only a key will exist in the validation + // check + if (isWildcard_) { + if (!node.has_key()) { + return {false, "has no key"}; + } + return {true, "Success"}; + } else { + // Continue to validate the passed config option based on the held + // expected type + switch (type_) { + case ExpectedType::Bool: + return validateConfigNodeWithType(node); + case ExpectedType::Double: + return validateConfigNodeWithType(node); + case ExpectedType::Float: + return validateConfigNodeWithType(node); + case ExpectedType::Integer8: + return validateConfigNodeWithType(node); + case ExpectedType::Integer16: + return validateConfigNodeWithType(node); + case ExpectedType::Integer32: + return validateConfigNodeWithType(node); + case ExpectedType::Integer64: + return validateConfigNodeWithType(node); + case ExpectedType::String: + return validateConfigNodeWithType(node); + case ExpectedType::UInteger8: + return validateConfigNodeWithType(node); + case ExpectedType::UInteger16: + return validateConfigNodeWithType(node); + case ExpectedType::UInteger32: + return validateConfigNodeWithType(node); + case ExpectedType::UInteger64: + return validateConfigNodeWithType(node); + case ExpectedType::Valueless: { + // If the node has no value, then only a key will exist in the + // validation check + if (!node.has_key() && !isOptional_) { + return {false, "has no key"}; + } + return {true, "Success"}; + } + default: + std::cerr << "[SimEng:validateConfigNode] Unexpected ExpectedType" + << std::endl; + + exit(-1); + } + } + } + + /** Search through the held children to find a node with the key `childKey`. + * If no `childKey` can be found, then it is considered to be fatal for the + * simulation. However, if a wildcard node is present within the children, + * then return said child. */ + ExpectationNode& operator[](std::string childKey) { + int wildcardIndex = -1; + // Search children for childKey and record latest wildcard children + for (size_t child = 0; child < nodeChildren_.size(); child++) { + if (nodeChildren_[child].getKey() == childKey) + return nodeChildren_[child]; + else if (nodeChildren_[child].getKey() == wildcard) + wildcardIndex = child; + } + + // If no child was found but a wildcard node exists, return the wildcard + // child node + if (wildcardIndex != -1) return nodeChildren_[wildcardIndex]; + + std::cerr << "[SimEng:ExpectationNode] Tried to access a config node that " + "does not exist, namely \"" + << childKey << "\" in parent node \"" << nodeKey_ << "\"" + << std::endl; + exit(1); + } + + private: + /** Constructor for ExpectationNode instances. 
*/ + ExpectationNode(std::string key, ExpectedType type, bool optional) + : nodeKey_(key), type_(type), isOptional_(optional) { + if (nodeKey_ == wildcard) isWildcard_ = true; + } + + /** A utility function for converting the type held in DataTypeVariant or the + * value of type_ into a string via an index. */ + std::string typeToString(size_t index) const { + switch (index) { + case static_cast(ExpectedType::Bool): + return "bool"; + case static_cast(ExpectedType::Double): + return "double"; + case static_cast(ExpectedType::Float): + return "float"; + case static_cast(ExpectedType::Integer8): + return "8-bit integer"; + case static_cast(ExpectedType::Integer16): + return "16-bit integer"; + case static_cast(ExpectedType::Integer32): + return "32-bit integer"; + case static_cast(ExpectedType::Integer64): + return "64-bit integer"; + case static_cast(ExpectedType::String): + return "string"; + case static_cast(ExpectedType::UInteger8): + return "8-bit unsigned integer"; + case static_cast(ExpectedType::UInteger16): + return "16-bit unsigned integer"; + case static_cast(ExpectedType::UInteger32): + return "32-bit unsigned integer"; + case static_cast(ExpectedType::UInteger64): + return "64-bit unsigned integer"; + } + return "unknown"; + } + + /** Setter function to set the default value for this node's associated config + * option. */ + void setDefaultValue(DataTypeVariant var) { + if (var.index() != static_cast(type_)) { + std::cerr + << "[SimEng:ExpectationNode] A DataTypeVariant used to set the " + "default value is not of type held within the ExpectationNode " + "with key " + << hierarchyKey_ << ". Variant holds a " << typeToString(var.index()) + << " and the expected type of this node is " + << typeToString(static_cast(type_)) << "." << std::endl; + exit(1); + } + defaultValue_ = var; + } + + /** A utility function used by the class to get a value from a `std::variant` + * with error handling if the passed type is not currently stored. */ + template + T getByType(const DataTypeVariant& variant) const { + // Ensure templated type is of an expected type + static_assert(is_expected_type::value && + "[SimEng:ExpectationNode] Unexpected type given to " + "ExpectationNode::getByType"); + + // Value existence check + if (variant.valueless_by_exception()) { + std::cerr << "[SimEng:ExpectationNode] No value in passed " + "DataTypeVariant within ExpectationNode with key " + << hierarchyKey_ << std::endl; + exit(1); + } + // Value type check + if (!std::holds_alternative(variant)) { + std::cerr << "[SimEng:ExpectationNode] A value of given type not held in " + "variant within ExpectationNode with key " + << hierarchyKey_ << ". Variant holds a " + << typeToString(variant.index()) + << " and the expected type of this node is " + << typeToString(static_cast(type_)) << "." << std::endl; + exit(1); + } + return std::get(variant); + } + + /** A function to validate a passed config option against held expectations. 
+ */ + template + ValidationResult validateConfigNodeWithType(ryml::NodeRef node) const { + // Value existence check + if (!node.has_val()) { + // If the node is optional, fill in the missing config + // value with held default value + if (isOptional_) { + // If the node is a sequence, add the default value to a new child + if (isSequence_) { + node |= ryml::SEQ; + node = node.append_child() << getByType(defaultValue_); + } else { + node << getDefault(); + } + } else { + return {false, "has no value"}; + } + } + + // Read as check + T nodeVal = node.as(); + + std::ostringstream retStr; + + if (definedSet_) { + // Check for value in set + bool foundInSet = false; + for (size_t i = 0; i < expectedSet_.size(); i++) { + if (getByType(expectedSet_[i]) == nodeVal) { + foundInSet = true; + break; + } + } + if (!foundInSet) { + // Construct a human-readable output explaining failure to match the + // expected value set + retStr << nodeVal << " not in set {"; + for (size_t i = 0; i < expectedSet_.size(); i++) { + retStr << getByType(expectedSet_[i]); + if (i < expectedSet_.size() - 1) retStr << ", "; + } + retStr << "}"; + return {false, retStr.str()}; + } + } + + if (definedBounds_) { + // Check for value between bounds + if (getByType(expectedBounds_.first) > nodeVal || + getByType(expectedBounds_.second) < nodeVal) { + // Construct a human-readable output explaining failure to be within + // expected bounds + retStr << nodeVal << " not in the bounds {" + << getByType(expectedBounds_.first) << " to " + << getByType(expectedBounds_.second) << "}"; + return {false, retStr.str()}; + } + } + + return {true, "Success"}; + } + + /** The key of this node used for indexing the tree-like ExpectationNode + * structure. */ + std::string nodeKey_ = "INVALID"; + + /** The cumulatively constructed key of all connected nodes which came before + * this instance. Primarily used for improved debugging when an errored + * ExceptionNode instance is encountered. */ + std::string hierarchyKey_ = ""; + + /** The expected value type this node places on it associated config option. + */ + ExpectedType type_ = ExpectedType::Valueless; + + /** Whether the config option associated with this node is optional. */ + bool isOptional_ = false; + + /** Whether the config option associated with this node is a sequence. A + * sequence is defined by a named config option having many values, for + * example the below Instruction-Group-Support option is a sequence, + * + * Instruction-Group-Support: + * - INT_SIMPLE + * - INT_MUL + * - STORE_DATA + * + * In this instance, the expectations set for the named node are applied to + * all the values lower in the YAML hierarchy rather than just a single value. + */ + bool isSequence_ = false; + + /** Whether this instance of ExpectationNode is a wildcard as described in the + * comment above class ExpectationNode. */ + bool isWildcard_ = false; + + /** Whether this instance of ExpectationNode has a child node which is a + * wildcard. Each parent node can only have one wildcard node in its children. + */ + bool hasWildcard_ = false; + + /** The default value for the associated config option. */ + DataTypeVariant defaultValue_; + + /** Whether a value set has been defined as part of the expectation for the + * associated config option. */ + bool definedSet_ = false; + + /** The set of values the associated config option is expected to belong to. + */ + std::vector expectedSet_; + + /** Whether a value bounds have been defined as part of the expectation for + * the associated config option. 
*/ + bool definedBounds_ = false; + + /** The value bounds the associated config option is expected to lie between. + */ + // TODO needs initialisation in case validation called before setting. Unsure + // whether this is a good solution + std::pair expectedBounds_ = {false, false}; + + /** The instances of ExpectationNodes held within this node. Considered to be + * the children of this node. */ + std::vector nodeChildren_; +}; + +} // namespace config +} // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/config/ModelConfig.hh b/src/include/simeng/config/ModelConfig.hh new file mode 100644 index 0000000000..8c18b7e0c9 --- /dev/null +++ b/src/include/simeng/config/ModelConfig.hh @@ -0,0 +1,125 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "simeng/config/ExpectationNode.hh" +#include "simeng/config/yaml/ryml.hh" +#include "simeng/version.hh" + +namespace simeng { +namespace config { + +/** Enum representing the possible ISAs. */ +enum class ISA { AArch64, RV64 }; + +/** A class to correctly validate and format the provided + * configuration YAML file. */ +class ModelConfig { + public: + /** Construct a ModelConfig class by reading in the YAML file and + * run it through validation and formatting. */ + ModelConfig(std::string path); + + /** Default constructor which creates a default configuration file. */ + ModelConfig(); + + /** A getter function to retrieve the validated and formatted config file. */ + ryml::Tree getConfig(); + + /** Re-generate the default config file based on the passed isa. A force bool + * is used to force the re-generation of the default config, regardless of + * previously set ISA. */ + void reGenerateDefault(ISA isa, bool force = false); + + /** Append/replace config options within the held config file. */ + void addConfigOptions(std::string config); + + private: + /** A utility function to pass configTree_ through validation checks and + * output any errors where necessary. */ + void validate(); + + /** A utility function to construct a default config file and pass it through + * validation and post-validation logic. */ + void generateDefault(); + + /** Construct a default config file within `configTree_` from the default + * value information held within `expectations`. A `root_id` is supplied as an + * index for adding new config options to the `configTree` ryml::Tree. */ + void constructDefault(ExpectationNode expectations, size_t root_id); + + /** A utility function to recursively iterate over the passed NodeRef and its + * children and add them to the held config file `configTree_`. A `id` is + * supplied as an index for adding new config options to the `configTree` + * ryml::Tree. */ + void recursiveAdd(ryml::NodeRef node, size_t id); + + /** Create the ExpectationNode tree-like structure `expectations_` which holds + * all expectations on the values of passed/created config files. */ + void setExpectations(bool isDefault = false); + + /** A utility function to recursively iterate over all instances of + * ExpectationNode in `expectations` and the values within the config file, + * calling ExpectationNode validate functionality on each associated config + * option. A `hierarchyString` is used for printouts concerning errored + * validation. 
*/ + void recursiveValidate(ExpectationNode expectation, ryml::NodeRef node, + std::string hierarchyString = ""); + + /** A set of formatting and checks performed on the config file after its + * validation is complete. */ + void postValidation(); + + /** From a pre-defined vector of instruction group strings, instantiate an + * ISA specific mapping between the instruction group strings and the + * relevant instruction group namespace numbers. */ + void createGroupMapping(); + + /** A representation of the YAML config file passed to the simulation or a + * config file constructed from pre-defined default values. */ + ryml::Tree configTree_; + + /** The ISA currently being simulated. Various config options rely on the + * knowledge of the ISA under simulation thus a variable is used to keep track + * of its value. */ + ISA isa_ = ISA::AArch64; + + /** Whether the config file was created from default values. */ + bool isDefault_ = true; + + /** The first node of the tree-like structure containing the expectations of + * all config options used within the simulation. */ + ExpectationNode expectations_ = ExpectationNode(); + + /** The ISA specific vector of instruction group strings for matching + * against user inputted groups. */ + std::vector groupOptions_; + + /** ISA specific mapping between the defined instruction strings and the + * instruction group variables. */ + std::unordered_map groupMapping_; + + /** A string stream containing information about missing config + * fields. */ + std::ostringstream missing_; + + /** A string stream containing information about invalid values. */ + std::ostringstream invalid_; + + /** The default special file directory. */ + std::string defaultSpecialFilePath_ = SIMENG_BUILD_DIR "/specialFiles/"; +}; // namespace ModelConfig + +} // namespace config +} // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/config/SimInfo.hh b/src/include/simeng/config/SimInfo.hh new file mode 100644 index 0000000000..7e6b6df9d7 --- /dev/null +++ b/src/include/simeng/config/SimInfo.hh @@ -0,0 +1,129 @@ +#pragma once + +#include +#include + +#include "simeng/Instruction.hh" +#include "simeng/RegisterFileSet.hh" +#include "simeng/arch/aarch64/ArchInfo.hh" +#include "simeng/arch/riscv/ArchInfo.hh" +#include "simeng/config/ModelConfig.hh" +#include "simeng/config/yaml/ryml.hh" + +#define DEFAULT_STR "Default" + +namespace simeng { +namespace config { + +/** Enum representing the possible simulation modes. */ +enum class SimulationMode { Emulation, InOrderPipelined, Outoforder }; + +/** A SimInfo class to hold values, specified by the constructed ryml::Tree + * object in the ModelConfig class and manually, used after the instantiation of + * the current simulation and its objects. */ +class SimInfo { + public: + /** A getter function to retrieve the ryml::Tree representing the underlying + * model config file. */ + static ryml::ConstNodeRef getConfig(); + + /** A setter function to set the model config file from a path to a YAML file. + */ + static void setConfig(std::string path); + + /** A function to add additional config values to the model config file. */ + static void addToConfig(std::string configAdditions); + + /** A function to generate a default config file based on a passed ISA. */ + static void generateDefault(ISA isa, bool force = false); + + /** A getter function to retrieve the config file path. */ + static std::string getConfigPath(); + + /** A getter function to retrieve the simulation mode of the current SimEng + * instance. 
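+ * Typical usage (the YAML path below is a placeholder):
+ *   simeng::config::SimInfo::setConfig("/path/to/a64fx.yaml");
+ *   if (simeng::config::SimInfo::getSimMode() ==
+ *       simeng::config::SimulationMode::Outoforder) { ... }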
*/ + static SimulationMode getSimMode(); + + /** A getter function to retrieve the simulation mode of the current SimEng + * instance as a string. */ + static std::string getSimModeStr(); + + /** A getter function to retrieve which ISA the current simulation is using. + */ + static ISA getISA(); + + /** A getter function to retrieve which ISA the current simulation is using in + * a string format. */ + static std::string getISAString(); + + /** A getter function to retrieve a vector of {size, number} pairs describing + * the available architectural registers. */ + static const std::vector& getArchRegStruct(); + + /** A getter function to retrieve a vector of {size, number} pairs describing + * the available physical registers. */ + static const std::vector& getPhysRegStruct(); + + /** A getter function to retrieve a vector of uint16_t values describing + * the quantities of physical registers available. */ + static const std::vector& getPhysRegQuantities(); + + /** A getter function to retrieve a vector of Capstone sysreg enums for + * all the system registers that should be utilised in simulation. */ + static const std::vector& getSysRegVec(); + + /** A getter function to retrieve whether or not the special files + * directories should be generated. */ + static bool getGenSpecFiles(); + + /** A utility function to rebuild/construct member variables/classes. For use + * if the configuration used changes during simulation (e.g. during the + * execution of a test suite). */ + static void reBuild(); + + private: + SimInfo(); + + /** Gets the static instance of the SimInfo class. */ + static std::unique_ptr& getInstance(); + + /** Create a model config from a passed YAML file path. */ + void makeConfig(std::string path); + + /** A function to extract various values from the generated config file to + * populate frequently queried model config values. */ + void extractValues(); + + /** The validated model config file represented as a ryml:Tree. */ + ryml::Tree validatedConfig_; + + /** The ModelConfig instance used to create and maintain the model config + * file. */ + ModelConfig modelConfig_; + + /** The path of the model config file. Defaults to "Default". */ + std::string configFilePath_ = DEFAULT_STR; + + /** The simulation mode of the current execution of SimEng. */ + SimulationMode mode_; + + /** The simulation mode string of the current execution of SimEng. */ + std::string modeStr_; + + /** The instruction set architecture of the current execution of SimEng. */ + ISA isa_; + + /** The instruction set architecture of the current execution of SimEng in a + * string format. */ + std::string isaString_; + + /** Instance of an ArchInfo class used to store architecture specific + * configuration options. */ + std::unique_ptr archInfo_; + + /** A bool representing if the special file directory should be created. 
*/ + bool genSpecialFiles_; +}; + +} // namespace config +} // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/config/yaml/.clang-format b/src/include/simeng/config/yaml/.clang-format new file mode 100644 index 0000000000..9d159247d5 --- /dev/null +++ b/src/include/simeng/config/yaml/.clang-format @@ -0,0 +1,2 @@ +DisableFormat: true +SortIncludes: false diff --git a/src/include/simeng/config/yaml/ryml.hh b/src/include/simeng/config/yaml/ryml.hh new file mode 100644 index 0000000000..bed8f4620b --- /dev/null +++ b/src/include/simeng/config/yaml/ryml.hh @@ -0,0 +1,33659 @@ +#ifndef _RYML_SINGLE_HEADER_AMALGAMATED_HPP_ +#define _RYML_SINGLE_HEADER_AMALGAMATED_HPP_ + +// +// Rapid YAML - a library to parse and emit YAML, and do it fast. +// +// https://github.com/biojppm/rapidyaml +// +// DO NOT EDIT. This file is generated automatically. +// This is an amalgamated single-header version of the library. +// +// INSTRUCTIONS: +// - Include at will in any header of your project +// - In one (and only one) of your project source files, +// #define RYML_SINGLE_HDR_DEFINE_NOW and then include this header. +// This will enable the function and class definitions in +// the header file. +// - To compile into a shared library, just define the +// preprocessor symbol RYML_SHARED . This will take +// care of symbol export/import. +// + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// LICENSE.txt +// https://github.com/biojppm/rapidyaml/LICENSE.txt +//-------------------------------------------------------------------------------- +//******************************************************************************** + +// Copyright (c) 2018, Joao Paulo Magalhaes +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included +// in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. 
+// + + // shared library: export when defining +#if defined(RYML_SHARED) && defined(RYML_SINGLE_HDR_DEFINE_NOW) && !defined(RYML_EXPORTS) +#define RYML_EXPORTS +#endif + + + // propagate defines to c4core +#if defined(RYML_SINGLE_HDR_DEFINE_NOW) && !defined(C4CORE_SINGLE_HDR_DEFINE_NOW) +#define C4CORE_SINGLE_HDR_DEFINE_NOW +#endif + +#if defined(RYML_EXPORTS) && !defined(C4CORE_EXPORTS) +#define C4CORE_EXPORTS +#endif + +#if defined(RYML_SHARED) && !defined(C4CORE_SHARED) +#define C4CORE_SHARED +#endif + +// workaround for include removal while amalgamating +// resulting in missing in arm-none-eabi-g++ +// https://github.com/biojppm/rapidyaml/issues/193 +#include + + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/c4core_all.hpp +// https://github.com/biojppm/rapidyaml/src/c4/c4core_all.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4CORE_SINGLE_HEADER_AMALGAMATED_HPP_ +#define _C4CORE_SINGLE_HEADER_AMALGAMATED_HPP_ + +// +// c4core - C++ utilities +// +// https://github.com/biojppm/c4core +// +// DO NOT EDIT. This file is generated automatically. +// This is an amalgamated single-header version of the library. +// +// INSTRUCTIONS: +// - Include at will in any header of your project +// - In one (and only one) of your project source files, +// #define C4CORE_SINGLE_HDR_DEFINE_NOW and then include this header. +// This will enable the function and class definitions in +// the header file. +// - To compile into a shared library, just define the +// preprocessor symbol C4CORE_SHARED . This will take +// care of symbol export/import. +// + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// LICENSE.txt +// https://github.com/biojppm/c4core/LICENSE.txt +//-------------------------------------------------------------------------------- +//******************************************************************************** + +// Copyright (c) 2018, Joao Paulo Magalhaes +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included +// in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. 
+// + +// shared library: export when defining +#if defined(C4CORE_SHARED) && defined(C4CORE_SINGLE_HDR_DEFINE_NOW) && !defined(C4CORE_EXPORTS) +#define C4CORE_EXPORTS +#endif + + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/export.hpp +// https://github.com/biojppm/c4core/src/c4/export.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef C4_EXPORT_HPP_ +#define C4_EXPORT_HPP_ + +#ifdef _WIN32 + #ifdef C4CORE_SHARED + #ifdef C4CORE_EXPORTS + #define C4CORE_EXPORT __declspec(dllexport) + #else + #define C4CORE_EXPORT __declspec(dllimport) + #endif + #else + #define C4CORE_EXPORT + #endif +#else + #define C4CORE_EXPORT +#endif + +#endif /* C4CORE_EXPORT_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/export.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/preprocessor.hpp +// https://github.com/biojppm/c4core/src/c4/preprocessor.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_PREPROCESSOR_HPP_ +#define _C4_PREPROCESSOR_HPP_ + +/** @file preprocessor.hpp Contains basic macros and preprocessor utilities. + * @ingroup basic_headers */ + +#ifdef __clang__ + /* NOTE: using , ## __VA_ARGS__ to deal with zero-args calls to + * variadic macros is not portable, but works in clang, gcc, msvc, icc. + * clang requires switching off compiler warnings for pedantic mode. + * @see http://stackoverflow.com/questions/32047685/variadic-macro-without-arguments */ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" // warning: token pasting of ',' and __VA_ARGS__ is a GNU extension +#elif defined(__GNUC__) + /* GCC also issues a warning for zero-args calls to variadic macros. + * This warning is switched on with -pedantic and apparently there is no + * easy way to turn it off as with clang. But marking this as a system + * header works. + * @see https://gcc.gnu.org/onlinedocs/cpp/System-Headers.html + * @see http://stackoverflow.com/questions/35587137/ */ +# pragma GCC system_header +#endif + +#define C4_WIDEN(str) L"" str + +#define C4_COUNTOF(arr) (sizeof(arr)/sizeof((arr)[0])) + +#define C4_EXPAND(arg) arg + +/** useful in some macro calls with template arguments */ +#define C4_COMMA , +/** useful in some macro calls with template arguments + * @see C4_COMMA */ +#define C4_COMMA_X C4_COMMA + +/** expand and quote */ +#define C4_XQUOTE(arg) _C4_XQUOTE(arg) +#define _C4_XQUOTE(arg) C4_QUOTE(arg) +#define C4_QUOTE(arg) #arg + +/** expand and concatenate */ +#define C4_XCAT(arg1, arg2) _C4_XCAT(arg1, arg2) +#define _C4_XCAT(arg1, arg2) C4_CAT(arg1, arg2) +#define C4_CAT(arg1, arg2) arg1##arg2 + +#define C4_VERSION_CAT(major, minor, patch) ((major)*10000 + (minor)*100 + (patch)) + +/** A preprocessor foreach. Spectacular trick taken from: + * http://stackoverflow.com/a/1872506/5875572 + * The first argument is for a macro receiving a single argument, + * which will be called with every subsequent argument. There is + * currently a limit of 32 arguments, and at least 1 must be provided. 
+ * +Example: +@code{.cpp} +struct Example { + int a; + int b; + int c; +}; +// define a one-arg macro to be called +#define PRN_STRUCT_OFFSETS(field) PRN_STRUCT_OFFSETS_(Example, field) +#define PRN_STRUCT_OFFSETS_(structure, field) printf(C4_XQUOTE(structure) ":" C4_XQUOTE(field)" - offset=%zu\n", offsetof(structure, field)); + +// now call the macro for a, b and c +C4_FOR_EACH(PRN_STRUCT_OFFSETS, a, b, c); +@endcode */ +#define C4_FOR_EACH(what, ...) C4_FOR_EACH_SEP(what, ;, __VA_ARGS__) + +/** same as C4_FOR_EACH(), but use a custom separator between statements. + * If a comma is needed as the separator, use the C4_COMMA macro. + * @see C4_FOR_EACH + * @see C4_COMMA + */ +#define C4_FOR_EACH_SEP(what, sep, ...) _C4_FOR_EACH_(_C4_FOR_EACH_NARG(__VA_ARGS__), what, sep, __VA_ARGS__) + +/// @cond dev + +#define _C4_FOR_EACH_01(what, sep, x) what(x) sep +#define _C4_FOR_EACH_02(what, sep, x, ...) what(x) sep _C4_FOR_EACH_01(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_03(what, sep, x, ...) what(x) sep _C4_FOR_EACH_02(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_04(what, sep, x, ...) what(x) sep _C4_FOR_EACH_03(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_05(what, sep, x, ...) what(x) sep _C4_FOR_EACH_04(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_06(what, sep, x, ...) what(x) sep _C4_FOR_EACH_05(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_07(what, sep, x, ...) what(x) sep _C4_FOR_EACH_06(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_08(what, sep, x, ...) what(x) sep _C4_FOR_EACH_07(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_09(what, sep, x, ...) what(x) sep _C4_FOR_EACH_08(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_10(what, sep, x, ...) what(x) sep _C4_FOR_EACH_09(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_11(what, sep, x, ...) what(x) sep _C4_FOR_EACH_10(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_12(what, sep, x, ...) what(x) sep _C4_FOR_EACH_11(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_13(what, sep, x, ...) what(x) sep _C4_FOR_EACH_12(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_14(what, sep, x, ...) what(x) sep _C4_FOR_EACH_13(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_15(what, sep, x, ...) what(x) sep _C4_FOR_EACH_14(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_16(what, sep, x, ...) what(x) sep _C4_FOR_EACH_15(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_17(what, sep, x, ...) what(x) sep _C4_FOR_EACH_16(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_18(what, sep, x, ...) what(x) sep _C4_FOR_EACH_17(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_19(what, sep, x, ...) what(x) sep _C4_FOR_EACH_18(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_20(what, sep, x, ...) what(x) sep _C4_FOR_EACH_19(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_21(what, sep, x, ...) what(x) sep _C4_FOR_EACH_20(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_22(what, sep, x, ...) what(x) sep _C4_FOR_EACH_21(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_23(what, sep, x, ...) what(x) sep _C4_FOR_EACH_22(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_24(what, sep, x, ...) what(x) sep _C4_FOR_EACH_23(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_25(what, sep, x, ...) what(x) sep _C4_FOR_EACH_24(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_26(what, sep, x, ...) what(x) sep _C4_FOR_EACH_25(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_27(what, sep, x, ...) what(x) sep _C4_FOR_EACH_26(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_28(what, sep, x, ...) what(x) sep _C4_FOR_EACH_27(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_29(what, sep, x, ...) 
what(x) sep _C4_FOR_EACH_28(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_30(what, sep, x, ...) what(x) sep _C4_FOR_EACH_29(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_31(what, sep, x, ...) what(x) sep _C4_FOR_EACH_30(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_32(what, sep, x, ...) what(x) sep _C4_FOR_EACH_31(what, sep, __VA_ARGS__) +#define _C4_FOR_EACH_NARG(...) _C4_FOR_EACH_NARG_(__VA_ARGS__, _C4_FOR_EACH_RSEQ_N()) +#define _C4_FOR_EACH_NARG_(...) _C4_FOR_EACH_ARG_N(__VA_ARGS__) +#define _C4_FOR_EACH_ARG_N(_01, _02, _03, _04, _05, _06, _07, _08, _09, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, N, ...) N +#define _C4_FOR_EACH_RSEQ_N() 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 09, 08, 07, 06, 05, 04, 03, 02, 01 +#define _C4_FOR_EACH_(N, what, sep, ...) C4_XCAT(_C4_FOR_EACH_, N)(what, sep, __VA_ARGS__) + +/// @endcond + +#ifdef __clang__ +# pragma clang diagnostic pop +#endif + +#endif /* _C4_PREPROCESSOR_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/preprocessor.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/platform.hpp +// https://github.com/biojppm/c4core/src/c4/platform.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_PLATFORM_HPP_ +#define _C4_PLATFORM_HPP_ + +/** @file platform.hpp Provides platform information macros + * @ingroup basic_headers */ + +// see also https://sourceforge.net/p/predef/wiki/OperatingSystems/ + +#if defined(_WIN64) +# define C4_WIN +# define C4_WIN64 +#elif defined(_WIN32) +# define C4_WIN +# define C4_WIN32 +#elif defined(__ANDROID__) +# define C4_ANDROID +#elif defined(__APPLE__) +# include "TargetConditionals.h" +# if TARGET_OS_IPHONE || TARGET_IPHONE_SIMULATOR +# define C4_IOS +# elif TARGET_OS_MAC || TARGET_OS_OSX +# define C4_MACOS +# else +# error "Unknown Apple platform" +# endif +#elif defined(__linux__) || defined(__linux) +# define C4_UNIX +# define C4_LINUX +#elif defined(__unix__) || defined(__unix) +# define C4_UNIX +#elif defined(__arm__) || defined(__aarch64__) +# define C4_ARM +#elif defined(SWIG) +# define C4_SWIG +#else +# error "unknown platform" +#endif + +#if defined(__posix) || defined(C4_UNIX) || defined(C4_LINUX) +# define C4_POSIX +#endif + + +#endif /* _C4_PLATFORM_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/platform.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/cpu.hpp +// https://github.com/biojppm/c4core/src/c4/cpu.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_CPU_HPP_ +#define _C4_CPU_HPP_ + +/** @file cpu.hpp Provides processor information macros + * @ingroup basic_headers */ + +// see also https://sourceforge.net/p/predef/wiki/Architectures/ +// see also https://sourceforge.net/p/predef/wiki/Endianness/ +// see also https://github.com/googlesamples/android-ndk/blob/android-mk/hello-jni/jni/hello-jni.c +// see http://code.qt.io/cgit/qt/qtbase.git/tree/src/corelib/global/qprocessordetection.h + +#ifdef 
__ORDER_LITTLE_ENDIAN__ + #define _C4EL __ORDER_LITTLE_ENDIAN__ +#else + #define _C4EL 1234 +#endif + +#ifdef __ORDER_BIG_ENDIAN__ + #define _C4EB __ORDER_BIG_ENDIAN__ +#else + #define _C4EB 4321 +#endif + +// mixed byte order (eg, PowerPC or ia64) +#define _C4EM 1111 + +#if defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64) + #define C4_CPU_X86_64 + #define C4_WORDSIZE 8 + #define C4_BYTE_ORDER _C4EL + +#elif defined(__i386) || defined(__i386__) || defined(_M_IX86) + #define C4_CPU_X86 + #define C4_WORDSIZE 4 + #define C4_BYTE_ORDER _C4EL + +#elif defined(__arm__) || defined(_M_ARM) \ + || defined(__TARGET_ARCH_ARM) || defined(__aarch64__) || defined(_M_ARM64) + #if defined(__aarch64__) || defined(_M_ARM64) + #define C4_CPU_ARM64 + #define C4_CPU_ARMV8 + #define C4_WORDSIZE 8 + #else + #define C4_CPU_ARM + #define C4_WORDSIZE 4 + #if defined(__ARM_ARCH_8__) || defined(__ARM_ARCH_8A__) \ + || (defined(__ARCH_ARM) && __ARCH_ARM >= 8) + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM >= 8) \ + #define C4_CPU_ARMV8 + #elif defined(__ARM_ARCH_7__) || defined(_ARM_ARCH_7) \ + || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) \ + || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) \ + || defined(__ARM_ARCH_7EM__) \ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM >= 7) \ + || (defined(_M_ARM) && _M_ARM >= 7) + #define C4_CPU_ARMV7 + #elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ + || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) \ + || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6ZK__) \ + || defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6KZ__) \ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM >= 6) + #define C4_CPU_ARMV6 + #elif defined(__ARM_ARCH_5TEJ__) \ + || defined(__ARM_ARCH_5TE__) \ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM >= 5) + #define C4_CPU_ARMV5 + #elif defined(__ARM_ARCH_4T__) \ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM >= 4) + #define C4_CPU_ARMV4 + #else + #error "unknown CPU architecture: ARM" + #endif + #endif + #if defined(__ARMEL__) || defined(__LITTLE_ENDIAN__) || defined(__AARCH64EL__) \ + || (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) \ + || defined(_MSC_VER) // winarm64 does not provide any of the above macros, + // but advises little-endianess: + // https://docs.microsoft.com/en-us/cpp/build/overview-of-arm-abi-conventions?view=msvc-170 + // So if it is visual studio compiling, we'll assume little endian. 
+ #define C4_BYTE_ORDER _C4EL + #elif defined(__ARMEB__) || defined(__BIG_ENDIAN__) || defined(__AARCH64EB__) \ + || (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)) + #define C4_BYTE_ORDER _C4EB + #elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_PDP_ENDIAN__) + #define C4_BYTE_ORDER _C4EM + #else + #error "unknown endianness" + #endif + +#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64) + #define C4_CPU_IA64 + #define C4_WORDSIZE 8 + #define C4_BYTE_ORDER _C4EM + // itanium is bi-endian - check byte order below + +#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \ + || defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \ + || defined(_M_MPPC) || defined(_M_PPC) + #if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__) + #define C4_CPU_PPC64 + #define C4_WORDSIZE 8 + #else + #define C4_CPU_PPC + #define C4_WORDSIZE 4 + #endif + #define C4_BYTE_ORDER _C4EM + // ppc is bi-endian - check byte order below + +#elif defined(__s390x__) || defined(__zarch__) || defined(__SYSC_ZARCH_) +# define C4_CPU_S390_X +# define C4_WORDSIZE 8 +# define C4_BYTE_ORDER _C4EB + +#elif defined(__riscv) + #if __riscv_xlen == 64 + #define C4_CPU_RISCV64 + #define C4_WORDSIZE 8 + #else + #define C4_CPU_RISCV32 + #define C4_WORDSIZE 4 + #endif + #define C4_BYTE_ORDER _C4EL + +#elif defined(__EMSCRIPTEN__) +# define C4_BYTE_ORDER _C4EL +# define C4_WORDSIZE 4 + +#elif defined(SWIG) + #error "please define CPU architecture macros when compiling with swig" + +#else + #error "unknown CPU architecture" +#endif + +#define C4_LITTLE_ENDIAN (C4_BYTE_ORDER == _C4EL) +#define C4_BIG_ENDIAN (C4_BYTE_ORDER == _C4EB) +#define C4_MIXED_ENDIAN (C4_BYTE_ORDER == _C4EM) + +#endif /* _C4_CPU_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/cpu.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/compiler.hpp +// https://github.com/biojppm/c4core/src/c4/compiler.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_COMPILER_HPP_ +#define _C4_COMPILER_HPP_ + +/** @file compiler.hpp Provides compiler information macros + * @ingroup basic_headers */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/platform.hpp +//#include "c4/platform.hpp" +#if !defined(C4_PLATFORM_HPP_) && !defined(_C4_PLATFORM_HPP_) +#error "amalgamate: file c4/platform.hpp must have been included at this point" +#endif /* C4_PLATFORM_HPP_ */ + + +// Compilers: +// C4_MSVC +// Visual Studio 2022: MSVC++ 17, 1930 +// Visual Studio 2019: MSVC++ 16, 1920 +// Visual Studio 2017: MSVC++ 15 +// Visual Studio 2015: MSVC++ 14 +// Visual Studio 2013: MSVC++ 13 +// Visual Studio 2013: MSVC++ 12 +// Visual Studio 2012: MSVC++ 11 +// Visual Studio 2010: MSVC++ 10 +// Visual Studio 2008: MSVC++ 09 +// Visual Studio 2005: MSVC++ 08 +// C4_CLANG +// C4_GCC +// C4_ICC (intel compiler) +/** @see http://sourceforge.net/p/predef/wiki/Compilers/ for a list of compiler identifier macros */ +/** @see https://msdn.microsoft.com/en-us/library/b0084kay.aspx for VS2013 predefined macros */ + +#if defined(_MSC_VER)// && (defined(C4_WIN) || defined(C4_XBOX) || defined(C4_UE4)) +# define C4_MSVC +# define C4_MSVC_VERSION_2022 17 +# define C4_MSVC_VERSION_2019 16 +# define C4_MSVC_VERSION_2017 15 +# 
define C4_MSVC_VERSION_2015 14 +# define C4_MSVC_VERSION_2013 12 +# define C4_MSVC_VERSION_2012 11 +# if _MSC_VER >= 1930 +# define C4_MSVC_VERSION C4_MSVC_VERSION_2022 // visual studio 2022 +# define C4_MSVC_2022 +# elif _MSC_VER >= 1920 +# define C4_MSVC_VERSION C_4MSVC_VERSION_2019 // visual studio 2019 +# define C4_MSVC_2019 +# elif _MSC_VER >= 1910 +# define C4_MSVC_VERSION C4_MSVC_VERSION_2017 // visual studio 2017 +# define C4_MSVC_2017 +# elif _MSC_VER == 1900 +# define C4_MSVC_VERSION C4_MSVC_VERSION_2015 // visual studio 2015 +# define C4_MSVC_2015 +# elif _MSC_VER == 1800 +# error "MSVC version not supported" +# define C4_MSVC_VERSION C4_MSVC_VERSION_2013 // visual studio 2013 +# define C4_MSVC_2013 +# elif _MSC_VER == 1700 +# error "MSVC version not supported" +# define C4_MSVC_VERSION C4_MSVC_VERSION_2012 // visual studio 2012 +# define C4_MSVC_2012 +# elif _MSC_VER == 1600 +# error "MSVC version not supported" +# define C4_MSVC_VERSION 10 // visual studio 2010 +# define C4_MSVC_2010 +# elif _MSC_VER == 1500 +# error "MSVC version not supported" +# define C4_MSVC_VERSION 09 // visual studio 2008 +# define C4_MSVC_2008 +# elif _MSC_VER == 1400 +# error "MSVC version not supported" +# define C4_MSVC_VERSION 08 // visual studio 2005 +# define C4_MSVC_2005 +# else +# error "MSVC version not supported" +# endif // _MSC_VER +#else +# define C4_MSVC_VERSION 0 // visual studio not present +# define C4_GCC_LIKE +# ifdef __INTEL_COMPILER // check ICC before checking GCC, as ICC defines __GNUC__ too +# define C4_ICC +# define C4_ICC_VERSION __INTEL_COMPILER +# elif defined(__APPLE_CC__) +# define C4_XCODE +# if defined(__clang__) +# define C4_CLANG +# ifndef __apple_build_version__ +# define C4_CLANG_VERSION C4_VERSION_ENCODED(__clang_major__, __clang_minor__, __clang_patchlevel__) +# else +# define C4_CLANG_VERSION __apple_build_version__ +# endif +# else +# define C4_XCODE_VERSION __APPLE_CC__ +# endif +# elif defined(__clang__) +# define C4_CLANG +# ifndef __apple_build_version__ +# define C4_CLANG_VERSION C4_VERSION_ENCODED(__clang_major__, __clang_minor__, __clang_patchlevel__) +# else +# define C4_CLANG_VERSION __apple_build_version__ +# endif +# elif defined(__GNUC__) +# define C4_GCC +# if defined(__GNUC_PATCHLEVEL__) +# define C4_GCC_VERSION C4_VERSION_ENCODED(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__) +# else +# define C4_GCC_VERSION C4_VERSION_ENCODED(__GNUC__, __GNUC_MINOR__, 0) +# endif +# if __GNUC__ < 5 +# if __GNUC__ == 4 && __GNUC_MINOR__ >= 8 +// provided by cmake sub-project +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/gcc-4.8.hpp +//# include "c4/gcc-4.8.hpp" +#if !defined(C4_GCC_4_8_HPP_) && !defined(_C4_GCC_4_8_HPP_) +#error "amalgamate: file c4/gcc-4.8.hpp must have been included at this point" +#endif /* C4_GCC_4_8_HPP_ */ + +# else +// we do not support GCC < 4.8: +// * misses std::is_trivially_copyable +// * misses std::align +// * -Wshadow has false positives when a local function parameter has the same name as a method +# error "GCC < 4.8 is not supported" +# endif +# endif +# endif +#endif // defined(C4_WIN) && defined(_MSC_VER) + +#endif /* _C4_COMPILER_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/compiler.hpp) + +// these includes are needed to work around conditional +// includes in the gcc4.8 shim +#include +#include +#include + + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- 
+// cmake/compat/c4/gcc-4.8.hpp +// https://github.com/biojppm/c4core/cmake/compat/c4/gcc-4.8.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_COMPAT_GCC_4_8_HPP_ +#define _C4_COMPAT_GCC_4_8_HPP_ + +#if __GNUC__ == 4 && __GNUC_MINOR__ >= 8 +/* STL polyfills for old GNU compilers */ + +_Pragma("GCC diagnostic ignored \"-Wshadow\"") +_Pragma("GCC diagnostic ignored \"-Wmissing-field-initializers\"") + +#if __cplusplus +//included above: +//#include +//included above: +//#include + +namespace std { + +template +struct is_trivially_copyable : public integral_constant::value && __has_trivial_destructor(_Tp) && + (__has_trivial_constructor(_Tp) || __has_trivial_copy(_Tp) || __has_trivial_assign(_Tp))> +{ }; + +template +using is_trivially_copy_constructible = has_trivial_copy_constructor<_Tp>; + +template +using is_trivially_default_constructible = has_trivial_default_constructor<_Tp>; + +template +using is_trivially_copy_assignable = has_trivial_copy_assign<_Tp>; + +/* not supported */ +template +struct is_trivially_move_constructible : false_type +{ }; + +/* not supported */ +template +struct is_trivially_move_assignable : false_type +{ }; + +inline void *align(size_t __align, size_t __size, void*& __ptr, size_t& __space) noexcept +{ + if (__space < __size) + return nullptr; + const auto __intptr = reinterpret_cast(__ptr); + const auto __aligned = (__intptr - 1u + __align) & -__align; + const auto __diff = __aligned - __intptr; + if (__diff > (__space - __size)) + return nullptr; + else + { + __space -= __diff; + return __ptr = reinterpret_cast(__aligned); + } +} +typedef long double max_align_t ; + +} +#else // __cplusplus + +//included above: +//#include +// see https://sourceware.org/bugzilla/show_bug.cgi?id=25399 (ubuntu gcc-4.8) +#define memset(s, c, count) __builtin_memset(s, c, count) + +#endif // __cplusplus + +#endif // __GNUC__ == 4 && __GNUC_MINOR__ >= 8 + +#endif // _C4_COMPAT_GCC_4_8_HPP_ + + +// (end https://github.com/biojppm/c4core/cmake/compat/c4/gcc-4.8.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/language.hpp +// https://github.com/biojppm/c4core/src/c4/language.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_LANGUAGE_HPP_ +#define _C4_LANGUAGE_HPP_ + +/** @file language.hpp Provides language standard information macros and + * compiler agnostic utility macros: namespace facilities, function attributes, + * variable attributes, etc. + * @ingroup basic_headers */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/preprocessor.hpp +//#include "c4/preprocessor.hpp" +#if !defined(C4_PREPROCESSOR_HPP_) && !defined(_C4_PREPROCESSOR_HPP_) +#error "amalgamate: file c4/preprocessor.hpp must have been included at this point" +#endif /* C4_PREPROCESSOR_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/compiler.hpp +//#include "c4/compiler.hpp" +#if !defined(C4_COMPILER_HPP_) && !defined(_C4_COMPILER_HPP_) +#error "amalgamate: file c4/compiler.hpp must have been included at this point" +#endif /* C4_COMPILER_HPP_ */ + + +/* Detect C++ standard. 
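+ * A minimal usage sketch (illustrative only): the detection below yields C4_CPP
+ * (11/14/17/20) plus cumulative C4_CPP11/C4_CPP14/C4_CPP17/C4_CPP20 macros, so
+ * client code can branch like:
+ *   #if defined(C4_CPP17)
+ *   // C++17-only path
+ *   #endif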
+ * @see http://stackoverflow.com/a/7132549/5875572 */ +#ifndef C4_CPP +# ifdef _MSC_VER +# if _MSC_VER >= 1910 // >VS2015: VS2017, VS2019 +# if (!defined(_MSVC_LANG)) +# error _MSVC not defined +# endif +# if _MSVC_LANG >= 201705L +# define C4_CPP 20 +# define C4_CPP20 +# elif _MSVC_LANG == 201703L +# define C4_CPP 17 +# define C4_CPP17 +# elif _MSVC_LANG >= 201402L +# define C4_CPP 14 +# define C4_CPP14 +# elif _MSVC_LANG >= 201103L +# define C4_CPP 11 +# define C4_CPP11 +# else +# error C++ lesser than C++11 not supported +# endif +# else +# if _MSC_VER == 1900 +# define C4_CPP 14 // VS2015 is c++14 https://devblogs.microsoft.com/cppblog/c111417-features-in-vs-2015-rtm/ +# define C4_CPP14 +# elif _MSC_VER == 1800 // VS2013 +# define C4_CPP 11 +# define C4_CPP11 +# else +# error C++ lesser than C++11 not supported +# endif +# endif +# elif defined(__INTEL_COMPILER) // https://software.intel.com/en-us/node/524490 +# ifdef __INTEL_CXX20_MODE__ // not sure about this +# define C4_CPP 20 +# define C4_CPP20 +# elif defined __INTEL_CXX17_MODE__ // not sure about this +# define C4_CPP 17 +# define C4_CPP17 +# elif defined __INTEL_CXX14_MODE__ // not sure about this +# define C4_CPP 14 +# define C4_CPP14 +# elif defined __INTEL_CXX11_MODE__ +# define C4_CPP 11 +# define C4_CPP11 +# else +# error C++ lesser than C++11 not supported +# endif +# else +# ifndef __cplusplus +# error __cplusplus is not defined? +# endif +# if __cplusplus == 1 +# error cannot handle __cplusplus==1 +# elif __cplusplus >= 201709L +# define C4_CPP 20 +# define C4_CPP20 +# elif __cplusplus >= 201703L +# define C4_CPP 17 +# define C4_CPP17 +# elif __cplusplus >= 201402L +# define C4_CPP 14 +# define C4_CPP14 +# elif __cplusplus >= 201103L +# define C4_CPP 11 +# define C4_CPP11 +# elif __cplusplus >= 199711L +# error C++ lesser than C++11 not supported +# endif +# endif +#else +# ifdef C4_CPP == 20 +# define C4_CPP20 +# elif C4_CPP == 17 +# define C4_CPP17 +# elif C4_CPP == 14 +# define C4_CPP14 +# elif C4_CPP == 11 +# define C4_CPP11 +# elif C4_CPP == 98 +# define C4_CPP98 +# error C++ lesser than C++11 not supported +# else +# error C4_CPP must be one of 20, 17, 14, 11, 98 +# endif +#endif + +#ifdef C4_CPP20 +# define C4_CPP17 +# define C4_CPP14 +# define C4_CPP11 +#elif defined(C4_CPP17) +# define C4_CPP14 +# define C4_CPP11 +#elif defined(C4_CPP14) +# define C4_CPP11 +#endif + +/** lifted from this answer: http://stackoverflow.com/a/20170989/5875572 */ +#ifndef _MSC_VER +# if __cplusplus < 201103 +# define C4_CONSTEXPR11 +# define C4_CONSTEXPR14 +//# define C4_NOEXCEPT +# elif __cplusplus == 201103 +# define C4_CONSTEXPR11 constexpr +# define C4_CONSTEXPR14 +//# define C4_NOEXCEPT noexcept +# else +# define C4_CONSTEXPR11 constexpr +# define C4_CONSTEXPR14 constexpr +//# define C4_NOEXCEPT noexcept +# endif +#else // _MSC_VER +# if _MSC_VER < 1900 +# define C4_CONSTEXPR11 +# define C4_CONSTEXPR14 +//# define C4_NOEXCEPT +# elif _MSC_VER < 2000 +# define C4_CONSTEXPR11 constexpr +# define C4_CONSTEXPR14 +//# define C4_NOEXCEPT noexcept +# else +# define C4_CONSTEXPR11 constexpr +# define C4_CONSTEXPR14 constexpr +//# define C4_NOEXCEPT noexcept +# endif +#endif // _MSC_VER + + +#if C4_CPP < 17 +#define C4_IF_CONSTEXPR +#define C4_INLINE_CONSTEXPR constexpr +#else +#define C4_IF_CONSTEXPR constexpr +#define C4_INLINE_CONSTEXPR inline constexpr +#endif + + +//------------------------------------------------------------ + +#define _C4_BEGIN_NAMESPACE(ns) namespace ns { +#define _C4_END_NAMESPACE(ns) } + +// MSVC cant handle 
the C4_FOR_EACH macro... need to fix this +//#define C4_BEGIN_NAMESPACE(...) C4_FOR_EACH_SEP(_C4_BEGIN_NAMESPACE, , __VA_ARGS__) +//#define C4_END_NAMESPACE(...) C4_FOR_EACH_SEP(_C4_END_NAMESPACE, , __VA_ARGS__) +#define C4_BEGIN_NAMESPACE(ns) namespace ns { +#define C4_END_NAMESPACE(ns) } + +#define C4_BEGIN_HIDDEN_NAMESPACE namespace /*hidden*/ { +#define C4_END_HIDDEN_NAMESPACE } /* namespace hidden */ + +//------------------------------------------------------------ + +#ifndef C4_API +# if defined(_MSC_VER) +# if defined(C4_EXPORT) +# define C4_API __declspec(dllexport) +# elif defined(C4_IMPORT) +# define C4_API __declspec(dllimport) +# else +# define C4_API +# endif +# else +# define C4_API +# endif +#endif + +#ifndef _MSC_VER ///< @todo assuming gcc-like compiler. check it is actually so. +/** for function attributes in GCC, + * @see https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#Common-Function-Attributes */ +/** for __builtin functions in GCC, + * @see https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html */ +# define C4_RESTRICT __restrict__ +# define C4_RESTRICT_FN __attribute__((restrict)) +# define C4_NO_INLINE __attribute__((noinline)) +# define C4_ALWAYS_INLINE inline __attribute__((always_inline)) +# define C4_CONST __attribute__((const)) +# define C4_PURE __attribute__((pure)) +/** force inlining of every callee function */ +# define C4_FLATTEN __atribute__((flatten)) +/** mark a function as hot, ie as having a visible impact in CPU time + * thus making it more likely to inline, etc + * @see http://stackoverflow.com/questions/15028990/semantics-of-gcc-hot-attribute */ +# define C4_HOT __attribute__((hot)) +/** mark a function as cold, ie as NOT having a visible impact in CPU time + * @see http://stackoverflow.com/questions/15028990/semantics-of-gcc-hot-attribute */ +# define C4_COLD __attribute__((cold)) +# define C4_EXPECT(x, y) __builtin_expect(x, y) ///< @see https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html +# define C4_LIKELY(x) __builtin_expect(x, 1) +# define C4_UNLIKELY(x) __builtin_expect(x, 0) +# define C4_UNREACHABLE() __builtin_unreachable() +# define C4_ATTR_FORMAT(...) //__attribute__((format (__VA_ARGS__))) ///< @see https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#Common-Function-Attributes +# define C4_NORETURN __attribute__((noreturn)) +#else +# define C4_RESTRICT __restrict +# define C4_RESTRICT_FN __declspec(restrict) +# define C4_NO_INLINE __declspec(noinline) +# define C4_ALWAYS_INLINE inline __forceinline +/** these are not available in VS AFAIK */ +# define C4_CONST +# define C4_PURE +# define C4_FLATTEN +# define C4_HOT /** @todo */ +# define C4_COLD /** @todo */ +# define C4_EXPECT(x, y) x /** @todo */ +# define C4_LIKELY(x) x /** @todo */ +# define C4_UNLIKELY(x) x /** @todo */ +# define C4_UNREACHABLE() /** @todo */ +# define C4_ATTR_FORMAT(...) /** */ +# define C4_NORETURN /** @todo */ +#endif + +#ifndef _MSC_VER +# define C4_FUNC __FUNCTION__ +# define C4_PRETTY_FUNC __PRETTY_FUNCTION__ +#else /// @todo assuming gcc-like compiler. check it is actually so. 
+# define C4_FUNC __FUNCTION__ +# define C4_PRETTY_FUNC __FUNCSIG__ +#endif + +/** prevent compiler warnings about a specific var being unused */ +#define C4_UNUSED(var) (void)var + +#if C4_CPP >= 17 +#define C4_STATIC_ASSERT(cond) static_assert(cond) +#else +#define C4_STATIC_ASSERT(cond) static_assert((cond), #cond) +#endif +#define C4_STATIC_ASSERT_MSG(cond, msg) static_assert((cond), #cond ": " msg) + +/** @def C4_DONT_OPTIMIZE idea lifted from GoogleBenchmark. + * @see https://github.com/google/benchmark/blob/master/include/benchmark/benchmark_api.h */ +namespace c4 { +namespace detail { +#ifdef __GNUC__ +# define C4_DONT_OPTIMIZE(var) c4::detail::dont_optimize(var) +template< class T > +C4_ALWAYS_INLINE void dont_optimize(T const& value) { asm volatile("" : : "g"(value) : "memory"); } +#else +# define C4_DONT_OPTIMIZE(var) c4::detail::use_char_pointer(reinterpret_cast< const char* >(&var)) +void use_char_pointer(char const volatile*); +#endif +} // namespace detail +} // namespace c4 + +/** @def C4_KEEP_EMPTY_LOOP prevent an empty loop from being optimized out. + * @see http://stackoverflow.com/a/7084193/5875572 */ +#ifndef _MSC_VER +# define C4_KEEP_EMPTY_LOOP { asm(""); } +#else +# define C4_KEEP_EMPTY_LOOP { char c; C4_DONT_OPTIMIZE(c); } +#endif + +/** @def C4_VA_LIST_REUSE_MUST_COPY + * @todo I strongly suspect that this is actually only in UNIX platforms. revisit this. */ +#ifdef __GNUC__ +# define C4_VA_LIST_REUSE_MUST_COPY +#endif + +#endif /* _C4_LANGUAGE_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/language.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/types.hpp +// https://github.com/biojppm/c4core/src/c4/types.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_TYPES_HPP_ +#define _C4_TYPES_HPP_ + +//included above: +//#include +#include +//included above: +//#include + +#if __cplusplus >= 201103L +#include // for integer_sequence and friends +#endif + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/preprocessor.hpp +//#include "c4/preprocessor.hpp" +#if !defined(C4_PREPROCESSOR_HPP_) && !defined(_C4_PREPROCESSOR_HPP_) +#error "amalgamate: file c4/preprocessor.hpp must have been included at this point" +#endif /* C4_PREPROCESSOR_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/language.hpp +//#include "c4/language.hpp" +#if !defined(C4_LANGUAGE_HPP_) && !defined(_C4_LANGUAGE_HPP_) +#error "amalgamate: file c4/language.hpp must have been included at this point" +#endif /* C4_LANGUAGE_HPP_ */ + + +/** @file types.hpp basic types, and utility macros and traits for types. 
+ * @ingroup basic_headers */ + +/** @defgroup types Type utilities */ + +namespace c4 { + +/** @defgroup intrinsic_types Intrinsic types + * @ingroup types + * @{ */ + +using cbyte = const char; /**< a constant byte */ +using byte = char; /**< a mutable byte */ + +using i8 = int8_t; +using i16 = int16_t; +using i32 = int32_t; +using i64 = int64_t; +using u8 = uint8_t; +using u16 = uint16_t; +using u32 = uint32_t; +using u64 = uint64_t; + +using f32 = float; +using f64 = double; + +using ssize_t = typename std::make_signed::type; + +/** @} */ + +//-------------------------------------------------- + +/** @defgroup utility_types Utility types + * @ingroup types + * @{ */ + +// some tag types + +/** a tag type for initializing the containers with variadic arguments a la + * initializer_list, minus the initializer_list overload problems. + */ +struct aggregate_t {}; +/** @see aggregate_t */ +constexpr const aggregate_t aggregate{}; + +/** a tag type for specifying the initial capacity of allocatable contiguous storage */ +struct with_capacity_t {}; +/** @see with_capacity_t */ +constexpr const with_capacity_t with_capacity{}; + +/** a tag type for disambiguating template parameter packs in variadic template overloads */ +struct varargs_t {}; +/** @see with_capacity_t */ +constexpr const varargs_t varargs{}; + + +//-------------------------------------------------- + +/** whether a value should be used in place of a const-reference in argument passing. */ +template +struct cref_uses_val +{ + enum { value = ( + std::is_scalar::value + || + ( +#if C4_CPP >= 20 + (std::is_trivially_copyable::value && std::is_standard_layout::value) +#else + std::is_pod::value +#endif + && + sizeof(T) <= sizeof(size_t))) }; +}; +/** utility macro to override the default behaviour for c4::fastcref + @see fastcref */ +#define C4_CREF_USES_VAL(T) \ +template<> \ +struct cref_uses_val \ +{ \ + enum { value = true }; \ +}; + +/** Whether to use pass-by-value or pass-by-const-reference in a function argument + * or return type. */ +template +using fastcref = typename std::conditional::value, T, T const&>::type; + +//-------------------------------------------------- + +/** Just what its name says. Useful sometimes as a default empty policy class. */ +struct EmptyStruct +{ + template EmptyStruct(T && ...){} +}; + +/** Just what its name says. Useful sometimes as a default policy class to + * be inherited from. */ +struct EmptyStructVirtual +{ + virtual ~EmptyStructVirtual() = default; + template EmptyStructVirtual(T && ...){} +}; + + +/** */ +template +struct inheritfrom : public T {}; + +//-------------------------------------------------- +// Utilities to make a class obey size restrictions (eg, min size or size multiple of). +// DirectX usually makes this restriction with uniform buffers. +// This is also useful for padding to prevent false-sharing. + +/** how many bytes must be added to size such that the result is at least minsize? */ +C4_ALWAYS_INLINE constexpr size_t min_remainder(size_t size, size_t minsize) noexcept +{ + return size < minsize ? minsize-size : 0; +} + +/** how many bytes must be added to size such that the result is a multiple of multipleof? */ +C4_ALWAYS_INLINE constexpr size_t mult_remainder(size_t size, size_t multipleof) noexcept +{ + return (((size % multipleof) != 0) ? (multipleof-(size % multipleof)) : 0); +} + +/* force the following class to be tightly packed. */ +#pragma pack(push, 1) +/** pad a class with more bytes at the end. 
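+ * A minimal usage sketch (MyPod is a hypothetical example type):
+ * @code
+ * struct MyPod { int a; char b; };
+ * // append 3 bytes of trailing padding:
+ * using PaddedPod = c4::Padded<MyPod, 3>;
+ * // size a type for (eg) a uniform buffer: at least 64B and a multiple of 16B:
+ * using UbufPod = c4::MinMultSized<MyPod, 64, 16>;
+ * @endcode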
+ * @see http://stackoverflow.com/questions/21092415/force-c-structure-to-pack-tightly */ +template +struct Padded : public T +{ + using T::T; + using T::operator=; + Padded(T const& val) : T(val) {} + Padded(T && val) : T(val) {} + char ___c4padspace___[BytesToPadAtEnd]; +}; +#pragma pack(pop) +/** When the padding argument is 0, we cannot declare the char[] array. */ +template +struct Padded : public T +{ + using T::T; + using T::operator=; + Padded(T const& val) : T(val) {} + Padded(T && val) : T(val) {} +}; + +/** make T have a size which is at least Min bytes */ +template +using MinSized = Padded; + +/** make T have a size which is a multiple of Mult bytes */ +template +using MultSized = Padded; + +/** make T have a size which is simultaneously: + * -bigger or equal than Min + * -a multiple of Mult */ +template +using MinMultSized = MultSized, Mult>; + +/** make T be suitable for use as a uniform buffer. (at least with DirectX). */ +template +using UbufSized = MinMultSized; + + +//----------------------------------------------------------------------------- + +#define C4_NO_COPY_CTOR(ty) ty(ty const&) = delete +#define C4_NO_MOVE_CTOR(ty) ty(ty &&) = delete +#define C4_NO_COPY_ASSIGN(ty) ty& operator=(ty const&) = delete +#define C4_NO_MOVE_ASSIGN(ty) ty& operator=(ty &&) = delete +#define C4_DEFAULT_COPY_CTOR(ty) ty(ty const&) noexcept = default +#define C4_DEFAULT_MOVE_CTOR(ty) ty(ty &&) noexcept = default +#define C4_DEFAULT_COPY_ASSIGN(ty) ty& operator=(ty const&) noexcept = default +#define C4_DEFAULT_MOVE_ASSIGN(ty) ty& operator=(ty &&) noexcept = default + +#define C4_NO_COPY_OR_MOVE_CTOR(ty) \ + C4_NO_COPY_CTOR(ty); \ + C4_NO_MOVE_CTOR(ty) + +#define C4_NO_COPY_OR_MOVE_ASSIGN(ty) \ + C4_NO_COPY_ASSIGN(ty); \ + C4_NO_MOVE_ASSIGN(ty) + +#define C4_NO_COPY_OR_MOVE(ty) \ + C4_NO_COPY_OR_MOVE_CTOR(ty); \ + C4_NO_COPY_OR_MOVE_ASSIGN(ty) + +#define C4_DEFAULT_COPY_AND_MOVE_CTOR(ty) \ + C4_DEFAULT_COPY_CTOR(ty); \ + C4_DEFAULT_MOVE_CTOR(ty) + +#define C4_DEFAULT_COPY_AND_MOVE_ASSIGN(ty) \ + C4_DEFAULT_COPY_ASSIGN(ty); \ + C4_DEFAULT_MOVE_ASSIGN(ty) + +#define C4_DEFAULT_COPY_AND_MOVE(ty) \ + C4_DEFAULT_COPY_AND_MOVE_CTOR(ty); \ + C4_DEFAULT_COPY_AND_MOVE_ASSIGN(ty) + +/** @see https://en.cppreference.com/w/cpp/named_req/TriviallyCopyable */ +#define C4_MUST_BE_TRIVIAL_COPY(ty) \ + static_assert(std::is_trivially_copyable::value, #ty " must be trivially copyable") + +/** @} */ + + +//----------------------------------------------------------------------------- + +/** @defgroup traits_types Type traits utilities + * @ingroup types + * @{ */ + +// http://stackoverflow.com/questions/10821380/is-t-an-instance-of-a-template-in-c +template class X, typename T> struct is_instance_of_tpl : std::false_type {}; +template class X, typename... Y> struct is_instance_of_tpl> : std::true_type {}; + +//----------------------------------------------------------------------------- + +/** SFINAE. use this macro to enable a template function overload +based on a compile-time condition. 
+@code +// define an overload for a non-pod type +template::value)> +void foo() { std::cout << "pod type\n"; } + +// define an overload for a non-pod type +template::value)> +void foo() { std::cout << "nonpod type\n"; } + +struct non_pod +{ + non_pod() : name("asdfkjhasdkjh") {} + const char *name; +}; + +int main() +{ + foo(); // prints "pod type" + foo(); // prints "nonpod type" +} +@endcode */ +#define C4_REQUIRE_T(cond) typename std::enable_if::type* = nullptr + +/** enable_if for a return type + * @see C4_REQUIRE_T */ +#define C4_REQUIRE_R(cond, type_) typename std::enable_if::type + +//----------------------------------------------------------------------------- +/** define a traits class reporting whether a type provides a member typedef */ +#define C4_DEFINE_HAS_TYPEDEF(member_typedef) \ +template \ +struct has_##stype \ +{ \ +private: \ + \ + typedef char yes; \ + typedef struct { char array[2]; } no; \ + \ + template \ + static yes _test(typename C::member_typedef*); \ + \ + template \ + static no _test(...); \ + \ +public: \ + \ + enum { value = (sizeof(_test(0)) == sizeof(yes)) }; \ + \ +} + + +/** @} */ + + +//----------------------------------------------------------------------------- + + +/** @defgroup type_declarations Type declaration utilities + * @ingroup types + * @{ */ + +#define _c4_DEFINE_ARRAY_TYPES_WITHOUT_ITERATOR(T, I) \ + \ + using size_type = I; \ + using ssize_type = typename std::make_signed::type; \ + using difference_type = typename std::make_signed::type; \ + \ + using value_type = T; \ + using pointer = T*; \ + using const_pointer = T const*; \ + using reference = T&; \ + using const_reference = T const& + +#define _c4_DEFINE_TUPLE_ARRAY_TYPES_WITHOUT_ITERATOR(interior_types, I) \ + \ + using size_type = I; \ + using ssize_type = typename std::make_signed::type; \ + using difference_type = typename std::make_signed::type; \ + \ + template using value_type = typename std::tuple_element< n, std::tuple>::type; \ + template using pointer = value_type*; \ + template using const_pointer = value_type const*; \ + template using reference = value_type&; \ + template using const_reference = value_type const& + + +#define _c4_DEFINE_ARRAY_TYPES(T, I) \ + \ + _c4_DEFINE_ARRAY_TYPES_WITHOUT_ITERATOR(T, I); \ + \ + using iterator = T*; \ + using const_iterator = T const*; \ + using reverse_iterator = std::reverse_iterator; \ + using const_reverse_iterator = std::reverse_iterator + + +#define _c4_DEFINE_TUPLE_ARRAY_TYPES(interior_types, I) \ + \ + _c4_DEFINE_TUPLE_ARRAY_TYPES_WITHOUT_ITERATOR(interior_types, I); \ + \ + template using iterator = value_type*; \ + template using const_iterator = value_type const*; \ + template using reverse_iterator = std::reverse_iterator< value_type*>; \ + template using const_reverse_iterator = std::reverse_iterator< value_type const*> + + + +/** @} */ + + +//----------------------------------------------------------------------------- + + +/** @defgroup compatility_utilities Backport implementation of some Modern C++ utilities + * @ingroup types + * @{ */ + +//----------------------------------------------------------------------------- +// index_sequence and friends are available only for C++14 and later. +// A C++11 implementation is provided here. +// This implementation was copied over from clang. 
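+// A minimal usage sketch of the facilities defined below (call_with_tuple is a
+// hypothetical helper): expanding a std::tuple into a function call via
+// c4::index_sequence / c4::make_index_sequence, which resolve to the std::
+// versions on C++14 and later, and to the backport below on C++11:
+//
+//   template<class F, class Tuple, size_t... I>
+//   auto call_with_tuple(F&& f, Tuple&& t, c4::index_sequence<I...>)
+//       -> decltype(f(std::get<I>(std::forward<Tuple>(t))...))
+//   {
+//       return f(std::get<I>(std::forward<Tuple>(t))...);
+//   }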
+// see http://llvm.org/viewvc/llvm-project/libcxx/trunk/include/utility?revision=211563&view=markup#l687 + +#if __cplusplus > 201103L + +using std::integer_sequence; +using std::index_sequence; +using std::make_integer_sequence; +using std::make_index_sequence; +using std::index_sequence_for; + +#else + +/** C++11 implementation of integer sequence + * @see https://en.cppreference.com/w/cpp/utility/integer_sequence + * @see taken from clang: http://llvm.org/viewvc/llvm-project/libcxx/trunk/include/utility?revision=211563&view=markup#l687 */ +template +struct integer_sequence +{ + static_assert(std::is_integral<_Tp>::value, + "std::integer_sequence can only be instantiated with an integral type" ); + using value_type = _Tp; + static constexpr size_t size() noexcept { return sizeof...(_Ip); } +}; + +/** C++11 implementation of index sequence + * @see https://en.cppreference.com/w/cpp/utility/integer_sequence + * @see taken from clang: http://llvm.org/viewvc/llvm-project/libcxx/trunk/include/utility?revision=211563&view=markup#l687 */ +template +using index_sequence = integer_sequence; + +/** @cond DONT_DOCUMENT_THIS */ +namespace __detail { + +template +struct __repeat; + +template +struct __repeat, _Extra...> +{ + using type = integer_sequence<_Tp, + _Np..., + sizeof...(_Np) + _Np..., + 2 * sizeof...(_Np) + _Np..., + 3 * sizeof...(_Np) + _Np..., + 4 * sizeof...(_Np) + _Np..., + 5 * sizeof...(_Np) + _Np..., + 6 * sizeof...(_Np) + _Np..., + 7 * sizeof...(_Np) + _Np..., + _Extra...>; +}; + +template struct __parity; +template struct __make : __parity<_Np % 8>::template __pmake<_Np> {}; + +template<> struct __make<0> { using type = integer_sequence; }; +template<> struct __make<1> { using type = integer_sequence; }; +template<> struct __make<2> { using type = integer_sequence; }; +template<> struct __make<3> { using type = integer_sequence; }; +template<> struct __make<4> { using type = integer_sequence; }; +template<> struct __make<5> { using type = integer_sequence; }; +template<> struct __make<6> { using type = integer_sequence; }; +template<> struct __make<7> { using type = integer_sequence; }; + +template<> struct __parity<0> { template struct __pmake : __repeat::type> {}; }; +template<> struct __parity<1> { template struct __pmake : __repeat::type, _Np - 1> {}; }; +template<> struct __parity<2> { template struct __pmake : __repeat::type, _Np - 2, _Np - 1> {}; }; +template<> struct __parity<3> { template struct __pmake : __repeat::type, _Np - 3, _Np - 2, _Np - 1> {}; }; +template<> struct __parity<4> { template struct __pmake : __repeat::type, _Np - 4, _Np - 3, _Np - 2, _Np - 1> {}; }; +template<> struct __parity<5> { template struct __pmake : __repeat::type, _Np - 5, _Np - 4, _Np - 3, _Np - 2, _Np - 1> {}; }; +template<> struct __parity<6> { template struct __pmake : __repeat::type, _Np - 6, _Np - 5, _Np - 4, _Np - 3, _Np - 2, _Np - 1> {}; }; +template<> struct __parity<7> { template struct __pmake : __repeat::type, _Np - 7, _Np - 6, _Np - 5, _Np - 4, _Np - 3, _Np - 2, _Np - 1> {}; }; + +template +struct __convert +{ + template struct __result; + template<_Tp ..._Np> struct __result> + { + using type = integer_sequence<_Up, _Np...>; + }; +}; + +template +struct __convert<_Tp, _Tp> +{ + template struct __result + { + using type = _Up; + }; +}; + +template +using __make_integer_sequence_unchecked = typename __detail::__convert::template __result::type>::type; + +template +struct __make_integer_sequence +{ + static_assert(std::is_integral<_Tp>::value, + "std::make_integer_sequence can only 
be instantiated with an integral type" ); + static_assert(0 <= _Ep, "std::make_integer_sequence input shall not be negative"); + typedef __make_integer_sequence_unchecked<_Tp, _Ep> type; +}; + +} // namespace __detail +/** @endcond */ + + +/** C++11 implementation of index sequence + * @see https://en.cppreference.com/w/cpp/utility/integer_sequence + * @see taken from clang: http://llvm.org/viewvc/llvm-project/libcxx/trunk/include/utility?revision=211563&view=markup#l687 */ +template +using make_integer_sequence = typename __detail::__make_integer_sequence<_Tp, _Np>::type; + +/** C++11 implementation of index sequence + * @see https://en.cppreference.com/w/cpp/utility/integer_sequence + * @see taken from clang: http://llvm.org/viewvc/llvm-project/libcxx/trunk/include/utility?revision=211563&view=markup#l687 */ +template +using make_index_sequence = make_integer_sequence; + +/** C++11 implementation of index sequence + * @see https://en.cppreference.com/w/cpp/utility/integer_sequence + * @see taken from clang: http://llvm.org/viewvc/llvm-project/libcxx/trunk/include/utility?revision=211563&view=markup#l687 */ +template +using index_sequence_for = make_index_sequence; +#endif + +/** @} */ + + +} // namespace c4 + +#endif /* _C4_TYPES_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/types.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/config.hpp +// https://github.com/biojppm/c4core/src/c4/config.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_CONFIG_HPP_ +#define _C4_CONFIG_HPP_ + +/** @defgroup basic_headers Basic headers + * @brief Headers providing basic macros, platform+cpu+compiler information, + * C++ facilities and basic typedefs. */ + +/** @file config.hpp Contains configuration defines and includes the basic_headers. 
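+ * A minimal usage sketch (illustrative only): several of the configuration
+ * macros below (C4_SIZE_TYPE, C4_STR_SIZE_TYPE, C4_TIME_TYPE) are guarded with
+ * #ifndef, so they can be overridden before this header is included, or on the
+ * compiler command line, e.g.:
+ * @code
+ * #define C4_SIZE_TYPE uint32_t   // use 32-bit sizes instead of size_t
+ * // ...then include the amalgamated header
+ * @endcode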
+ * @ingroup basic_headers */ + +//#define C4_DEBUG + +#define C4_ERROR_SHOWS_FILELINE +//#define C4_ERROR_SHOWS_FUNC +//#define C4_ERROR_THROWS_EXCEPTION +//#define C4_NO_ALLOC_DEFAULTS +//#define C4_REDEFINE_CPPNEW + +#ifndef C4_SIZE_TYPE +# define C4_SIZE_TYPE size_t +#endif + +#ifndef C4_STR_SIZE_TYPE +# define C4_STR_SIZE_TYPE C4_SIZE_TYPE +#endif + +#ifndef C4_TIME_TYPE +# define C4_TIME_TYPE double +#endif + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/export.hpp +//#include "c4/export.hpp" +#if !defined(C4_EXPORT_HPP_) && !defined(_C4_EXPORT_HPP_) +#error "amalgamate: file c4/export.hpp must have been included at this point" +#endif /* C4_EXPORT_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/preprocessor.hpp +//#include "c4/preprocessor.hpp" +#if !defined(C4_PREPROCESSOR_HPP_) && !defined(_C4_PREPROCESSOR_HPP_) +#error "amalgamate: file c4/preprocessor.hpp must have been included at this point" +#endif /* C4_PREPROCESSOR_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/platform.hpp +//#include "c4/platform.hpp" +#if !defined(C4_PLATFORM_HPP_) && !defined(_C4_PLATFORM_HPP_) +#error "amalgamate: file c4/platform.hpp must have been included at this point" +#endif /* C4_PLATFORM_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/cpu.hpp +//#include "c4/cpu.hpp" +#if !defined(C4_CPU_HPP_) && !defined(_C4_CPU_HPP_) +#error "amalgamate: file c4/cpu.hpp must have been included at this point" +#endif /* C4_CPU_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/compiler.hpp +//#include "c4/compiler.hpp" +#if !defined(C4_COMPILER_HPP_) && !defined(_C4_COMPILER_HPP_) +#error "amalgamate: file c4/compiler.hpp must have been included at this point" +#endif /* C4_COMPILER_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/language.hpp +//#include "c4/language.hpp" +#if !defined(C4_LANGUAGE_HPP_) && !defined(_C4_LANGUAGE_HPP_) +#error "amalgamate: file c4/language.hpp must have been included at this point" +#endif /* C4_LANGUAGE_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/types.hpp +//#include "c4/types.hpp" +#if !defined(C4_TYPES_HPP_) && !defined(_C4_TYPES_HPP_) +#error "amalgamate: file c4/types.hpp must have been included at this point" +#endif /* C4_TYPES_HPP_ */ + + +#endif // _C4_CONFIG_HPP_ + + +// (end https://github.com/biojppm/c4core/src/c4/config.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/ext/debugbreak/debugbreak.h +// https://github.com/biojppm/c4core/src/c4/ext/debugbreak/debugbreak.h +//-------------------------------------------------------------------------------- +//******************************************************************************** + +/* Copyright (c) 2011-2021, Scott Tsai + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef DEBUG_BREAK_H +#define DEBUG_BREAK_H + +#ifdef _MSC_VER + +#define debug_break __debugbreak + +#else + +#ifdef __cplusplus +extern "C" { +#endif + +#define DEBUG_BREAK_USE_TRAP_INSTRUCTION 1 +#define DEBUG_BREAK_USE_BULTIN_TRAP 2 +#define DEBUG_BREAK_USE_SIGTRAP 3 + +#if defined(__i386__) || defined(__x86_64__) + #define DEBUG_BREAK_IMPL DEBUG_BREAK_USE_TRAP_INSTRUCTION +__inline__ static void trap_instruction(void) +{ + __asm__ volatile("int $0x03"); +} +#elif defined(__thumb__) + #define DEBUG_BREAK_IMPL DEBUG_BREAK_USE_TRAP_INSTRUCTION +/* FIXME: handle __THUMB_INTERWORK__ */ +__attribute__((always_inline)) +__inline__ static void trap_instruction(void) +{ + /* See 'arm-linux-tdep.c' in GDB source. + * Both instruction sequences below work. */ +#if 1 + /* 'eabi_linux_thumb_le_breakpoint' */ + __asm__ volatile(".inst 0xde01"); +#else + /* 'eabi_linux_thumb2_le_breakpoint' */ + __asm__ volatile(".inst.w 0xf7f0a000"); +#endif + + /* Known problem: + * After a breakpoint hit, can't 'stepi', 'step', or 'continue' in GDB. + * 'step' would keep getting stuck on the same instruction. + * + * Workaround: use the new GDB commands 'debugbreak-step' and + * 'debugbreak-continue' that become available + * after you source the script from GDB: + * + * $ gdb -x debugbreak-gdb.py <... 
USUAL ARGUMENTS ...> + * + * 'debugbreak-step' would jump over the breakpoint instruction with + * roughly equivalent of: + * (gdb) set $instruction_len = 2 + * (gdb) tbreak *($pc + $instruction_len) + * (gdb) jump *($pc + $instruction_len) + */ +} +#elif defined(__arm__) && !defined(__thumb__) + #define DEBUG_BREAK_IMPL DEBUG_BREAK_USE_TRAP_INSTRUCTION +__attribute__((always_inline)) +__inline__ static void trap_instruction(void) +{ + /* See 'arm-linux-tdep.c' in GDB source, + * 'eabi_linux_arm_le_breakpoint' */ + __asm__ volatile(".inst 0xe7f001f0"); + /* Known problem: + * Same problem and workaround as Thumb mode */ +} +#elif defined(__aarch64__) && defined(__APPLE__) + #define DEBUG_BREAK_IMPL DEBUG_BREAK_USE_BULTIN_DEBUGTRAP +#elif defined(__aarch64__) + #define DEBUG_BREAK_IMPL DEBUG_BREAK_USE_TRAP_INSTRUCTION +__attribute__((always_inline)) +__inline__ static void trap_instruction(void) +{ + /* See 'aarch64-tdep.c' in GDB source, + * 'aarch64_default_breakpoint' */ + __asm__ volatile(".inst 0xd4200000"); +} +#elif defined(__powerpc__) + /* PPC 32 or 64-bit, big or little endian */ + #define DEBUG_BREAK_IMPL DEBUG_BREAK_USE_TRAP_INSTRUCTION +__attribute__((always_inline)) +__inline__ static void trap_instruction(void) +{ + /* See 'rs6000-tdep.c' in GDB source, + * 'rs6000_breakpoint' */ + __asm__ volatile(".4byte 0x7d821008"); + + /* Known problem: + * After a breakpoint hit, can't 'stepi', 'step', or 'continue' in GDB. + * 'step' stuck on the same instruction ("twge r2,r2"). + * + * The workaround is the same as ARM Thumb mode: use debugbreak-gdb.py + * or manually jump over the instruction. */ +} +#elif defined(__riscv) + /* RISC-V 32 or 64-bit, whether the "C" extension + * for compressed, 16-bit instructions are supported or not */ + #define DEBUG_BREAK_IMPL DEBUG_BREAK_USE_TRAP_INSTRUCTION +__attribute__((always_inline)) +__inline__ static void trap_instruction(void) +{ + /* See 'riscv-tdep.c' in GDB source, + * 'riscv_sw_breakpoint_from_kind' */ + __asm__ volatile(".4byte 0x00100073"); +} +#else + #define DEBUG_BREAK_IMPL DEBUG_BREAK_USE_SIGTRAP +#endif + + +#ifndef DEBUG_BREAK_IMPL +#error "debugbreak.h is not supported on this target" +#elif DEBUG_BREAK_IMPL == DEBUG_BREAK_USE_TRAP_INSTRUCTION +__attribute__((always_inline)) +__inline__ static void debug_break(void) +{ + trap_instruction(); +} +#elif DEBUG_BREAK_IMPL == DEBUG_BREAK_USE_BULTIN_DEBUGTRAP +__attribute__((always_inline)) +__inline__ static void debug_break(void) +{ + __builtin_debugtrap(); +} +#elif DEBUG_BREAK_IMPL == DEBUG_BREAK_USE_BULTIN_TRAP +__attribute__((always_inline)) +__inline__ static void debug_break(void) +{ + __builtin_trap(); +} +#elif DEBUG_BREAK_IMPL == DEBUG_BREAK_USE_SIGTRAP +#include +__attribute__((always_inline)) +__inline__ static void debug_break(void) +{ + raise(SIGTRAP); +} +#else +#error "invalid DEBUG_BREAK_IMPL value" +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* ifdef _MSC_VER */ + +#endif /* ifndef DEBUG_BREAK_H */ + + +// (end https://github.com/biojppm/c4core/src/c4/ext/debugbreak/debugbreak.h) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/error.hpp +// https://github.com/biojppm/c4core/src/c4/error.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_ERROR_HPP_ +#define _C4_ERROR_HPP_ + +/** @file 
error.hpp Facilities for error reporting and runtime assertions. */ + +/** @defgroup error_checking Error checking */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/config.hpp +//#include "c4/config.hpp" +#if !defined(C4_CONFIG_HPP_) && !defined(_C4_CONFIG_HPP_) +#error "amalgamate: file c4/config.hpp must have been included at this point" +#endif /* C4_CONFIG_HPP_ */ + + +#ifdef _DOXYGEN_ + /** if this is defined and exceptions are enabled, then calls to C4_ERROR() + * will throw an exception + * @ingroup error_checking */ +# define C4_EXCEPTIONS_ENABLED + /** if this is defined and exceptions are enabled, then calls to C4_ERROR() + * will throw an exception + * @see C4_EXCEPTIONS_ENABLED + * @ingroup error_checking */ +# define C4_ERROR_THROWS_EXCEPTION + /** evaluates to noexcept when C4_ERROR might be called and + * exceptions are disabled. Otherwise, defaults to nothing. + * @ingroup error_checking */ +# define C4_NOEXCEPT +#endif // _DOXYGEN_ + +#if defined(C4_EXCEPTIONS_ENABLED) && defined(C4_ERROR_THROWS_EXCEPTION) +# define C4_NOEXCEPT +#else +# define C4_NOEXCEPT noexcept +#endif + + +namespace c4 { +namespace detail { +struct fail_type__ {}; +} // detail +} // c4 +#define C4_STATIC_ERROR(dummy_type, errmsg) \ + static_assert(std::is_same::value, errmsg) + + +//----------------------------------------------------------------------------- + +#define C4_ASSERT_SAME_TYPE(ty1, ty2) \ + C4_STATIC_ASSERT(std::is_same::value) + +#define C4_ASSERT_DIFF_TYPE(ty1, ty2) \ + C4_STATIC_ASSERT( ! std::is_same::value) + + +//----------------------------------------------------------------------------- + +#ifdef _DOXYGEN_ +/** utility macro that triggers a breakpoint when + * the debugger is attached and NDEBUG is not defined. + * @ingroup error_checking */ +# define C4_DEBUG_BREAK() +#endif // _DOXYGEN_ + + +#if defined(NDEBUG) || defined(C4_NO_DEBUG_BREAK) +# define C4_DEBUG_BREAK() +#else +# ifdef __clang__ +# pragma clang diagnostic push +# if !defined(__APPLE_CC__) +# if __clang_major__ >= 10 +# pragma clang diagnostic ignored "-Wgnu-inline-cpp-without-extern" // debugbreak/debugbreak.h:50:16: error: 'gnu_inline' attribute without 'extern' in C++ treated as externally available, this changed in Clang 10 [-Werror,-Wgnu-inline-cpp-without-extern] +# endif +# else +# if __clang_major__ >= 13 +# pragma clang diagnostic ignored "-Wgnu-inline-cpp-without-extern" // debugbreak/debugbreak.h:50:16: error: 'gnu_inline' attribute without 'extern' in C++ treated as externally available, this changed in Clang 10 [-Werror,-Wgnu-inline-cpp-without-extern] +# endif +# endif +# elif defined(__GNUC__) +# endif +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/ext/debugbreak/debugbreak.h +//# include +#if !defined(DEBUG_BREAK_H) && !defined(_DEBUG_BREAK_H) +#error "amalgamate: file c4/ext/debugbreak/debugbreak.h must have been included at this point" +#endif /* DEBUG_BREAK_H */ + +# define C4_DEBUG_BREAK() if(c4::is_debugger_attached()) { ::debug_break(); } +# ifdef __clang__ +# pragma clang diagnostic pop +# elif defined(__GNUC__) +# endif +#endif + +namespace c4 { +C4CORE_EXPORT bool is_debugger_attached(); +} // namespace c4 + + +//----------------------------------------------------------------------------- + +#ifdef __clang__ + /* NOTE: using , ## __VA_ARGS__ to deal with zero-args calls to + * variadic macros is not portable, but works in clang, gcc, msvc, icc. + * clang requires switching off compiler warnings for pedantic mode. 
+ * @see http://stackoverflow.com/questions/32047685/variadic-macro-without-arguments */ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" // warning: token pasting of ',' and __VA_ARGS__ is a GNU extension +#elif defined(__GNUC__) + /* GCC also issues a warning for zero-args calls to variadic macros. + * This warning is switched on with -pedantic and apparently there is no + * easy way to turn it off as with clang. But marking this as a system + * header works. + * @see https://gcc.gnu.org/onlinedocs/cpp/System-Headers.html + * @see http://stackoverflow.com/questions/35587137/ */ +# pragma GCC system_header +#endif + + +//----------------------------------------------------------------------------- + +namespace c4 { + +typedef enum : uint32_t { + /** when an error happens and the debugger is attached, call C4_DEBUG_BREAK(). + * Without effect otherwise. */ + ON_ERROR_DEBUGBREAK = 0x01 << 0, + /** when an error happens log a message. */ + ON_ERROR_LOG = 0x01 << 1, + /** when an error happens invoke a callback if it was set with + * set_error_callback(). */ + ON_ERROR_CALLBACK = 0x01 << 2, + /** when an error happens call std::terminate(). */ + ON_ERROR_ABORT = 0x01 << 3, + /** when an error happens and exceptions are enabled throw an exception. + * Without effect otherwise. */ + ON_ERROR_THROW = 0x01 << 4, + /** the default flags. */ + ON_ERROR_DEFAULTS = ON_ERROR_DEBUGBREAK|ON_ERROR_LOG|ON_ERROR_CALLBACK|ON_ERROR_ABORT +} ErrorFlags_e; +using error_flags = uint32_t; +C4CORE_EXPORT void set_error_flags(error_flags f); +C4CORE_EXPORT error_flags get_error_flags(); + + +using error_callback_type = void (*)(const char* msg, size_t msg_size); +C4CORE_EXPORT void set_error_callback(error_callback_type cb); +C4CORE_EXPORT error_callback_type get_error_callback(); + + +//----------------------------------------------------------------------------- +/** RAII class controling the error settings inside a scope. */ +struct ScopedErrorSettings +{ + error_flags m_flags; + error_callback_type m_callback; + + explicit ScopedErrorSettings(error_callback_type cb) + : m_flags(get_error_flags()), + m_callback(get_error_callback()) + { + set_error_callback(cb); + } + explicit ScopedErrorSettings(error_flags flags) + : m_flags(get_error_flags()), + m_callback(get_error_callback()) + { + set_error_flags(flags); + } + explicit ScopedErrorSettings(error_flags flags, error_callback_type cb) + : m_flags(get_error_flags()), + m_callback(get_error_callback()) + { + set_error_flags(flags); + set_error_callback(cb); + } + ~ScopedErrorSettings() + { + set_error_flags(m_flags); + set_error_callback(m_callback); + } +}; + + +//----------------------------------------------------------------------------- + +/** source location */ +struct srcloc; + +C4CORE_EXPORT void handle_error(srcloc s, const char *fmt, ...); +C4CORE_EXPORT void handle_warning(srcloc s, const char *fmt, ...); + + +# define C4_ERROR(msg, ...) \ + do { \ + if(c4::get_error_flags() & c4::ON_ERROR_DEBUGBREAK) \ + { \ + C4_DEBUG_BREAK() \ + } \ + c4::handle_error(C4_SRCLOC(), msg, ## __VA_ARGS__); \ + } while(0) + + +# define C4_WARNING(msg, ...) 
\ + c4::handle_warning(C4_SRCLOC(), msg, ## __VA_ARGS__) + + +#if defined(C4_ERROR_SHOWS_FILELINE) && defined(C4_ERROR_SHOWS_FUNC) + +struct srcloc +{ + const char *file = ""; + const char *func = ""; + int line = 0; +}; +#define C4_SRCLOC() c4::srcloc{__FILE__, C4_PRETTY_FUNC, __LINE__} + +#elif defined(C4_ERROR_SHOWS_FILELINE) + +struct srcloc +{ + const char *file; + int line; +}; +#define C4_SRCLOC() c4::srcloc{__FILE__, __LINE__} + +#elif ! defined(C4_ERROR_SHOWS_FUNC) + +struct srcloc +{ +}; +#define C4_SRCLOC() c4::srcloc() + +#else +# error not implemented +#endif + + +//----------------------------------------------------------------------------- +// assertions + +// Doxygen needs this so that only one definition counts +#ifdef _DOXYGEN_ + /** Explicitly enables assertions, independently of NDEBUG status. + * This is meant to allow enabling assertions even when NDEBUG is defined. + * Defaults to undefined. + * @ingroup error_checking */ +# define C4_USE_ASSERT + /** assert that a condition is true; this is turned off when NDEBUG + * is defined and C4_USE_ASSERT is not true. + * @ingroup error_checking */ +# define C4_ASSERT + /** same as C4_ASSERT(), additionally prints a printf-formatted message + * @ingroup error_checking */ +# define C4_ASSERT_MSG + /** evaluates to C4_NOEXCEPT when C4_XASSERT is disabled; otherwise, defaults + * to noexcept + * @ingroup error_checking */ +# define C4_NOEXCEPT_A +#endif // _DOXYGEN_ + +#ifndef C4_USE_ASSERT +# ifdef NDEBUG +# define C4_USE_ASSERT 0 +# else +# define C4_USE_ASSERT 1 +# endif +#endif + +#if C4_USE_ASSERT +# define C4_ASSERT(cond) C4_CHECK(cond) +# define C4_ASSERT_MSG(cond, /*fmt, */...) C4_CHECK_MSG(cond, ## __VA_ARGS__) +# define C4_ASSERT_IF(predicate, cond) if(predicate) { C4_ASSERT(cond); } +# define C4_NOEXCEPT_A C4_NOEXCEPT +#else +# define C4_ASSERT(cond) +# define C4_ASSERT_MSG(cond, /*fmt, */...) +# define C4_ASSERT_IF(predicate, cond) +# define C4_NOEXCEPT_A noexcept +#endif + + +//----------------------------------------------------------------------------- +// extreme assertions + +// Doxygen needs this so that only one definition counts +#ifdef _DOXYGEN_ + /** Explicitly enables extreme assertions; this is meant to allow enabling + * assertions even when NDEBUG is defined. Defaults to undefined. + * @ingroup error_checking */ +# define C4_USE_XASSERT + /** extreme assertion: can be switched off independently of + * the regular assertion; use for example for bounds checking in hot code. + * Turned on only when C4_USE_XASSERT is defined + * @ingroup error_checking */ +# define C4_XASSERT + /** same as C4_XASSERT(), and additionally prints a printf-formatted message + * @ingroup error_checking */ +# define C4_XASSERT_MSG + /** evaluates to C4_NOEXCEPT when C4_XASSERT is disabled; otherwise, defaults to noexcept + * @ingroup error_checking */ +# define C4_NOEXCEPT_X +#endif // _DOXYGEN_ + +#ifndef C4_USE_XASSERT +# define C4_USE_XASSERT C4_USE_ASSERT +#endif + +#if C4_USE_XASSERT +# define C4_XASSERT(cond) C4_CHECK(cond) +# define C4_XASSERT_MSG(cond, /*fmt, */...) C4_CHECK_MSG(cond, ## __VA_ARGS__) +# define C4_XASSERT_IF(predicate, cond) if(predicate) { C4_XASSERT(cond); } +# define C4_NOEXCEPT_X C4_NOEXCEPT +#else +# define C4_XASSERT(cond) +# define C4_XASSERT_MSG(cond, /*fmt, */...) 
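+// A minimal usage sketch of the three check levels provided by this header:
+//   C4_CHECK(ptr != nullptr);   // never compiled out; raises C4_ERROR on failure
+//   C4_ASSERT(idx < size);      // compiled out when NDEBUG is set (unless C4_USE_ASSERT is forced on)
+//   C4_XASSERT(idx < size);     // extreme checks, eg per-element bounds checks in hot code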
+# define C4_XASSERT_IF(predicate, cond) +# define C4_NOEXCEPT_X noexcept +#endif + + +//----------------------------------------------------------------------------- +// checks: never switched-off + +/** Check that a condition is true, or raise an error when not + * true. Unlike C4_ASSERT(), this check is not disabled in non-debug + * builds. + * @see C4_ASSERT + * @ingroup error_checking + * + * @todo add constexpr-compatible compile-time assert: + * https://akrzemi1.wordpress.com/2017/05/18/asserts-in-constexpr-functions/ + */ +#define C4_CHECK(cond) \ + do { \ + if(C4_UNLIKELY(!(cond))) \ + { \ + C4_ERROR("check failed: %s", #cond); \ + } \ + } while(0) + + +/** like C4_CHECK(), and additionally log a printf-style message. + * @see C4_CHECK + * @ingroup error_checking */ +#define C4_CHECK_MSG(cond, fmt, ...) \ + do { \ + if(C4_UNLIKELY(!(cond))) \ + { \ + C4_ERROR("check failed: " #cond "\n" fmt, ## __VA_ARGS__); \ + } \ + } while(0) + + +//----------------------------------------------------------------------------- +// Common error conditions + +#define C4_NOT_IMPLEMENTED() C4_ERROR("NOT IMPLEMENTED") +#define C4_NOT_IMPLEMENTED_MSG(/*msg, */...) C4_ERROR("NOT IMPLEMENTED: " ## __VA_ARGS__) +#define C4_NOT_IMPLEMENTED_IF(condition) do { if(C4_UNLIKELY(condition)) { C4_ERROR("NOT IMPLEMENTED"); } } while(0) +#define C4_NOT_IMPLEMENTED_IF_MSG(condition, /*msg, */...) do { if(C4_UNLIKELY(condition)) { C4_ERROR("NOT IMPLEMENTED: " ## __VA_ARGS__); } } while(0) + +#define C4_NEVER_REACH() do { C4_ERROR("never reach this point"); C4_UNREACHABLE(); } while(0) +#define C4_NEVER_REACH_MSG(/*msg, */...) do { C4_ERROR("never reach this point: " ## __VA_ARGS__); C4_UNREACHABLE(); } while(0) + + + +//----------------------------------------------------------------------------- +// helpers for warning suppression +// idea adapted from https://github.com/onqtam/doctest/ + + +#ifdef C4_MSVC +#define C4_SUPPRESS_WARNING_MSVC_PUSH __pragma(warning(push)) +#define C4_SUPPRESS_WARNING_MSVC(w) __pragma(warning(disable : w)) +#define C4_SUPPRESS_WARNING_MSVC_POP __pragma(warning(pop)) +#define C4_SUPPRESS_WARNING_MSVC_WITH_PUSH(w) \ + C4_SUPPRESS_WARNING_MSVC_PUSH \ + C4_SUPPRESS_WARNING_MSVC(w) +#else // C4_MSVC +#define C4_SUPPRESS_WARNING_MSVC_PUSH +#define C4_SUPPRESS_WARNING_MSVC(w) +#define C4_SUPPRESS_WARNING_MSVC_POP +#define C4_SUPPRESS_WARNING_MSVC_WITH_PUSH(w) +#endif // C4_MSVC + + +#ifdef C4_CLANG +#define C4_PRAGMA_TO_STR(x) _Pragma(#x) +#define C4_SUPPRESS_WARNING_CLANG_PUSH _Pragma("clang diagnostic push") +#define C4_SUPPRESS_WARNING_CLANG(w) C4_PRAGMA_TO_STR(clang diagnostic ignored w) +#define C4_SUPPRESS_WARNING_CLANG_POP _Pragma("clang diagnostic pop") +#define C4_SUPPRESS_WARNING_CLANG_WITH_PUSH(w) \ + C4_SUPPRESS_WARNING_CLANG_PUSH \ + C4_SUPPRESS_WARNING_CLANG(w) +#else // C4_CLANG +#define C4_SUPPRESS_WARNING_CLANG_PUSH +#define C4_SUPPRESS_WARNING_CLANG(w) +#define C4_SUPPRESS_WARNING_CLANG_POP +#define C4_SUPPRESS_WARNING_CLANG_WITH_PUSH(w) +#endif // C4_CLANG + + +#ifdef C4_GCC +#define C4_PRAGMA_TO_STR(x) _Pragma(#x) +#define C4_SUPPRESS_WARNING_GCC_PUSH _Pragma("GCC diagnostic push") +#define C4_SUPPRESS_WARNING_GCC(w) C4_PRAGMA_TO_STR(GCC diagnostic ignored w) +#define C4_SUPPRESS_WARNING_GCC_POP _Pragma("GCC diagnostic pop") +#define C4_SUPPRESS_WARNING_GCC_WITH_PUSH(w) \ + C4_SUPPRESS_WARNING_GCC_PUSH \ + C4_SUPPRESS_WARNING_GCC(w) +#else // C4_GCC +#define C4_SUPPRESS_WARNING_GCC_PUSH +#define C4_SUPPRESS_WARNING_GCC(w) +#define C4_SUPPRESS_WARNING_GCC_POP +#define 
C4_SUPPRESS_WARNING_GCC_WITH_PUSH(w) +#endif // C4_GCC + + +#define C4_SUPPRESS_WARNING_GCC_CLANG_PUSH \ + C4_SUPPRESS_WARNING_GCC_PUSH \ + C4_SUPPRESS_WARNING_CLANG_PUSH + +#define C4_SUPPRESS_WARNING_GCC_CLANG(w) \ + C4_SUPPRESS_WARNING_GCC(w) \ + C4_SUPPRESS_WARNING_CLANG(w) + +#define C4_SUPPRESS_WARNING_GCC_CLANG_WITH_PUSH(w) \ + C4_SUPPRESS_WARNING_GCC_WITH_PUSH(w) \ + C4_SUPPRESS_WARNING_CLANG_WITH_PUSH(w) + +#define C4_SUPPRESS_WARNING_GCC_CLANG_POP \ + C4_SUPPRESS_WARNING_GCC_POP \ + C4_SUPPRESS_WARNING_CLANG_POP + +} // namespace c4 + +#ifdef __clang__ +# pragma clang diagnostic pop +#endif + +#endif /* _C4_ERROR_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/error.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/memory_util.hpp +// https://github.com/biojppm/c4core/src/c4/memory_util.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_MEMORY_UTIL_HPP_ +#define _C4_MEMORY_UTIL_HPP_ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/config.hpp +//#include "c4/config.hpp" +#if !defined(C4_CONFIG_HPP_) && !defined(_C4_CONFIG_HPP_) +#error "amalgamate: file c4/config.hpp must have been included at this point" +#endif /* C4_CONFIG_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/error.hpp +//#include "c4/error.hpp" +#if !defined(C4_ERROR_HPP_) && !defined(_C4_ERROR_HPP_) +#error "amalgamate: file c4/error.hpp must have been included at this point" +#endif /* C4_ERROR_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/compiler.hpp +//#include "c4/compiler.hpp" +#if !defined(C4_COMPILER_HPP_) && !defined(_C4_COMPILER_HPP_) +#error "amalgamate: file c4/compiler.hpp must have been included at this point" +#endif /* C4_COMPILER_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/cpu.hpp +//#include "c4/cpu.hpp" +#if !defined(C4_CPU_HPP_) && !defined(_C4_CPU_HPP_) +#error "amalgamate: file c4/cpu.hpp must have been included at this point" +#endif /* C4_CPU_HPP_ */ + +#ifdef C4_MSVC +#include +#endif +//included above: +//#include + +#if (defined(__GNUC__) && __GNUC__ >= 10) || defined(__has_builtin) +#define _C4_USE_LSB_INTRINSIC(which) __has_builtin(which) +#define _C4_USE_MSB_INTRINSIC(which) __has_builtin(which) +#elif defined(C4_MSVC) +#define _C4_USE_LSB_INTRINSIC(which) true +#define _C4_USE_MSB_INTRINSIC(which) true +#else +// let's try our luck +#define _C4_USE_LSB_INTRINSIC(which) true +#define _C4_USE_MSB_INTRINSIC(which) true +#endif + + +/** @file memory_util.hpp Some memory utilities. 
*/ + +namespace c4 { + +/** set the given memory to zero */ +C4_ALWAYS_INLINE void mem_zero(void* mem, size_t num_bytes) +{ + memset(mem, 0, num_bytes); +} +/** set the given memory to zero */ +template +C4_ALWAYS_INLINE void mem_zero(T* mem, size_t num_elms) +{ + memset(mem, 0, sizeof(T) * num_elms); +} +/** set the given memory to zero */ +template +C4_ALWAYS_INLINE void mem_zero(T* mem) +{ + memset(mem, 0, sizeof(T)); +} + +C4_ALWAYS_INLINE C4_CONST bool mem_overlaps(void const* a, void const* b, size_t sza, size_t szb) +{ + // thanks @timwynants + return (((const char*)b + szb) > a && b < ((const char*)a+sza)); +} + +void mem_repeat(void* dest, void const* pattern, size_t pattern_size, size_t num_times); + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +template +C4_ALWAYS_INLINE C4_CONST bool is_aligned(T *ptr, uintptr_t alignment=alignof(T)) +{ + return (uintptr_t(ptr) & (alignment - uintptr_t(1))) == uintptr_t(0); +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// least significant bit + +/** @name msb Compute the least significant bit + * @note the input value must be nonzero + * @note the input type must be unsigned + */ +/** @{ */ + +// https://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightLinear +#define _c4_lsb_fallback \ + unsigned c = 0; \ + v = (v ^ (v - 1)) >> 1; /* Set v's trailing 0s to 1s and zero rest */ \ + for(; v; ++c) \ + v >>= 1; \ + return (unsigned) c + +// u8 +template +C4_CONSTEXPR14 +auto lsb(I v) noexcept + -> typename std::enable_if::type +{ + C4_STATIC_ASSERT(std::is_unsigned::value); + C4_ASSERT(v != 0); + #if _C4_USE_LSB_INTRINSIC(__builtin_ctz) + // upcast to use the intrinsic, it's cheaper. + #ifdef C4_MSVC + #if !defined(C4_CPU_ARM64) && !defined(C4_CPU_ARM) + unsigned long bit; + _BitScanForward(&bit, (unsigned long)v); + return bit; + #else + _c4_lsb_fallback; + #endif + #else + return (unsigned)__builtin_ctz((unsigned)v); + #endif + #else + _c4_lsb_fallback; + #endif +} + +// u16 +template +C4_CONSTEXPR14 +auto lsb(I v) noexcept + -> typename std::enable_if::type +{ + C4_STATIC_ASSERT(std::is_unsigned::value); + C4_ASSERT(v != 0); + #if _C4_USE_LSB_INTRINSIC(__builtin_ctz) + // upcast to use the intrinsic, it's cheaper. 
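+ // (note added for clarity, not from the upstream header) zero-extending the
+ // narrow value before __builtin_ctz is safe: the trailing-zero count depends
+ // only on the low-order bits, which the upcast preserves, and the value is
+ // asserted to be nonzero above.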
+ // Then remember that the upcast makes it to 31bits + #ifdef C4_MSVC + #if !defined(C4_CPU_ARM64) && !defined(C4_CPU_ARM) + unsigned long bit; + _BitScanForward(&bit, (unsigned long)v); + return bit; + #else + _c4_lsb_fallback; + #endif + #else + return (unsigned)__builtin_ctz((unsigned)v); + #endif + #else + _c4_lsb_fallback; + #endif +} + +// u32 +template +C4_CONSTEXPR14 +auto lsb(I v) noexcept + -> typename std::enable_if::type +{ + C4_STATIC_ASSERT(std::is_unsigned::value); + C4_ASSERT(v != 0); + #if _C4_USE_LSB_INTRINSIC(__builtin_ctz) + #ifdef C4_MSVC + #if !defined(C4_CPU_ARM64) && !defined(C4_CPU_ARM) + unsigned long bit; + _BitScanForward(&bit, v); + return bit; + #else + _c4_lsb_fallback; + #endif + #else + return (unsigned)__builtin_ctz((unsigned)v); + #endif + #else + _c4_lsb_fallback; + #endif +} + +// u64 in 64bits +template +C4_CONSTEXPR14 +auto lsb(I v) noexcept + -> typename std::enable_if::type +{ + C4_STATIC_ASSERT(std::is_unsigned::value); + C4_ASSERT(v != 0); + #if _C4_USE_LSB_INTRINSIC(__builtin_ctzl) + #if defined(C4_MSVC) + #if !defined(C4_CPU_ARM64) && !defined(C4_CPU_ARM) + unsigned long bit; + _BitScanForward64(&bit, v); + return bit; + #else + _c4_lsb_fallback; + #endif + #else + return (unsigned)__builtin_ctzl((unsigned long)v); + #endif + #else + _c4_lsb_fallback; + #endif +} + +// u64 in 32bits +template +C4_CONSTEXPR14 +auto lsb(I v) noexcept + -> typename std::enable_if::type +{ + C4_STATIC_ASSERT(std::is_unsigned::value); + C4_ASSERT(v != 0); + #if _C4_USE_LSB_INTRINSIC(__builtin_ctzll) + #if defined(C4_MSVC) + #if !defined(C4_CPU_X86) && !defined(C4_CPU_ARM64) && !defined(C4_CPU_ARM) + unsigned long bit; + _BitScanForward64(&bit, v); + return bit; + #else + _c4_lsb_fallback; + #endif + #else + return (unsigned)__builtin_ctzll((unsigned long long)v); + #endif + #else + _c4_lsb_fallback; + #endif +} + +#undef _c4_lsb_fallback + +/** @} */ + + +namespace detail { +template struct _lsb11; +template +struct _lsb11 +{ + enum : unsigned { num = _lsb11>1), num_bits+I(1), (((val>>1)&I(1))!=I(0))>::num }; +}; +template +struct _lsb11 +{ + enum : unsigned { num = num_bits }; +}; +} // namespace detail + + +/** TMP version of lsb(); this needs to be implemented with template + * meta-programming because C++11 cannot use a constexpr function with + * local variables + * @see lsb */ +template +struct lsb11 +{ + static_assert(number != 0, "lsb: number must be nonzero"); + enum : unsigned { value = detail::_lsb11::num}; +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// most significant bit + + +/** @name msb Compute the most significant bit + * @note the input value must be nonzero + * @note the input type must be unsigned + */ +/** @{ */ + + +#define _c4_msb8_fallback \ + unsigned n = 0; \ + if(v & I(0xf0)) v >>= 4, n |= I(4); \ + if(v & I(0x0c)) v >>= 2, n |= I(2); \ + if(v & I(0x02)) v >>= 1, n |= I(1); \ + return n + +#define _c4_msb16_fallback \ + unsigned n = 0; \ + if(v & I(0xff00)) v >>= 8, n |= I(8); \ + if(v & I(0x00f0)) v >>= 4, n |= I(4); \ + if(v & I(0x000c)) v >>= 2, n |= I(2); \ + if(v & I(0x0002)) v >>= 1, n |= I(1); \ + return n + +#define _c4_msb32_fallback \ + unsigned n = 0; \ + if(v & I(0xffff0000)) v >>= 16, n |= 16; \ + if(v & I(0x0000ff00)) v >>= 8, n |= 8; \ + if(v & I(0x000000f0)) v >>= 4, n |= 4; \ + if(v & I(0x0000000c)) v >>= 2, n |= 2; \ 
+ if(v & I(0x00000002)) v >>= 1, n |= 1; \ + return n + +#define _c4_msb64_fallback \ + unsigned n = 0; \ + if(v & I(0xffffffff00000000)) v >>= 32, n |= I(32); \ + if(v & I(0x00000000ffff0000)) v >>= 16, n |= I(16); \ + if(v & I(0x000000000000ff00)) v >>= 8, n |= I(8); \ + if(v & I(0x00000000000000f0)) v >>= 4, n |= I(4); \ + if(v & I(0x000000000000000c)) v >>= 2, n |= I(2); \ + if(v & I(0x0000000000000002)) v >>= 1, n |= I(1); \ + return n + + +// u8 +template +C4_CONSTEXPR14 +auto msb(I v) noexcept + -> typename std::enable_if::type +{ + C4_STATIC_ASSERT(std::is_unsigned::value); + C4_ASSERT(v != 0); + #if _C4_USE_MSB_INTRINSIC(__builtin_clz) + // upcast to use the intrinsic, it's cheaper. + // Then remember that the upcast makes it to 31bits + #ifdef C4_MSVC + #if !defined(C4_CPU_ARM64) && !defined(C4_CPU_ARM) + unsigned long bit; + _BitScanReverse(&bit, (unsigned long)v); + return bit; + #else + _c4_msb8_fallback; + #endif + #else + return 31u - (unsigned)__builtin_clz((unsigned)v); + #endif + #else + _c4_msb8_fallback; + #endif +} + +// u16 +template +C4_CONSTEXPR14 +auto msb(I v) noexcept + -> typename std::enable_if::type +{ + C4_STATIC_ASSERT(std::is_unsigned::value); + C4_ASSERT(v != 0); + #if _C4_USE_MSB_INTRINSIC(__builtin_clz) + // upcast to use the intrinsic, it's cheaper. + // Then remember that the upcast makes it to 31bits + #ifdef C4_MSVC + #if !defined(C4_CPU_ARM64) && !defined(C4_CPU_ARM) + unsigned long bit; + _BitScanReverse(&bit, (unsigned long)v); + return bit; + #else + _c4_msb16_fallback; + #endif + #else + return 31u - (unsigned)__builtin_clz((unsigned)v); + #endif + #else + _c4_msb16_fallback; + #endif +} + +// u32 +template +C4_CONSTEXPR14 +auto msb(I v) noexcept + -> typename std::enable_if::type +{ + C4_STATIC_ASSERT(std::is_unsigned::value); + C4_ASSERT(v != 0); + #if _C4_USE_MSB_INTRINSIC(__builtin_clz) + #ifdef C4_MSVC + #if !defined(C4_CPU_ARM64) && !defined(C4_CPU_ARM) + unsigned long bit; + _BitScanReverse(&bit, v); + return bit; + #else + _c4_msb32_fallback; + #endif + #else + return 31u - (unsigned)__builtin_clz((unsigned)v); + #endif + #else + _c4_msb32_fallback; + #endif +} + +// u64 in 64bits +template +C4_CONSTEXPR14 +auto msb(I v) noexcept + -> typename std::enable_if::type +{ + C4_STATIC_ASSERT(std::is_unsigned::value); + C4_ASSERT(v != 0); + #if _C4_USE_MSB_INTRINSIC(__builtin_clzl) + #ifdef C4_MSVC + #if !defined(C4_CPU_ARM64) && !defined(C4_CPU_ARM) + unsigned long bit; + _BitScanReverse64(&bit, v); + return bit; + #else + _c4_msb64_fallback; + #endif + #else + return 63u - (unsigned)__builtin_clzl((unsigned long)v); + #endif + #else + _c4_msb64_fallback; + #endif +} + +// u64 in 32bits +template +C4_CONSTEXPR14 +auto msb(I v) noexcept + -> typename std::enable_if::type +{ + C4_STATIC_ASSERT(std::is_unsigned::value); + C4_ASSERT(v != 0); + #if _C4_USE_MSB_INTRINSIC(__builtin_clzll) + #ifdef C4_MSVC + #if !defined(C4_CPU_X86) && !defined(C4_CPU_ARM64) && !defined(C4_CPU_ARM) + unsigned long bit; + _BitScanReverse64(&bit, v); + return bit; + #else + _c4_msb64_fallback; + #endif + #else + return 63u - (unsigned)__builtin_clzll((unsigned long long)v); + #endif + #else + _c4_msb64_fallback; + #endif +} + +#undef _c4_msb8_fallback +#undef _c4_msb16_fallback +#undef _c4_msb32_fallback +#undef _c4_msb64_fallback + +/** @} */ + + +namespace detail { +template struct _msb11; +template +struct _msb11< I, val, num_bits, false> +{ + enum : unsigned { num = _msb11>1), num_bits+I(1), ((val>>1)==I(0))>::num }; +}; +template +struct _msb11 +{ + 
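+ // (note added for clarity, not from the upstream header) terminating
+ // specialization: the recursion above shifts the value right until it
+ // reaches zero after num_bits steps, so the highest set bit of the original
+ // value is at index num_bits - 1.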
static_assert(val == 0, "bad implementation"); + enum : unsigned { num = (unsigned)(num_bits-1) }; +}; +} // namespace detail + + +/** TMP version of msb(); this needs to be implemented with template + * meta-programming because C++11 cannot use a constexpr function with + * local variables + * @see msb */ +template +struct msb11 +{ + enum : unsigned { value = detail::_msb11::num }; +}; + + + +#undef _C4_USE_LSB_INTRINSIC +#undef _C4_USE_MSB_INTRINSIC + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +// there is an implicit conversion below; it happens when E or B are +// narrower than int, and thus any operation will upcast the result to +// int, and then downcast to assign +C4_SUPPRESS_WARNING_GCC_CLANG_WITH_PUSH("-Wconversion") + +/** integer power; this function is constexpr-14 because of the local + * variables */ +template +C4_CONSTEXPR14 C4_CONST auto ipow(B base, E exponent) noexcept -> typename std::enable_if::value, B>::type +{ + C4_STATIC_ASSERT(std::is_integral::value); + B r = B(1); + if(exponent >= 0) + { + for(E e = 0; e < exponent; ++e) + r *= base; + } + else + { + exponent *= E(-1); + for(E e = 0; e < exponent; ++e) + r /= base; + } + return r; +} + +/** integer power; this function is constexpr-14 because of the local + * variables */ +template +C4_CONSTEXPR14 C4_CONST auto ipow(E exponent) noexcept -> typename std::enable_if::value, B>::type +{ + C4_STATIC_ASSERT(std::is_integral::value); + B r = B(1); + if(exponent >= 0) + { + for(E e = 0; e < exponent; ++e) + r *= base; + } + else + { + exponent *= E(-1); + for(E e = 0; e < exponent; ++e) + r /= base; + } + return r; +} + +/** integer power; this function is constexpr-14 because of the local + * variables */ +template +C4_CONSTEXPR14 C4_CONST auto ipow(E exponent) noexcept -> typename std::enable_if::value, B>::type +{ + C4_STATIC_ASSERT(std::is_integral::value); + B r = B(1); + B bbase = B(base); + if(exponent >= 0) + { + for(E e = 0; e < exponent; ++e) + r *= bbase; + } + else + { + exponent *= E(-1); + for(E e = 0; e < exponent; ++e) + r /= bbase; + } + return r; +} + +/** integer power; this function is constexpr-14 because of the local + * variables */ +template +C4_CONSTEXPR14 C4_CONST auto ipow(B base, E exponent) noexcept -> typename std::enable_if::value, B>::type +{ + C4_STATIC_ASSERT(std::is_integral::value); + B r = B(1); + for(E e = 0; e < exponent; ++e) + r *= base; + return r; +} + +/** integer power; this function is constexpr-14 because of the local + * variables */ +template +C4_CONSTEXPR14 C4_CONST auto ipow(E exponent) noexcept -> typename std::enable_if::value, B>::type +{ + C4_STATIC_ASSERT(std::is_integral::value); + B r = B(1); + for(E e = 0; e < exponent; ++e) + r *= base; + return r; +} +/** integer power; this function is constexpr-14 because of the local + * variables */ +template +C4_CONSTEXPR14 C4_CONST auto ipow(E exponent) noexcept -> typename std::enable_if::value, B>::type +{ + C4_STATIC_ASSERT(std::is_integral::value); + B r = B(1); + B bbase = B(base); + for(E e = 0; e < exponent; ++e) + r *= bbase; + return r; +} + +C4_SUPPRESS_WARNING_GCC_CLANG_POP + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- 
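+// (illustrative sketch added for clarity; not part of the upstream header)
+// the TMP counterparts of lsb()/msb() are usable in constant expressions, so
+// their behaviour can be pinned down with compile-time checks:
+static_assert(lsb11<unsigned, 0x40u>::value == 6u, "lowest set bit of 0x40 is bit 6");
+static_assert(msb11<unsigned, 0x41u>::value == 6u, "highest set bit of 0x41 is bit 6");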
+//----------------------------------------------------------------------------- + +/** return a mask with all bits set [first_bit,last_bit[; this function + * is constexpr-14 because of the local variables */ +template +C4_CONSTEXPR14 I contiguous_mask(I first_bit, I last_bit) +{ + I r = 0; + for(I i = first_bit; i < last_bit; ++i) + { + r |= (I(1) << i); + } + return r; +} + + +namespace detail { + +template +struct _ctgmsk11; + +template +struct _ctgmsk11< I, val, first, last, true> +{ + enum : I { value = _ctgmsk11::value }; +}; + +template +struct _ctgmsk11< I, val, first, last, false> +{ + enum : I { value = val }; +}; + +} // namespace detail + + +/** TMP version of contiguous_mask(); this needs to be implemented with template + * meta-programming because C++11 cannot use a constexpr function with + * local variables + * @see contiguous_mask */ +template +struct contiguous_mask11 +{ + enum : I { value = detail::_ctgmsk11::value }; +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +/** use Empty Base Class Optimization to reduce the size of a pair of + * potentially empty types*/ + +namespace detail { +typedef enum { + tpc_same, + tpc_same_empty, + tpc_both_empty, + tpc_first_empty, + tpc_second_empty, + tpc_general +} TightPairCase_e; + +template +constexpr TightPairCase_e tpc_which_case() +{ + return std::is_same::value ? + std::is_empty::value ? + tpc_same_empty + : + tpc_same + : + std::is_empty::value && std::is_empty::value ? + tpc_both_empty + : + std::is_empty::value ? + tpc_first_empty + : + std::is_empty::value ? + tpc_second_empty + : + tpc_general + ; +} + +template +struct tight_pair +{ +private: + + First m_first; + Second m_second; + +public: + + using first_type = First; + using second_type = Second; + + tight_pair() : m_first(), m_second() {} + tight_pair(First const& f, Second const& s) : m_first(f), m_second(s) {} + + C4_ALWAYS_INLINE C4_CONSTEXPR14 First & first () { return m_first; } + C4_ALWAYS_INLINE C4_CONSTEXPR14 First const& first () const { return m_first; } + C4_ALWAYS_INLINE C4_CONSTEXPR14 Second & second() { return m_second; } + C4_ALWAYS_INLINE C4_CONSTEXPR14 Second const& second() const { return m_second; } +}; + +template +struct tight_pair : public First +{ + static_assert(std::is_same::value, "bad implementation"); + + using first_type = First; + using second_type = Second; + + tight_pair() : First() {} + tight_pair(First const& f, Second const& /*s*/) : First(f) {} + + C4_ALWAYS_INLINE C4_CONSTEXPR14 First & first () { return static_cast(*this); } + C4_ALWAYS_INLINE C4_CONSTEXPR14 First const& first () const { return static_cast(*this); } + C4_ALWAYS_INLINE C4_CONSTEXPR14 Second & second() { return reinterpret_cast(*this); } + C4_ALWAYS_INLINE C4_CONSTEXPR14 Second const& second() const { return reinterpret_cast(*this); } +}; + +template +struct tight_pair : public First, public Second +{ + using first_type = First; + using second_type = Second; + + tight_pair() : First(), Second() {} + tight_pair(First const& f, Second const& s) : First(f), Second(s) {} + + C4_ALWAYS_INLINE C4_CONSTEXPR14 First & first () { return static_cast(*this); } + C4_ALWAYS_INLINE C4_CONSTEXPR14 First const& first () const { return static_cast(*this); } + C4_ALWAYS_INLINE C4_CONSTEXPR14 Second & second() { return static_cast(*this); } + C4_ALWAYS_INLINE C4_CONSTEXPR14 Second 
const& second() const { return static_cast(*this); } +}; + +template +struct tight_pair : public First +{ + Second m_second; + + using first_type = First; + using second_type = Second; + + tight_pair() : First() {} + tight_pair(First const& f, Second const& s) : First(f), m_second(s) {} + + C4_ALWAYS_INLINE C4_CONSTEXPR14 First & first () { return static_cast(*this); } + C4_ALWAYS_INLINE C4_CONSTEXPR14 First const& first () const { return static_cast(*this); } + C4_ALWAYS_INLINE C4_CONSTEXPR14 Second & second() { return m_second; } + C4_ALWAYS_INLINE C4_CONSTEXPR14 Second const& second() const { return m_second; } +}; + +template +struct tight_pair : public First +{ + Second m_second; + + using first_type = First; + using second_type = Second; + + tight_pair() : First(), m_second() {} + tight_pair(First const& f, Second const& s) : First(f), m_second(s) {} + + C4_ALWAYS_INLINE C4_CONSTEXPR14 First & first () { return static_cast(*this); } + C4_ALWAYS_INLINE C4_CONSTEXPR14 First const& first () const { return static_cast(*this); } + C4_ALWAYS_INLINE C4_CONSTEXPR14 Second & second() { return m_second; } + C4_ALWAYS_INLINE C4_CONSTEXPR14 Second const& second() const { return m_second; } +}; + +template +struct tight_pair : public Second +{ + First m_first; + + using first_type = First; + using second_type = Second; + + tight_pair() : Second(), m_first() {} + tight_pair(First const& f, Second const& s) : Second(s), m_first(f) {} + + C4_ALWAYS_INLINE C4_CONSTEXPR14 First & first () { return m_first; } + C4_ALWAYS_INLINE C4_CONSTEXPR14 First const& first () const { return m_first; } + C4_ALWAYS_INLINE C4_CONSTEXPR14 Second & second() { return static_cast(*this); } + C4_ALWAYS_INLINE C4_CONSTEXPR14 Second const& second() const { return static_cast(*this); } +}; + +} // namespace detail + +template +using tight_pair = detail::tight_pair()>; + +} // namespace c4 + +#endif /* _C4_MEMORY_UTIL_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/memory_util.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/memory_resource.hpp +// https://github.com/biojppm/c4core/src/c4/memory_resource.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_MEMORY_RESOURCE_HPP_ +#define _C4_MEMORY_RESOURCE_HPP_ + +/** @file memory_resource.hpp Provides facilities to allocate typeless + * memory, via the memory resource model consecrated with C++17. 
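+ *
+ * (illustrative usage sketch added for clarity, not from the upstream header;
+ * it only uses the aligned-allocation API declared below)
+ * @code{.cpp}
+ *   void *p = c4::aalloc(1024, 64);       // 1024 bytes, 64-byte aligned
+ *   p = c4::arealloc(p, 1024, 2048, 64);  // grow the block, keeping alignment
+ *   c4::afree(p);
+ * @endcode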
*/ + +/** @defgroup memory memory utilities */ + +/** @defgroup raw_memory_alloc Raw memory allocation + * @ingroup memory + */ + +/** @defgroup memory_resources Memory resources + * @ingroup memory + */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/config.hpp +//#include "c4/config.hpp" +#if !defined(C4_CONFIG_HPP_) && !defined(_C4_CONFIG_HPP_) +#error "amalgamate: file c4/config.hpp must have been included at this point" +#endif /* C4_CONFIG_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/error.hpp +//#include "c4/error.hpp" +#if !defined(C4_ERROR_HPP_) && !defined(_C4_ERROR_HPP_) +#error "amalgamate: file c4/error.hpp must have been included at this point" +#endif /* C4_ERROR_HPP_ */ + + +namespace c4 { + +// need these forward decls here +struct MemoryResource; +struct MemoryResourceMalloc; +struct MemoryResourceStack; +MemoryResourceMalloc* get_memory_resource_malloc(); +MemoryResourceStack* get_memory_resource_stack(); +namespace detail { MemoryResource*& get_memory_resource(); } + + +// c-style allocation --------------------------------------------------------- + +// this API provides aligned allocation functions. +// These functions forward the call to a user-modifiable function. + + +// aligned allocation. + +/** Aligned allocation. Merely calls the current get_aalloc() function. + * @see get_aalloc() + * @ingroup raw_memory_alloc */ +void* aalloc(size_t sz, size_t alignment); + +/** Aligned free. Merely calls the current get_afree() function. + * @see get_afree() + * @ingroup raw_memory_alloc */ +void afree(void* ptr); + +/** Aligned reallocation. Merely calls the current get_arealloc() function. + * @see get_arealloc() + * @ingroup raw_memory_alloc */ +void* arealloc(void* ptr, size_t oldsz, size_t newsz, size_t alignment); + + +// allocation setup facilities. + +/** Function pointer type for aligned allocation + * @see set_aalloc() + * @ingroup raw_memory_alloc */ +using aalloc_pfn = void* (*)(size_t size, size_t alignment); + +/** Function pointer type for aligned deallocation + * @see set_afree() + * @ingroup raw_memory_alloc */ +using afree_pfn = void (*)(void *ptr); + +/** Function pointer type for aligned reallocation + * @see set_arealloc() + * @ingroup raw_memory_alloc */ +using arealloc_pfn = void* (*)(void *ptr, size_t oldsz, size_t newsz, size_t alignment); + + +// allocation function pointer setters/getters + +/** Set the global aligned allocation function. + * @see aalloc() + * @see get_aalloc() + * @ingroup raw_memory_alloc */ +void set_aalloc(aalloc_pfn fn); + +/** Set the global aligned deallocation function. + * @see afree() + * @see get_afree() + * @ingroup raw_memory_alloc */ +void set_afree(afree_pfn fn); + +/** Set the global aligned reallocation function. + * @see arealloc() + * @see get_arealloc() + * @ingroup raw_memory_alloc */ +void set_arealloc(arealloc_pfn fn); + + +/** Get the global aligned reallocation function. + * @see arealloc() + * @ingroup raw_memory_alloc */ +aalloc_pfn get_aalloc(); + +/** Get the global aligned deallocation function. + * @see afree() + * @ingroup raw_memory_alloc */ +afree_pfn get_afree(); + +/** Get the global aligned reallocation function. 
+ * @see arealloc() + * @ingroup raw_memory_alloc */ +arealloc_pfn get_arealloc(); + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// c++-style allocation ------------------------------------------------------- + +/** C++17-style memory_resource base class. See http://en.cppreference.com/w/cpp/experimental/memory_resource + * @ingroup memory_resources */ +struct MemoryResource +{ + const char *name = nullptr; + virtual ~MemoryResource() {} + + void* allocate(size_t sz, size_t alignment=alignof(max_align_t), void *hint=nullptr) + { + void *mem = this->do_allocate(sz, alignment, hint); + C4_CHECK_MSG(mem != nullptr, "could not allocate %lu bytes", sz); + return mem; + } + + void* reallocate(void* ptr, size_t oldsz, size_t newsz, size_t alignment=alignof(max_align_t)) + { + void *mem = this->do_reallocate(ptr, oldsz, newsz, alignment); + C4_CHECK_MSG(mem != nullptr, "could not reallocate from %lu to %lu bytes", oldsz, newsz); + return mem; + } + + void deallocate(void* ptr, size_t sz, size_t alignment=alignof(max_align_t)) + { + this->do_deallocate(ptr, sz, alignment); + } + +protected: + + virtual void* do_allocate(size_t sz, size_t alignment, void* hint) = 0; + virtual void* do_reallocate(void* ptr, size_t oldsz, size_t newsz, size_t alignment) = 0; + virtual void do_deallocate(void* ptr, size_t sz, size_t alignment) = 0; + +}; + +/** get the current global memory resource. To avoid static initialization + * order problems, this is implemented using a function call to ensure + * that it is available when first used. + * @ingroup memory_resources */ +C4_ALWAYS_INLINE MemoryResource* get_memory_resource() +{ + return detail::get_memory_resource(); +} + +/** set the global memory resource + * @ingroup memory_resources */ +C4_ALWAYS_INLINE void set_memory_resource(MemoryResource* mr) +{ + C4_ASSERT(mr != nullptr); + detail::get_memory_resource() = mr; +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +/** A c4::aalloc-based memory resource. Thread-safe if the implementation + * called by c4::aalloc() is safe. 
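+ * (illustrative usage sketch added for clarity, not from the upstream header)
+ * @code{.cpp}
+ *   c4::MemoryResource *mr = c4::get_memory_resource_malloc();
+ *   void *p = mr->allocate(256, alignof(double));
+ *   mr->deallocate(p, 256, alignof(double));
+ * @endcode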
+ * @ingroup memory_resources */ +struct MemoryResourceMalloc : public MemoryResource +{ + + MemoryResourceMalloc() { name = "malloc"; } + virtual ~MemoryResourceMalloc() override {} + +protected: + + virtual void* do_allocate(size_t sz, size_t alignment, void *hint) override + { + C4_UNUSED(hint); + return c4::aalloc(sz, alignment); + } + + virtual void do_deallocate(void* ptr, size_t sz, size_t alignment) override + { + C4_UNUSED(sz); + C4_UNUSED(alignment); + c4::afree(ptr); + } + + virtual void* do_reallocate(void* ptr, size_t oldsz, size_t newsz, size_t alignment) override + { + return c4::arealloc(ptr, oldsz, newsz, alignment); + } + +}; + +/** returns a malloc-based memory resource + * @ingroup memory_resources */ +C4_ALWAYS_INLINE MemoryResourceMalloc* get_memory_resource_malloc() +{ + /** @todo use a nifty counter: + * https://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Nifty_Counter */ + static MemoryResourceMalloc mr; + return &mr; +} + +namespace detail { +C4_ALWAYS_INLINE MemoryResource* & get_memory_resource() +{ + /** @todo use a nifty counter: + * https://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Nifty_Counter */ + thread_local static MemoryResource* mr = get_memory_resource_malloc(); + return mr; +} +} // namespace detail + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +namespace detail { + +/** Allows a memory resource to obtain its memory from another memory resource. + * @ingroup memory_resources */ +struct DerivedMemoryResource : public MemoryResource +{ +public: + + DerivedMemoryResource(MemoryResource *mr_=nullptr) : m_local(mr_ ? 
mr_ : get_memory_resource()) {} + +private: + + MemoryResource *m_local; + +protected: + + virtual void* do_allocate(size_t sz, size_t alignment, void* hint) override + { + return m_local->allocate(sz, alignment, hint); + } + + virtual void* do_reallocate(void* ptr, size_t oldsz, size_t newsz, size_t alignment) override + { + return m_local->reallocate(ptr, oldsz, newsz, alignment); + } + + virtual void do_deallocate(void* ptr, size_t sz, size_t alignment) override + { + return m_local->deallocate(ptr, sz, alignment); + } +}; + +/** Provides common facilities for memory resource consisting of a single memory block + * @ingroup memory_resources */ +struct _MemoryResourceSingleChunk : public DerivedMemoryResource +{ + + C4_NO_COPY_OR_MOVE(_MemoryResourceSingleChunk); + + using impl_type = DerivedMemoryResource; + +public: + + _MemoryResourceSingleChunk(MemoryResource *impl=nullptr) : DerivedMemoryResource(impl) { name = "linear_malloc"; } + + /** initialize with owned memory, allocated from the given (or the global) memory resource */ + _MemoryResourceSingleChunk(size_t sz, MemoryResource *impl=nullptr) : _MemoryResourceSingleChunk(impl) { acquire(sz); } + /** initialize with borrowed memory */ + _MemoryResourceSingleChunk(void *mem, size_t sz) : _MemoryResourceSingleChunk() { acquire(mem, sz); } + + virtual ~_MemoryResourceSingleChunk() override { release(); } + +public: + + void const* mem() const { return m_mem; } + + size_t capacity() const { return m_size; } + size_t size() const { return m_pos; } + size_t slack() const { C4_ASSERT(m_size >= m_pos); return m_size - m_pos; } + +public: + + char *m_mem{nullptr}; + size_t m_size{0}; + size_t m_pos{0}; + bool m_owner; + +public: + + /** set the internal pointer to the beginning of the linear buffer */ + void clear() { m_pos = 0; } + + /** initialize with owned memory, allocated from the global memory resource */ + void acquire(size_t sz); + /** initialize with borrowed memory */ + void acquire(void *mem, size_t sz); + /** release the memory */ + void release(); + +}; + +} // namespace detail + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +/** provides a linear memory resource. Allocates incrementally from a linear + * buffer, without ever deallocating. Deallocations are a no-op, and the + * memory is freed only when the resource is release()d. The memory used by + * this object can be either owned or borrowed. When borrowed, no calls to + * malloc/free take place. + * + * @ingroup memory_resources */ +struct MemoryResourceLinear : public detail::_MemoryResourceSingleChunk +{ + + C4_NO_COPY_OR_MOVE(MemoryResourceLinear); + +public: + + using detail::_MemoryResourceSingleChunk::_MemoryResourceSingleChunk; + +protected: + + virtual void* do_allocate(size_t sz, size_t alignment, void *hint) override; + virtual void do_deallocate(void* ptr, size_t sz, size_t alignment) override; + virtual void* do_reallocate(void* ptr, size_t oldsz, size_t newsz, size_t alignment) override; +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +/** provides a stack-type malloc-based memory resource. 
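+ * (illustrative usage sketch added for clarity, not from the upstream header)
+ * Like MemoryResourceLinear above, it is built on _MemoryResourceSingleChunk,
+ * so the backing chunk can be owned or borrowed:
+ * @code{.cpp}
+ *   char buf[256];
+ *   c4::MemoryResourceStack mr(buf, sizeof(buf)); // borrowed: no malloc/free
+ *   void *p = mr.allocate(64);
+ * @endcode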
+ * @ingroup memory_resources */ +struct MemoryResourceStack : public detail::_MemoryResourceSingleChunk +{ + + C4_NO_COPY_OR_MOVE(MemoryResourceStack); + +public: + + using detail::_MemoryResourceSingleChunk::_MemoryResourceSingleChunk; + +protected: + + virtual void* do_allocate(size_t sz, size_t alignment, void *hint) override; + virtual void do_deallocate(void* ptr, size_t sz, size_t alignment) override; + virtual void* do_reallocate(void* ptr, size_t oldsz, size_t newsz, size_t alignment) override; +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +/** provides a linear array-based memory resource. + * @see MemoryResourceLinear + * @ingroup memory_resources */ +template +struct MemoryResourceLinearArr : public MemoryResourceLinear +{ + #ifdef _MSC_VER + #pragma warning(push) + #pragma warning(disable: 4324) // structure was padded due to alignment specifier + #endif + alignas(alignof(max_align_t)) char m_arr[N]; + #ifdef _MSC_VER + #pragma warning(pop) + #endif + MemoryResourceLinearArr() : MemoryResourceLinear(m_arr, N) { name = "linear_arr"; } +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +struct AllocationCounts +{ + struct Item + { + ssize_t allocs; + ssize_t size; + + void add(size_t sz) + { + ++allocs; + size += static_cast(sz); + } + void rem(size_t sz) + { + --allocs; + size -= static_cast(sz); + } + Item max(Item const& that) const + { + Item r(*this); + r.allocs = r.allocs > that.allocs ? r.allocs : that.allocs; + r.size = r.size > that.size ? 
r.size : that.size; + return r; + } + }; + + Item curr = {0, 0}; + Item total = {0, 0}; + Item max = {0, 0}; + + void clear_counts() + { + curr = {0, 0}; + total = {0, 0}; + max = {0, 0}; + } + + void update(AllocationCounts const& that) + { + curr.allocs += that.curr.allocs; + curr.size += that.curr.size; + total.allocs += that.total.allocs; + total.size += that.total.size; + max.allocs += that.max.allocs; + max.size += that.max.size; + } + + void add_counts(void* ptr, size_t sz) + { + if(ptr == nullptr) return; + curr.add(sz); + total.add(sz); + max = max.max(curr); + } + + void rem_counts(void *ptr, size_t sz) + { + if(ptr == nullptr) return; + curr.rem(sz); + } + + AllocationCounts operator- (AllocationCounts const& that) const + { + AllocationCounts r(*this); + r.curr.allocs -= that.curr.allocs; + r.curr.size -= that.curr.size; + r.total.allocs -= that.total.allocs; + r.total.size -= that.total.size; + r.max.allocs -= that.max.allocs; + r.max.size -= that.max.size; + return r; + } + + AllocationCounts operator+ (AllocationCounts const& that) const + { + AllocationCounts r(*this); + r.curr.allocs += that.curr.allocs; + r.curr.size += that.curr.size; + r.total.allocs += that.total.allocs; + r.total.size += that.total.size; + r.max.allocs += that.max.allocs; + r.max.size += that.max.size; + return r; + } +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +/** a MemoryResource which latches onto another MemoryResource + * and counts allocations and sizes. + * @ingroup memory_resources */ +class MemoryResourceCounts : public MemoryResource +{ +public: + + MemoryResourceCounts() : m_resource(get_memory_resource()) + { + C4_ASSERT(m_resource != this); + name = "MemoryResourceCounts"; + } + MemoryResourceCounts(MemoryResource *res) : m_resource(res) + { + C4_ASSERT(m_resource != this); + name = "MemoryResourceCounts"; + } + + MemoryResource *resource() { return m_resource; } + AllocationCounts const& counts() const { return m_counts; } + +protected: + + MemoryResource *m_resource; + AllocationCounts m_counts; + +protected: + + virtual void* do_allocate(size_t sz, size_t alignment, void * /*hint*/) override + { + void *ptr = m_resource->allocate(sz, alignment); + m_counts.add_counts(ptr, sz); + return ptr; + } + + virtual void do_deallocate(void* ptr, size_t sz, size_t alignment) override + { + m_counts.rem_counts(ptr, sz); + m_resource->deallocate(ptr, sz, alignment); + } + + virtual void* do_reallocate(void* ptr, size_t oldsz, size_t newsz, size_t alignment) override + { + m_counts.rem_counts(ptr, oldsz); + void* nptr = m_resource->reallocate(ptr, oldsz, newsz, alignment); + m_counts.add_counts(nptr, newsz); + return nptr; + } + +}; + +//----------------------------------------------------------------------------- +/** RAII class which binds a memory resource with a scope duration. + * @ingroup memory_resources */ +struct ScopedMemoryResource +{ + MemoryResource *m_original; + + ScopedMemoryResource(MemoryResource *r) + : + m_original(get_memory_resource()) + { + set_memory_resource(r); + } + + ~ScopedMemoryResource() + { + set_memory_resource(m_original); + } +}; + +//----------------------------------------------------------------------------- +/** RAII class which counts allocations and frees inside a scope. Can + * optionally set also the memory resource to be used. 
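+ * (illustrative usage sketch added for clarity, not from the upstream header)
+ * @code{.cpp}
+ *   {
+ *       c4::ScopedMemoryResourceCounts scope; // counts through the global resource
+ *       // ... code that allocates via the current memory resource ...
+ *       c4::AllocationCounts const &counts = scope.mr.counts();
+ *   } // the previous memory resource is restored here
+ * @endcode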
+ * @ingroup memory_resources */ +struct ScopedMemoryResourceCounts +{ + MemoryResourceCounts mr; + + ScopedMemoryResourceCounts() : mr() + { + set_memory_resource(&mr); + } + ScopedMemoryResourceCounts(MemoryResource *m) : mr(m) + { + set_memory_resource(&mr); + } + ~ScopedMemoryResourceCounts() + { + set_memory_resource(mr.resource()); + } +}; + +} // namespace c4 + +#endif /* _C4_MEMORY_RESOURCE_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/memory_resource.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/ctor_dtor.hpp +// https://github.com/biojppm/c4core/src/c4/ctor_dtor.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_CTOR_DTOR_HPP_ +#define _C4_CTOR_DTOR_HPP_ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/preprocessor.hpp +//#include "c4/preprocessor.hpp" +#if !defined(C4_PREPROCESSOR_HPP_) && !defined(_C4_PREPROCESSOR_HPP_) +#error "amalgamate: file c4/preprocessor.hpp must have been included at this point" +#endif /* C4_PREPROCESSOR_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/language.hpp +//#include "c4/language.hpp" +#if !defined(C4_LANGUAGE_HPP_) && !defined(_C4_LANGUAGE_HPP_) +#error "amalgamate: file c4/language.hpp must have been included at this point" +#endif /* C4_LANGUAGE_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/memory_util.hpp +//#include "c4/memory_util.hpp" +#if !defined(C4_MEMORY_UTIL_HPP_) && !defined(_C4_MEMORY_UTIL_HPP_) +#error "amalgamate: file c4/memory_util.hpp must have been included at this point" +#endif /* C4_MEMORY_UTIL_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/error.hpp +//#include "c4/error.hpp" +#if !defined(C4_ERROR_HPP_) && !defined(_C4_ERROR_HPP_) +#error "amalgamate: file c4/error.hpp must have been included at this point" +#endif /* C4_ERROR_HPP_ */ + + +//included above: +//#include +//included above: +//#include // std::forward + +/** @file ctor_dtor.hpp object construction and destruction facilities. + * Some of these are not yet available in C++11. */ + +namespace c4 { + +/** default-construct an object, trivial version */ +template C4_ALWAYS_INLINE typename std::enable_if::value, void>::type +construct(U *ptr) noexcept +{ + memset(ptr, 0, sizeof(U)); +} +/** default-construct an object, non-trivial version */ +template C4_ALWAYS_INLINE typename std ::enable_if< ! std::is_trivially_default_constructible::value, void>::type +construct(U* ptr) noexcept +{ + new ((void*)ptr) U(); +} + +/** default-construct n objects, trivial version */ +template C4_ALWAYS_INLINE typename std::enable_if::value, void>::type +construct_n(U* ptr, I n) noexcept +{ + memset(ptr, 0, n * sizeof(U)); +} +/** default-construct n objects, non-trivial version */ +template C4_ALWAYS_INLINE typename std::enable_if< ! 
std::is_trivially_default_constructible::value, void>::type +construct_n(U* ptr, I n) noexcept +{ + for(I i = 0; i < n; ++i) + { + new ((void*)(ptr + i)) U(); + } +} + +#ifdef __clang__ +# pragma clang diagnostic push +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# if __GNUC__ >= 6 +# pragma GCC diagnostic ignored "-Wnull-dereference" +# endif +#endif + +template +inline void construct(U* ptr, Args&&... args) +{ + new ((void*)ptr) U(std::forward(args)...); +} +template +inline void construct_n(U* ptr, I n, Args&&... args) +{ + for(I i = 0; i < n; ++i) + { + new ((void*)(ptr + i)) U(args...); + } +} + +#ifdef __clang__ +# pragma clang diagnostic pop +#elif defined(__GNUC__) +# pragma GCC diagnostic pop +#endif + + +//----------------------------------------------------------------------------- +// copy-construct + +template C4_ALWAYS_INLINE typename std::enable_if::value, void>::type +copy_construct(U* dst, U const* src) noexcept +{ + C4_ASSERT(dst != src); + memcpy(dst, src, sizeof(U)); +} +template C4_ALWAYS_INLINE typename std::enable_if< ! std::is_trivially_copy_constructible::value, void>::type +copy_construct(U* dst, U const* src) +{ + C4_ASSERT(dst != src); + new ((void*)dst) U(*src); +} +template C4_ALWAYS_INLINE typename std::enable_if::value, void>::type +copy_construct_n(U* dst, U const* src, I n) noexcept +{ + C4_ASSERT(dst != src); + memcpy(dst, src, n * sizeof(U)); +} +template C4_ALWAYS_INLINE typename std::enable_if< ! std::is_trivially_copy_constructible::value, void>::type +copy_construct_n(U* dst, U const* src, I n) +{ + C4_ASSERT(dst != src); + for(I i = 0; i < n; ++i) + { + new ((void*)(dst + i)) U(*(src + i)); + } +} + +template C4_ALWAYS_INLINE typename std::enable_if::value, void>::type +copy_construct(U* dst, U src) noexcept // pass by value for scalar types +{ + *dst = src; +} +template C4_ALWAYS_INLINE typename std::enable_if< ! std::is_scalar::value, void>::type +copy_construct(U* dst, U const& src) // pass by reference for non-scalar types +{ + C4_ASSERT(dst != &src); + new ((void*)dst) U(src); +} +template C4_ALWAYS_INLINE typename std::enable_if::value, void>::type +copy_construct_n(U* dst, U src, I n) noexcept // pass by value for scalar types +{ + for(I i = 0; i < n; ++i) + { + dst[i] = src; + } +} +template C4_ALWAYS_INLINE typename std::enable_if< ! std::is_scalar::value, void>::type +copy_construct_n(U* dst, U const& src, I n) // pass by reference for non-scalar types +{ + C4_ASSERT(dst != &src); + for(I i = 0; i < n; ++i) + { + new ((void*)(dst + i)) U(src); + } +} + +template +C4_ALWAYS_INLINE void copy_construct(U (&dst)[N], U const (&src)[N]) noexcept +{ + copy_construct_n(dst, src, N); +} + +//----------------------------------------------------------------------------- +// copy-assign + +template C4_ALWAYS_INLINE typename std::enable_if::value, void>::type +copy_assign(U* dst, U const* src) noexcept +{ + C4_ASSERT(dst != src); + memcpy(dst, src, sizeof(U)); +} +template C4_ALWAYS_INLINE typename std::enable_if< ! std::is_trivially_copy_assignable::value, void>::type +copy_assign(U* dst, U const* src) noexcept +{ + C4_ASSERT(dst != src); + *dst = *src; +} +template C4_ALWAYS_INLINE typename std::enable_if::value, void>::type +copy_assign_n(U* dst, U const* src, I n) noexcept +{ + C4_ASSERT(dst != src); + memcpy(dst, src, n * sizeof(U)); +} +template C4_ALWAYS_INLINE typename std::enable_if< ! 
std::is_trivially_copy_assignable::value, void>::type +copy_assign_n(U* dst, U const* src, I n) noexcept +{ + C4_ASSERT(dst != src); + for(I i = 0; i < n; ++i) + { + dst[i] = src[i]; + } +} + +template C4_ALWAYS_INLINE typename std::enable_if::value, void>::type +copy_assign(U* dst, U src) noexcept // pass by value for scalar types +{ + *dst = src; +} +template C4_ALWAYS_INLINE typename std::enable_if< ! std::is_scalar::value, void>::type +copy_assign(U* dst, U const& src) noexcept // pass by reference for non-scalar types +{ + C4_ASSERT(dst != &src); + *dst = src; +} +template C4_ALWAYS_INLINE typename std::enable_if::value, void>::type +copy_assign_n(U* dst, U src, I n) noexcept // pass by value for scalar types +{ + for(I i = 0; i < n; ++i) + { + dst[i] = src; + } +} +template C4_ALWAYS_INLINE typename std::enable_if< ! std::is_scalar::value, void>::type +copy_assign_n(U* dst, U const& src, I n) noexcept // pass by reference for non-scalar types +{ + C4_ASSERT(dst != &src); + for(I i = 0; i < n; ++i) + { + dst[i] = src; + } +} + +template +C4_ALWAYS_INLINE void copy_assign(U (&dst)[N], U const (&src)[N]) noexcept +{ + copy_assign_n(dst, src, N); +} + +//----------------------------------------------------------------------------- +// move-construct + +template C4_ALWAYS_INLINE typename std::enable_if::value, void>::type +move_construct(U* dst, U* src) noexcept +{ + C4_ASSERT(dst != src); + memcpy(dst, src, sizeof(U)); +} +template C4_ALWAYS_INLINE typename std::enable_if< ! std::is_trivially_move_constructible::value, void>::type +move_construct(U* dst, U* src) noexcept +{ + C4_ASSERT(dst != src); + new ((void*)dst) U(std::move(*src)); +} +template C4_ALWAYS_INLINE typename std::enable_if::value, void>::type +move_construct_n(U* dst, U* src, I n) noexcept +{ + C4_ASSERT(dst != src); + memcpy(dst, src, n * sizeof(U)); +} +template C4_ALWAYS_INLINE typename std::enable_if< ! std::is_trivially_move_constructible::value, void>::type +move_construct_n(U* dst, U* src, I n) noexcept +{ + C4_ASSERT(dst != src); + for(I i = 0; i < n; ++i) + { + new ((void*)(dst + i)) U(std::move(src[i])); + } +} + +//----------------------------------------------------------------------------- +// move-assign + +template C4_ALWAYS_INLINE typename std::enable_if::value, void>::type +move_assign(U* dst, U* src) noexcept +{ + C4_ASSERT(dst != src); + memcpy(dst, src, sizeof(U)); +} +template C4_ALWAYS_INLINE typename std::enable_if< ! std::is_trivially_move_assignable::value, void>::type +move_assign(U* dst, U* src) noexcept +{ + C4_ASSERT(dst != src); + *dst = std::move(*src); +} +template C4_ALWAYS_INLINE typename std::enable_if::value, void>::type +move_assign_n(U* dst, U* src, I n) noexcept +{ + C4_ASSERT(dst != src); + memcpy(dst, src, n * sizeof(U)); +} +template C4_ALWAYS_INLINE typename std::enable_if< ! std::is_trivially_move_assignable::value, void>::type +move_assign_n(U* dst, U* src, I n) noexcept +{ + C4_ASSERT(dst != src); + for(I i = 0; i < n; ++i) + { + *(dst + i) = std::move(*(src + i)); + } +} + +//----------------------------------------------------------------------------- +// destroy + +template C4_ALWAYS_INLINE typename std::enable_if::value, void>::type +destroy(U* ptr) noexcept +{ + C4_UNUSED(ptr); // nothing to do +} +template C4_ALWAYS_INLINE typename std::enable_if< ! 
std::is_trivially_destructible::value, void>::type +destroy(U* ptr) noexcept +{ + ptr->~U(); +} +template C4_ALWAYS_INLINE typename std::enable_if::value, void>::type +destroy_n(U* ptr, I n) noexcept +{ + C4_UNUSED(ptr); + C4_UNUSED(n); // nothing to do +} +template C4_ALWAYS_INLINE typename std::enable_if< ! std::is_trivially_destructible::value, void>::type +destroy_n(U* ptr, I n) noexcept +{ + for(I i = 0; i C4_ALWAYS_INLINE typename std::enable_if::value, void>::type +make_room(U *buf, I bufsz, I room) C4_NOEXCEPT_A +{ + C4_ASSERT(bufsz >= 0 && room >= 0); + if(room >= bufsz) + { + memcpy (buf + room, buf, bufsz * sizeof(U)); + } + else + { + memmove(buf + room, buf, bufsz * sizeof(U)); + } +} +/** makes room at the beginning of buf, which has a current size of bufsz */ +template C4_ALWAYS_INLINE typename std::enable_if< ! std::is_trivially_move_constructible::value, void>::type +make_room(U *buf, I bufsz, I room) C4_NOEXCEPT_A +{ + C4_ASSERT(bufsz >= 0 && room >= 0); + if(room >= bufsz) + { + for(I i = 0; i < bufsz; ++i) + { + new ((void*)(buf + (i + room))) U(std::move(buf[i])); + } + } + else + { + for(I i = 0; i < bufsz; ++i) + { + I w = bufsz-1 - i; // do a backwards loop + new ((void*)(buf + (w + room))) U(std::move(buf[w])); + } + } +} + +/** make room to the right of pos */ +template +C4_ALWAYS_INLINE void make_room(U *buf, I bufsz, I currsz, I pos, I room) +{ + C4_ASSERT(pos >= 0 && pos <= currsz); + C4_ASSERT(currsz <= bufsz); + C4_ASSERT(room + currsz <= bufsz); + C4_UNUSED(bufsz); + make_room(buf + pos, currsz - pos, room); +} + + +/** make room to the right of pos, copying to the beginning of a different buffer */ +template C4_ALWAYS_INLINE typename std::enable_if::value, void>::type +make_room(U *dst, U const* src, I srcsz, I room, I pos) C4_NOEXCEPT_A +{ + C4_ASSERT(srcsz >= 0 && room >= 0 && pos >= 0); + C4_ASSERT(pos < srcsz || (pos == 0 && srcsz == 0)); + memcpy(dst , src , pos * sizeof(U)); + memcpy(dst + room + pos, src + pos, (srcsz - pos) * sizeof(U)); +} +/** make room to the right of pos, copying to the beginning of a different buffer */ +template C4_ALWAYS_INLINE typename std::enable_if< ! 
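+// (note added for clarity, not from the upstream header) for the in-place
+// overloads above: with buf = {a,b,c} and room == 2, make_room(buf, 3, 2)
+// shifts the three elements two slots to the right, giving {_,_,a,b,c}; the
+// vacated slots are then constructed or assigned by the caller.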
std::is_trivially_move_constructible::value, void>::type +make_room(U *dst, U const* src, I srcsz, I room, I pos) +{ + C4_ASSERT(srcsz >= 0 && room >= 0 && pos >= 0); + C4_ASSERT(pos < srcsz || (pos == 0 && srcsz == 0)); + for(I i = 0; i < pos; ++i) + { + new ((void*)(dst + i)) U(std::move(src[i])); + } + src += pos; + dst += room + pos; + for(I i = 0, e = srcsz - pos; i < e; ++i) + { + new ((void*)(dst + i)) U(std::move(src[i])); + } +} + +template +C4_ALWAYS_INLINE void make_room +( + U * dst, I dstsz, + U const* src, I srcsz, + I room, I pos +) +{ + C4_ASSERT(pos >= 0 && pos < srcsz || (srcsz == 0 && pos == 0)); + C4_ASSERT(pos >= 0 && pos < dstsz || (dstsz == 0 && pos == 0)); + C4_ASSERT(srcsz+room <= dstsz); + C4_UNUSED(dstsz); + make_room(dst, src, srcsz, room, pos); +} + + +//----------------------------------------------------------------------------- +/** destroy room at the beginning of buf, which has a current size of n */ +template C4_ALWAYS_INLINE typename std::enable_if::value || (std::is_standard_layout::value && std::is_trivial::value), void>::type +destroy_room(U *buf, I n, I room) C4_NOEXCEPT_A +{ + C4_ASSERT(n >= 0 && room >= 0); + C4_ASSERT(room <= n); + if(room < n) + { + memmove(buf, buf + room, (n - room) * sizeof(U)); + } + else + { + // nothing to do - no need to destroy scalar types + } +} +/** destroy room at the beginning of buf, which has a current size of n */ +template C4_ALWAYS_INLINE typename std::enable_if< ! (std::is_scalar::value || (std::is_standard_layout::value && std::is_trivial::value)), void>::type +destroy_room(U *buf, I n, I room) +{ + C4_ASSERT(n >= 0 && room >= 0); + C4_ASSERT(room <= n); + if(room < n) + { + for(I i = 0, e = n - room; i < e; ++i) + { + buf[i] = std::move(buf[i + room]); + } + } + else + { + for(I i = 0; i < n; ++i) + { + buf[i].~U(); + } + } +} + +/** destroy room to the right of pos, copying to a different buffer */ +template C4_ALWAYS_INLINE typename std::enable_if::value, void>::type +destroy_room(U *dst, U const* src, I n, I room, I pos) C4_NOEXCEPT_A +{ + C4_ASSERT(n >= 0 && room >= 0 && pos >= 0); + C4_ASSERT(pos C4_ALWAYS_INLINE typename std::enable_if< ! 
std::is_trivially_move_constructible::value, void>::type +destroy_room(U *dst, U const* src, I n, I room, I pos) +{ + C4_ASSERT(n >= 0 && room >= 0 && pos >= 0); + C4_ASSERT(pos < n); + C4_ASSERT(pos + room <= n); + for(I i = 0; i < pos; ++i) + { + new ((void*)(dst + i)) U(std::move(src[i])); + } + src += room + pos; + dst += pos; + for(I i = 0, e = n - pos - room; i < e; ++i) + { + new ((void*)(dst + i)) U(std::move(src[i])); + } +} + +} // namespace c4 + +#undef _C4REQUIRE + +#endif /* _C4_CTOR_DTOR_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/ctor_dtor.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/allocator.hpp +// https://github.com/biojppm/c4core/src/c4/allocator.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_ALLOCATOR_HPP_ +#define _C4_ALLOCATOR_HPP_ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/memory_resource.hpp +//#include "c4/memory_resource.hpp" +#if !defined(C4_MEMORY_RESOURCE_HPP_) && !defined(_C4_MEMORY_RESOURCE_HPP_) +#error "amalgamate: file c4/memory_resource.hpp must have been included at this point" +#endif /* C4_MEMORY_RESOURCE_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/ctor_dtor.hpp +//#include "c4/ctor_dtor.hpp" +#if !defined(C4_CTOR_DTOR_HPP_) && !defined(_C4_CTOR_DTOR_HPP_) +#error "amalgamate: file c4/ctor_dtor.hpp must have been included at this point" +#endif /* C4_CTOR_DTOR_HPP_ */ + + +#include // std::allocator_traits +//included above: +//#include + +/** @file allocator.hpp Contains classes to make typeful allocations (note + * that memory resources are typeless) */ + +/** @defgroup mem_res_providers Memory resource providers + * @brief Policy classes which provide a memory resource for + * use in an allocator. + * @ingroup memory + */ + +/** @defgroup allocators Allocators + * @brief Lightweight classes that act as handles to specific memory + * resources and provide typeful memory. + * @ingroup memory + */ + +namespace c4 { + +namespace detail { +template inline size_t size_for (size_t num_objs) noexcept { return num_objs * sizeof(T); } +template< > inline size_t size_for(size_t num_objs) noexcept { return num_objs; } +} // namespace detail + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** provides a per-allocator memory resource + * @ingroup mem_res_providers */ +class MemRes +{ +public: + + MemRes() : m_resource(get_memory_resource()) {} + MemRes(MemoryResource* r) noexcept : m_resource(r ? 
r : get_memory_resource()) {} + + inline MemoryResource* resource() const { return m_resource; } + +private: + + MemoryResource* m_resource; + +}; + + +/** the allocators using this will default to the global memory resource + * @ingroup mem_res_providers */ +class MemResGlobal +{ +public: + + MemResGlobal() {} + MemResGlobal(MemoryResource* r) noexcept { C4_UNUSED(r); C4_ASSERT(r == get_memory_resource()); } + + inline MemoryResource* resource() const { return get_memory_resource(); } +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +namespace detail { +template +struct _AllocatorUtil; + +template +struct has_no_alloc + : public std::integral_constant::value) + && std::is_constructible::value> {}; + +// std::uses_allocator_v && std::is_constructible +// ie can construct(std::allocator_arg_t, MemoryResource*, Args...) +template +struct has_alloc_arg + : public std::integral_constant::value + && std::is_constructible::value> {}; +// std::uses_allocator && std::is_constructible +// ie, can construct(Args..., MemoryResource*) +template +struct has_alloc + : public std::integral_constant::value + && std::is_constructible::value> {}; + +} // namespace detail + + +template +struct detail::_AllocatorUtil : public MemRes +{ + using MemRes::MemRes; + + /** for construct: + * @see http://en.cppreference.com/w/cpp/experimental/polymorphic_allocator/construct */ + + // 1. types with no allocators + template + C4_ALWAYS_INLINE typename std::enable_if::value, void>::type + construct(U *ptr, Args &&...args) + { + c4::construct(ptr, std::forward(args)...); + } + template + C4_ALWAYS_INLINE typename std::enable_if::value, void>::type + construct_n(U* ptr, I n, Args&&... args) + { + c4::construct_n(ptr, n, std::forward(args)...); + } + + // 2. types using allocators (ie, containers) + + // 2.1. can construct(std::allocator_arg_t, MemoryResource*, Args...) + template + C4_ALWAYS_INLINE typename std::enable_if::value, void>::type + construct(U* ptr, Args&&... args) + { + c4::construct(ptr, std::allocator_arg, this->resource(), std::forward(args)...); + } + template + C4_ALWAYS_INLINE typename std::enable_if::value, void>::type + construct_n(U* ptr, I n, Args&&... args) + { + c4::construct_n(ptr, n, std::allocator_arg, this->resource(), std::forward(args)...); + } + + // 2.2. can construct(Args..., MemoryResource*) + template + C4_ALWAYS_INLINE typename std::enable_if::value, void>::type + construct(U* ptr, Args&&... args) + { + c4::construct(ptr, std::forward(args)..., this->resource()); + } + template + C4_ALWAYS_INLINE typename std::enable_if::value, void>::type + construct_n(U* ptr, I n, Args&&... args) + { + c4::construct_n(ptr, n, std::forward(args)..., this->resource()); + } + + template + static C4_ALWAYS_INLINE void destroy(U* ptr) + { + c4::destroy(ptr); + } + template + static C4_ALWAYS_INLINE void destroy_n(U* ptr, I n) + { + c4::destroy_n(ptr, n); + } +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** An allocator is simply a proxy to a memory resource. 
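+ * (illustrative usage sketch added for clarity, not from the upstream header)
+ * @code{.cpp}
+ *   std::vector<int, c4::Allocator<int>> v; // allocates through the global memory resource
+ *   v.assign(100, 0);
+ * @endcode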
+ * @param T + * @param MemResProvider + * @ingroup allocators */ +template +class Allocator : public detail::_AllocatorUtil +{ +public: + + using impl_type = detail::_AllocatorUtil; + + using value_type = T; + using pointer = T*; + using const_pointer = T const*; + using reference = T&; + using const_reference = T const&; + using size_type = size_t; + using difference_type = std::ptrdiff_t; + using propagate_on_container_move_assigment = std::true_type; + +public: + + template + bool operator== (Allocator const& that) const + { + return this->resource() == that.resource(); + } + template + bool operator!= (Allocator const& that) const + { + return this->resource() != that.resource(); + } + +public: + + template friend class Allocator; + template + struct rebind + { + using other = Allocator; + }; + template + typename rebind::other rebound() + { + return typename rebind::other(*this); + } + +public: + + using impl_type::impl_type; + Allocator() : impl_type() {} // VS demands this + + template Allocator(Allocator const& that) : impl_type(that.resource()) {} + + Allocator(Allocator const&) = default; + Allocator(Allocator &&) = default; + + Allocator& operator= (Allocator const&) = default; // WTF? why? @see http://en.cppreference.com/w/cpp/memory/polymorphic_allocator + Allocator& operator= (Allocator &&) = default; + + /** returns a default-constructed polymorphic allocator object + * @see http://en.cppreference.com/w/cpp/memory/polymorphic_allocator/select_on_container_copy_construction */ + Allocator select_on_container_copy_construct() const { return Allocator(*this); } + + T* allocate(size_t num_objs, size_t alignment=alignof(T)) + { + C4_ASSERT(this->resource() != nullptr); + C4_ASSERT(alignment >= alignof(T)); + void* vmem = this->resource()->allocate(detail::size_for(num_objs), alignment); + T* mem = static_cast(vmem); + return mem; + } + + void deallocate(T * ptr, size_t num_objs, size_t alignment=alignof(T)) + { + C4_ASSERT(this->resource() != nullptr); + C4_ASSERT(alignment>= alignof(T)); + this->resource()->deallocate(ptr, detail::size_for(num_objs), alignment); + } + + T* reallocate(T* ptr, size_t oldnum, size_t newnum, size_t alignment=alignof(T)) + { + C4_ASSERT(this->resource() != nullptr); + C4_ASSERT(alignment >= alignof(T)); + void* vmem = this->resource()->reallocate(ptr, detail::size_for(oldnum), detail::size_for(newnum), alignment); + T* mem = static_cast(vmem); + return mem; + } + +}; + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** @ingroup allocators */ +template +class SmallAllocator : public detail::_AllocatorUtil +{ + static_assert(Alignment >= alignof(T), "invalid alignment"); + + using impl_type = detail::_AllocatorUtil; + + alignas(Alignment) char m_arr[N * sizeof(T)]; + size_t m_num{0}; + +public: + + using value_type = T; + using pointer = T*; + using const_pointer = T const*; + using reference = T&; + using const_reference = T const&; + using size_type = size_t; + using difference_type = std::ptrdiff_t; + using propagate_on_container_move_assigment = std::true_type; + + template + bool operator== (SmallAllocator const&) const + { + return false; + } + template + bool operator!= (SmallAllocator const&) const + { + return true; + } + +public: + + template friend class SmallAllocator; + template + struct rebind + { + using other = SmallAllocator; + }; + 
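+    // note: allocate() serves the first N objects from the inline m_arr buffer,
+    // falling back to the memory resource once that capacity is exceeded;
+    // deallocate() forwards to the resource only for pointers outside m_arr.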
template + typename rebind::other rebound() + { + return typename rebind::other(*this); + } + +public: + + using impl_type::impl_type; + SmallAllocator() : impl_type() {} // VS demands this + + template + SmallAllocator(SmallAllocator const& that) : impl_type(that.resource()) + { + C4_ASSERT(that.m_num == 0); + } + + SmallAllocator(SmallAllocator const&) = default; + SmallAllocator(SmallAllocator &&) = default; + + SmallAllocator& operator= (SmallAllocator const&) = default; // WTF? why? @see http://en.cppreference.com/w/cpp/memory/polymorphic_allocator + SmallAllocator& operator= (SmallAllocator &&) = default; + + /** returns a default-constructed polymorphic allocator object + * @see http://en.cppreference.com/w/cpp/memory/polymorphic_allocator/select_on_container_copy_construction */ + SmallAllocator select_on_container_copy_construct() const { return SmallAllocator(*this); } + + T* allocate(size_t num_objs, size_t alignment=Alignment) + { + C4_ASSERT(this->resource() != nullptr); + C4_ASSERT(alignment >= alignof(T)); + void *vmem; + if(m_num + num_objs <= N) + { + vmem = (m_arr + m_num * sizeof(T)); + } + else + { + vmem = this->resource()->allocate(num_objs * sizeof(T), alignment); + } + m_num += num_objs; + T *mem = static_cast(vmem); + return mem; + } + + void deallocate(T * ptr, size_t num_objs, size_t alignment=Alignment) + { + C4_ASSERT(m_num >= num_objs); + m_num -= num_objs; + if((char*)ptr >= m_arr && (char*)ptr < m_arr + (N * sizeof(T))) + { + return; + } + C4_ASSERT(this->resource() != nullptr); + C4_ASSERT(alignment >= alignof(T)); + this->resource()->deallocate(ptr, num_objs * sizeof(T), alignment); + } + + T* reallocate(T * ptr, size_t oldnum, size_t newnum, size_t alignment=Alignment) + { + C4_ASSERT(this->resource() != nullptr); + C4_ASSERT(alignment >= alignof(T)); + if(oldnum <= N && newnum <= N) + { + return m_arr; + } + else if(oldnum <= N && newnum > N) + { + return allocate(newnum, alignment); + } + else if(oldnum > N && newnum <= N) + { + deallocate(ptr, oldnum, alignment); + return m_arr; + } + void* vmem = this->resource()->reallocate(ptr, oldnum * sizeof(T), newnum * sizeof(T), alignment); + T* mem = static_cast(vmem); + return mem; + } + +}; + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** An allocator making use of the global memory resource. 
+ * @ingroup allocators */ +template using allocator = Allocator; +/** An allocator with a per-instance memory resource + * @ingroup allocators */ +template using allocator_mr = Allocator; + +/** @ingroup allocators */ +template using small_allocator = SmallAllocator; +/** @ingroup allocators */ +template using small_allocator_mr = SmallAllocator; + +} // namespace c4 + +#endif /* _C4_ALLOCATOR_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/allocator.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/char_traits.hpp +// https://github.com/biojppm/c4core/src/c4/char_traits.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_CHAR_TRAITS_HPP_ +#define _C4_CHAR_TRAITS_HPP_ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/config.hpp +//#include "c4/config.hpp" +#if !defined(C4_CONFIG_HPP_) && !defined(_C4_CONFIG_HPP_) +#error "amalgamate: file c4/config.hpp must have been included at this point" +#endif /* C4_CONFIG_HPP_ */ + + +#include // needed because of std::char_traits +#include +#include + +namespace c4 { + +C4_ALWAYS_INLINE bool isspace(char c) { return std::isspace(c) != 0; } +C4_ALWAYS_INLINE bool isspace(wchar_t c) { return std::iswspace(static_cast(c)) != 0; } + +//----------------------------------------------------------------------------- +template +struct char_traits; + +template<> +struct char_traits : public std::char_traits +{ + constexpr static const char whitespace_chars[] = " \f\n\r\t\v"; + constexpr static const size_t num_whitespace_chars = sizeof(whitespace_chars) - 1; +}; + +template<> +struct char_traits : public std::char_traits +{ + constexpr static const wchar_t whitespace_chars[] = L" \f\n\r\t\v"; + constexpr static const size_t num_whitespace_chars = sizeof(whitespace_chars) - 1; +}; + + +//----------------------------------------------------------------------------- +namespace detail { +template +struct needed_chars; +template<> +struct needed_chars +{ + template + C4_ALWAYS_INLINE constexpr static SizeType for_bytes(SizeType num_bytes) + { + return num_bytes; + } +}; +template<> +struct needed_chars +{ + template + C4_ALWAYS_INLINE constexpr static SizeType for_bytes(SizeType num_bytes) + { + // wchar_t is not necessarily 2 bytes. + return (num_bytes / static_cast(sizeof(wchar_t))) + ((num_bytes & static_cast(SizeType(sizeof(wchar_t)) - SizeType(1))) != 0); + } +}; +} // namespace detail + +/** get the number of C characters needed to store a number of bytes */ +template +C4_ALWAYS_INLINE constexpr SizeType num_needed_chars(SizeType num_bytes) +{ + return detail::needed_chars::for_bytes(num_bytes); +} + + +//----------------------------------------------------------------------------- + +/** get the given text string as either char or wchar_t according to the given type */ +#define C4_TXTTY(txt, type) \ + /* is there a smarter way to do this? 
*/\ + c4::detail::literal_as::get(txt, C4_WIDEN(txt)) + +namespace detail { +template +struct literal_as; + +template<> +struct literal_as +{ + C4_ALWAYS_INLINE static constexpr const char* get(const char* str, const wchar_t *) + { + return str; + } +}; +template<> +struct literal_as +{ + C4_ALWAYS_INLINE static constexpr const wchar_t* get(const char*, const wchar_t *wstr) + { + return wstr; + } +}; +} // namespace detail + +} // namespace c4 + +#endif /* _C4_CHAR_TRAITS_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/char_traits.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/hash.hpp +// https://github.com/biojppm/c4core/src/c4/hash.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_HASH_HPP_ +#define _C4_HASH_HPP_ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/config.hpp +//#include "c4/config.hpp" +#if !defined(C4_CONFIG_HPP_) && !defined(_C4_CONFIG_HPP_) +#error "amalgamate: file c4/config.hpp must have been included at this point" +#endif /* C4_CONFIG_HPP_ */ + +#include + +/** @file hash.hpp */ + +/** @defgroup hash Hash utils + * @see http://aras-p.info/blog/2016/08/02/Hash-Functions-all-the-way-down/ */ + +namespace c4 { + +namespace detail { + +/** @internal + * @ingroup hash + * @see this was taken a great answer in stackoverflow: + * https://stackoverflow.com/a/34597785/5875572 + * @see http://aras-p.info/blog/2016/08/02/Hash-Functions-all-the-way-down/ */ +template +class basic_fnv1a final +{ + + static_assert(std::is_unsigned::value, "need unsigned integer"); + +public: + + using result_type = ResultT; + +private: + + result_type state_ {}; + +public: + + C4_CONSTEXPR14 basic_fnv1a() noexcept : state_ {OffsetBasis} {} + + C4_CONSTEXPR14 void update(const void *const data, const size_t size) noexcept + { + auto cdata = static_cast(data); + auto acc = this->state_; + for(size_t i = 0; i < size; ++i) + { + const auto next = size_t(cdata[i]); + acc = (acc ^ next) * Prime; + } + this->state_ = acc; + } + + C4_CONSTEXPR14 result_type digest() const noexcept + { + return this->state_; + } + +}; + +using fnv1a_32 = basic_fnv1a; +using fnv1a_64 = basic_fnv1a; + +template struct fnv1a; +template<> struct fnv1a<32> { using type = fnv1a_32; }; +template<> struct fnv1a<64> { using type = fnv1a_64; }; + +} // namespace detail + + +/** @ingroup hash */ +template +using fnv1a_t = typename detail::fnv1a::type; + + +/** @ingroup hash */ +C4_CONSTEXPR14 inline size_t hash_bytes(const void *const data, const size_t size) noexcept +{ + fnv1a_t fn{}; + fn.update(data, size); + return fn.digest(); +} + +/** + * @overload hash_bytes + * @ingroup hash */ +template +C4_CONSTEXPR14 inline size_t hash_bytes(const char (&str)[N]) noexcept +{ + fnv1a_t fn{}; + fn.update(str, N); + return fn.digest(); +} + +} // namespace c4 + + +#endif // _C4_HASH_HPP_ + + +// (end https://github.com/biojppm/c4core/src/c4/hash.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/szconv.hpp +// https://github.com/biojppm/c4core/src/c4/szconv.hpp +//-------------------------------------------------------------------------------- 
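+
+// Usage sketch for the hashing helpers above (illustrative only):
+//
+//   const char key[] = "example";
+//   size_t h1 = c4::hash_bytes(key);               // array overload, hashes sizeof(key) bytes
+//   size_t h2 = c4::hash_bytes(key, sizeof(key));  // pointer+size overload, same result
+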
+//******************************************************************************** + +#ifndef _C4_SZCONV_HPP_ +#define _C4_SZCONV_HPP_ + +/** @file szconv.hpp utilities to deal safely with narrowing conversions */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/config.hpp +//#include "c4/config.hpp" +#if !defined(C4_CONFIG_HPP_) && !defined(_C4_CONFIG_HPP_) +#error "amalgamate: file c4/config.hpp must have been included at this point" +#endif /* C4_CONFIG_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/error.hpp +//#include "c4/error.hpp" +#if !defined(C4_ERROR_HPP_) && !defined(_C4_ERROR_HPP_) +#error "amalgamate: file c4/error.hpp must have been included at this point" +#endif /* C4_ERROR_HPP_ */ + + +#include + +namespace c4 { + +/** @todo this would be so much easier with calls to numeric_limits::max()... */ +template +struct is_narrower_size : std::conditional +< + (std::is_signed::value == std::is_signed::value) + ? + (sizeof(SizeOut) < sizeof(SizeIn)) + : + ( + (sizeof(SizeOut) < sizeof(SizeIn)) + || + ( + (sizeof(SizeOut) == sizeof(SizeIn)) + && + (std::is_signed::value && std::is_unsigned::value) + ) + ), + std::true_type, + std::false_type +>::type +{ + static_assert(std::is_integral::value, "must be integral type"); + static_assert(std::is_integral::value, "must be integral type"); +}; + + +/** when SizeOut is wider than SizeIn, assignment can occur without reservations */ +template +C4_ALWAYS_INLINE +typename std::enable_if< ! is_narrower_size::value, SizeOut>::type +szconv(SizeIn sz) noexcept +{ + return static_cast(sz); +} + +/** when SizeOut is narrower than SizeIn, narrowing will occur, so we check + * for overflow. Note that this check is done only if C4_XASSERT is enabled. + * @see C4_XASSERT */ +template +C4_ALWAYS_INLINE +typename std::enable_if::value, SizeOut>::type +szconv(SizeIn sz) C4_NOEXCEPT_X +{ + C4_XASSERT(sz >= 0); + C4_XASSERT_MSG((SizeIn)sz <= (SizeIn)std::numeric_limits::max(), "size conversion overflow: in=%zu", (size_t)sz); + SizeOut szo = static_cast(sz); + return szo; +} + +} // namespace c4 + +#endif /* _C4_SZCONV_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/szconv.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/blob.hpp +// https://github.com/biojppm/c4core/src/c4/blob.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_BLOB_HPP_ +#define _C4_BLOB_HPP_ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/types.hpp +//#include "c4/types.hpp" +#if !defined(C4_TYPES_HPP_) && !defined(_C4_TYPES_HPP_) +#error "amalgamate: file c4/types.hpp must have been included at this point" +#endif /* C4_TYPES_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/error.hpp +//#include "c4/error.hpp" +#if !defined(C4_ERROR_HPP_) && !defined(_C4_ERROR_HPP_) +#error "amalgamate: file c4/error.hpp must have been included at this point" +#endif /* C4_ERROR_HPP_ */ + + +/** @file blob.hpp Mutable and immutable binary data blobs. 
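+ *
+ * Usage sketch (illustrative only):
+ * @code
+ * int data[4] = {1, 2, 3, 4};
+ * c4::cblob ro(data);   // immutable view: ro.buf covers the 16 bytes of data
+ * c4::blob  rw(data);   // mutable view over the same bytes
+ * @endcode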
+*/ + +namespace c4 { + +template +struct blob_ +{ + T * buf; + size_t len; + + C4_ALWAYS_INLINE blob_() noexcept : buf(), len() {} + + C4_ALWAYS_INLINE blob_(blob_ const& that) noexcept = default; + C4_ALWAYS_INLINE blob_(blob_ && that) noexcept = default; + C4_ALWAYS_INLINE blob_& operator=(blob_ && that) noexcept = default; + C4_ALWAYS_INLINE blob_& operator=(blob_ const& that) noexcept = default; + + // need to sfinae out copy constructors! (why? isn't the above sufficient?) + #define _C4_REQUIRE_NOT_SAME class=typename std::enable_if<( ! std::is_same::value) && ( ! std::is_pointer::value), T>::type + template C4_ALWAYS_INLINE blob_(U &var) noexcept : buf(reinterpret_cast(&var)), len(sizeof(U)) {} + template C4_ALWAYS_INLINE blob_& operator= (U &var) noexcept { buf = reinterpret_cast(&var); len = sizeof(U); return *this; } + #undef _C4_REQUIRE_NOT_SAME + + template C4_ALWAYS_INLINE blob_(U (&arr)[N]) noexcept : buf(reinterpret_cast(arr)), len(sizeof(U) * N) {} + template C4_ALWAYS_INLINE blob_& operator= (U (&arr)[N]) noexcept { buf = reinterpret_cast(arr); len = sizeof(U) * N; return *this; } + + template + C4_ALWAYS_INLINE blob_(U *ptr, size_t n) noexcept : buf(reinterpret_cast(ptr)), len(sizeof(U) * n) { C4_ASSERT(is_aligned(ptr)); } + C4_ALWAYS_INLINE blob_(void *ptr, size_t n) noexcept : buf(reinterpret_cast(ptr)), len(n) {} + C4_ALWAYS_INLINE blob_(void const *ptr, size_t n) noexcept : buf(reinterpret_cast(ptr)), len(n) {} +}; + +/** an immutable binary blob */ +using cblob = blob_; +/** a mutable binary blob */ +using blob = blob_< byte>; + +C4_MUST_BE_TRIVIAL_COPY(blob); +C4_MUST_BE_TRIVIAL_COPY(cblob); + +} // namespace c4 + +#endif // _C4_BLOB_HPP_ + + +// (end https://github.com/biojppm/c4core/src/c4/blob.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/substr_fwd.hpp +// https://github.com/biojppm/c4core/src/c4/substr_fwd.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_SUBSTR_FWD_HPP_ +#define _C4_SUBSTR_FWD_HPP_ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/export.hpp +//#include "c4/export.hpp" +#if !defined(C4_EXPORT_HPP_) && !defined(_C4_EXPORT_HPP_) +#error "amalgamate: file c4/export.hpp must have been included at this point" +#endif /* C4_EXPORT_HPP_ */ + + +namespace c4 { + +#ifndef DOXYGEN +template struct basic_substring; +using csubstr = C4CORE_EXPORT basic_substring; +using substr = C4CORE_EXPORT basic_substring; +#endif // !DOXYGEN + +} // namespace c4 + +#endif /* _C4_SUBSTR_FWD_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/substr_fwd.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/substr.hpp +// https://github.com/biojppm/c4core/src/c4/substr.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_SUBSTR_HPP_ +#define _C4_SUBSTR_HPP_ + +/** @file substr.hpp read+write string views */ + +//included above: +//#include +//included above: +//#include +//included above: +//#include + +// amalgamate: removed include of +// 
https://github.com/biojppm/c4core/src/c4/config.hpp +//#include "c4/config.hpp" +#if !defined(C4_CONFIG_HPP_) && !defined(_C4_CONFIG_HPP_) +#error "amalgamate: file c4/config.hpp must have been included at this point" +#endif /* C4_CONFIG_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/error.hpp +//#include "c4/error.hpp" +#if !defined(C4_ERROR_HPP_) && !defined(_C4_ERROR_HPP_) +#error "amalgamate: file c4/error.hpp must have been included at this point" +#endif /* C4_ERROR_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/substr_fwd.hpp +//#include "c4/substr_fwd.hpp" +#if !defined(C4_SUBSTR_FWD_HPP_) && !defined(_C4_SUBSTR_FWD_HPP_) +#error "amalgamate: file c4/substr_fwd.hpp must have been included at this point" +#endif /* C4_SUBSTR_FWD_HPP_ */ + + +#ifdef __clang__ +# pragma clang diagnostic push +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wtype-limits" // disable warnings on size_t>=0, used heavily in assertions below. These assertions are a preparation step for providing the index type as a template parameter. +# pragma GCC diagnostic ignored "-Wuseless-cast" +#endif + + +namespace c4 { + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +namespace detail { + +template +static inline void _do_reverse(C *C4_RESTRICT first, C *C4_RESTRICT last) +{ + while(last > first) + { + C tmp = *last; + *last-- = *first; + *first++ = tmp; + } +} + +} // namespace detail + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +// utility macros to deuglify SFINAE code; undefined after the class. +// https://stackoverflow.com/questions/43051882/how-to-disable-a-class-member-funrtion-for-certain-template-types +#define C4_REQUIRE_RW(ret_type) \ + template \ + typename std::enable_if< ! std::is_const::value, ret_type>::type +// non-const-to-const +#define C4_NC2C(ty) \ + typename std::enable_if::value && ( ! std::is_const::value), ty>::type + + +/** a non-owning string-view, consisting of a character pointer + * and a length. + * + * @note The pointer is explicitly restricted. + * @note Because of a C++ limitation, there cannot coexist overloads for + * constructing from a char[N] and a char*; the latter will always be chosen + * by the compiler. To construct an object of this type, call to_substr() or + * to_csubstr(). 
For a more detailed explanation on why the overloads cannot + * coexist, see http://cplusplus.bordoon.com/specializeForCharacterArrays.html + * + * @see to_substr() + * @see to_csubstr() + */ +template +struct C4CORE_EXPORT basic_substring +{ +public: + + /** a restricted pointer to the first character of the substring */ + C * C4_RESTRICT str; + /** the length of the substring */ + size_t len; + +public: + + /** @name Types */ + /** @{ */ + + using CC = typename std::add_const::type; //!< CC=const char + using NCC_ = typename std::remove_const::type; //!< NCC_=non const char + + using ro_substr = basic_substring; + using rw_substr = basic_substring; + + using char_type = C; + using size_type = size_t; + + using iterator = C*; + using const_iterator = CC*; + + enum : size_t { npos = (size_t)-1, NONE = (size_t)-1 }; + + /// convert automatically to substring of const C + operator ro_substr () const { ro_substr s(str, len); return s; } + + /** @} */ + +public: + + /** @name Default construction and assignment */ + /** @{ */ + + constexpr basic_substring() : str(nullptr), len(0) {} + + constexpr basic_substring(basic_substring const&) = default; + constexpr basic_substring(basic_substring &&) = default; + constexpr basic_substring(std::nullptr_t) : str(nullptr), len(0) {} + + basic_substring& operator= (basic_substring const&) = default; + basic_substring& operator= (basic_substring &&) = default; + basic_substring& operator= (std::nullptr_t) { str = nullptr; len = 0; return *this; } + + /** @} */ + +public: + + /** @name Construction and assignment from characters with the same type */ + /** @{ */ + + //basic_substring(C *s_) : str(s_), len(s_ ? strlen(s_) : 0) {} + /** the overload for receiving a single C* pointer will always + * hide the array[N] overload. So it is disabled. If you want to + * construct a substr from a single pointer containing a C-style string, + * you can call c4::to_substr()/c4::to_csubstr(). + * @see c4::to_substr() + * @see c4::to_csubstr() */ + template + constexpr basic_substring(C (&s_)[N]) noexcept : str(s_), len(N-1) {} + basic_substring(C *s_, size_t len_) : str(s_), len(len_) { C4_ASSERT(str || !len_); } + basic_substring(C *beg_, C *end_) : str(beg_), len(static_cast(end_ - beg_)) { C4_ASSERT(end_ >= beg_); } + + //basic_substring& operator= (C *s_) { this->assign(s_); return *this; } + template + basic_substring& operator= (C (&s_)[N]) { this->assign(s_); return *this; } + + //void assign(C *s_) { str = (s_); len = (s_ ? strlen(s_) : 0); } + /** the overload for receiving a single C* pointer will always + * hide the array[N] overload. So it is disabled. If you want to + * construct a substr from a single pointer containing a C-style string, + * you can call c4::to_substr()/c4::to_csubstr(). 
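+ *
+ * For example (illustrative; `p` stands for some runtime char pointer):
+ * @code
+ * c4::csubstr lit = "literal";         // char[N] overload: len == 7, no strlen() needed
+ * c4::csubstr dyn = c4::to_csubstr(p); // for bare pointers, go through to_csubstr()
+ * @endcode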
+ * @see c4::to_substr() + * @see c4::to_csubstr() */ + template + void assign(C (&s_)[N]) { str = (s_); len = (N-1); } + void assign(C *s_, size_t len_) { str = s_; len = len_; C4_ASSERT(str || !len_); } + void assign(C *beg_, C *end_) { C4_ASSERT(end_ >= beg_); str = (beg_); len = (end_ - beg_); } + + void clear() { str = nullptr; len = 0; } + + /** @} */ + +public: + + /** @name Construction from non-const characters */ + /** @{ */ + + // when the char type is const, allow construction and assignment from non-const chars + + /** only available when the char type is const */ + template explicit basic_substring(C4_NC2C(U) (&s_)[N]) { str = s_; len = N-1; } + /** only available when the char type is const */ + template< class U=NCC_> basic_substring(C4_NC2C(U) *s_, size_t len_) { str = s_; len = len_; } + /** only available when the char type is const */ + template< class U=NCC_> basic_substring(C4_NC2C(U) *beg_, C4_NC2C(U) *end_) { C4_ASSERT(end_ >= beg_); str = beg_; len = end_ - beg_; } + + /** only available when the char type is const */ + template void assign(C4_NC2C(U) (&s_)[N]) { str = s_; len = N-1; } + /** only available when the char type is const */ + template< class U=NCC_> void assign(C4_NC2C(U) *s_, size_t len_) { str = s_; len = len_; } + /** only available when the char type is const */ + template< class U=NCC_> void assign(C4_NC2C(U) *beg_, C4_NC2C(U) *end_) { C4_ASSERT(end_ >= beg_); str = beg_; len = end_ - beg_; } + + /** only available when the char type is const */ + template + basic_substring& operator=(C4_NC2C(U) (&s_)[N]) { str = s_; len = N-1; return *this; } + + /** @} */ + +public: + + /** @name Standard accessor methods */ + /** @{ */ + + C4_ALWAYS_INLINE C4_PURE bool has_str() const noexcept { return ! empty() && str[0] != C(0); } + C4_ALWAYS_INLINE C4_PURE bool empty() const noexcept { return (len == 0 || str == nullptr); } + C4_ALWAYS_INLINE C4_PURE bool not_empty() const noexcept { return (len != 0 && str != nullptr); } + C4_ALWAYS_INLINE C4_PURE size_t size() const noexcept { return len; } + + C4_ALWAYS_INLINE C4_PURE iterator begin() noexcept { return str; } + C4_ALWAYS_INLINE C4_PURE iterator end () noexcept { return str + len; } + + C4_ALWAYS_INLINE C4_PURE const_iterator begin() const noexcept { return str; } + C4_ALWAYS_INLINE C4_PURE const_iterator end () const noexcept { return str + len; } + + C4_ALWAYS_INLINE C4_PURE C * data() noexcept { return str; } + C4_ALWAYS_INLINE C4_PURE C const* data() const noexcept { return str; } + + C4_ALWAYS_INLINE C4_PURE C & operator[] (size_t i) noexcept { C4_ASSERT(i >= 0 && i < len); return str[i]; } + C4_ALWAYS_INLINE C4_PURE C const& operator[] (size_t i) const noexcept { C4_ASSERT(i >= 0 && i < len); return str[i]; } + + C4_ALWAYS_INLINE C4_PURE C & front() noexcept { C4_ASSERT(len > 0 && str != nullptr); return *str; } + C4_ALWAYS_INLINE C4_PURE C const& front() const noexcept { C4_ASSERT(len > 0 && str != nullptr); return *str; } + + C4_ALWAYS_INLINE C4_PURE C & back() noexcept { C4_ASSERT(len > 0 && str != nullptr); return *(str + len - 1); } + C4_ALWAYS_INLINE C4_PURE C const& back() const noexcept { C4_ASSERT(len > 0 && str != nullptr); return *(str + len - 1); } + + /** @} */ + +public: + + /** @name Comparison methods */ + /** @{ */ + + C4_PURE int compare(C const c) const noexcept + { + C4_XASSERT((str != nullptr) || len == 0); + if(C4_LIKELY(str != nullptr && len > 0)) + return (*str != c) ? 
*str - c : (static_cast(len) - 1); + else + return -1; + } + + C4_PURE int compare(const char *C4_RESTRICT that, size_t sz) const noexcept + { + C4_XASSERT(that || sz == 0); + C4_XASSERT(str || len == 0); + if(C4_LIKELY(str && that)) + { + { + const size_t min = len < sz ? len : sz; + for(size_t i = 0; i < min; ++i) + if(str[i] != that[i]) + return str[i] < that[i] ? -1 : 1; + } + if(len < sz) + return -1; + else if(len == sz) + return 0; + else + return 1; + } + else if(len == sz) + { + C4_XASSERT(len == 0 && sz == 0); + return 0; + } + return len < sz ? -1 : 1; + } + + C4_ALWAYS_INLINE C4_PURE int compare(ro_substr const that) const noexcept { return this->compare(that.str, that.len); } + + C4_ALWAYS_INLINE C4_PURE bool operator== (std::nullptr_t) const noexcept { return str == nullptr; } + C4_ALWAYS_INLINE C4_PURE bool operator!= (std::nullptr_t) const noexcept { return str != nullptr; } + + C4_ALWAYS_INLINE C4_PURE bool operator== (C const c) const noexcept { return this->compare(c) == 0; } + C4_ALWAYS_INLINE C4_PURE bool operator!= (C const c) const noexcept { return this->compare(c) != 0; } + C4_ALWAYS_INLINE C4_PURE bool operator< (C const c) const noexcept { return this->compare(c) < 0; } + C4_ALWAYS_INLINE C4_PURE bool operator> (C const c) const noexcept { return this->compare(c) > 0; } + C4_ALWAYS_INLINE C4_PURE bool operator<= (C const c) const noexcept { return this->compare(c) <= 0; } + C4_ALWAYS_INLINE C4_PURE bool operator>= (C const c) const noexcept { return this->compare(c) >= 0; } + + template C4_ALWAYS_INLINE C4_PURE bool operator== (basic_substring const that) const noexcept { return this->compare(that) == 0; } + template C4_ALWAYS_INLINE C4_PURE bool operator!= (basic_substring const that) const noexcept { return this->compare(that) != 0; } + template C4_ALWAYS_INLINE C4_PURE bool operator< (basic_substring const that) const noexcept { return this->compare(that) < 0; } + template C4_ALWAYS_INLINE C4_PURE bool operator> (basic_substring const that) const noexcept { return this->compare(that) > 0; } + template C4_ALWAYS_INLINE C4_PURE bool operator<= (basic_substring const that) const noexcept { return this->compare(that) <= 0; } + template C4_ALWAYS_INLINE C4_PURE bool operator>= (basic_substring const that) const noexcept { return this->compare(that) >= 0; } + + template C4_ALWAYS_INLINE C4_PURE bool operator== (const char (&that)[N]) const noexcept { return this->compare(that, N-1) == 0; } + template C4_ALWAYS_INLINE C4_PURE bool operator!= (const char (&that)[N]) const noexcept { return this->compare(that, N-1) != 0; } + template C4_ALWAYS_INLINE C4_PURE bool operator< (const char (&that)[N]) const noexcept { return this->compare(that, N-1) < 0; } + template C4_ALWAYS_INLINE C4_PURE bool operator> (const char (&that)[N]) const noexcept { return this->compare(that, N-1) > 0; } + template C4_ALWAYS_INLINE C4_PURE bool operator<= (const char (&that)[N]) const noexcept { return this->compare(that, N-1) <= 0; } + template C4_ALWAYS_INLINE C4_PURE bool operator>= (const char (&that)[N]) const noexcept { return this->compare(that, N-1) >= 0; } + + /** @} */ + +public: + + /** @name Sub-selection methods */ + /** @{ */ + + /** true if *this is a substring of that (ie, from the same buffer) */ + C4_ALWAYS_INLINE C4_PURE bool is_sub(ro_substr const that) const noexcept + { + return that.is_super(*this); + } + + /** true if that is a substring of *this (ie, from the same buffer) */ + C4_ALWAYS_INLINE C4_PURE bool is_super(ro_substr const that) const noexcept + { + 
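+        // note: this is a buffer-identity check, not a content comparison: for
+        // csubstr s = "foobar", s.is_super(s.sub(1, 3)) is true, but a copy of
+        // the same characters living in a different buffer is not considered contained.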
if(C4_LIKELY(len > 0)) + return that.str >= str && that.str+that.len <= str+len; + else + return that.len == 0 && that.str == str && str != nullptr; + } + + /** true if there is overlap of at least one element between that and *this */ + C4_ALWAYS_INLINE C4_PURE bool overlaps(ro_substr const that) const noexcept + { + // thanks @timwynants + return that.str+that.len > str && that.str < str+len; + } + +public: + + /** return [first,len[ */ + C4_ALWAYS_INLINE C4_PURE basic_substring sub(size_t first) const noexcept + { + C4_ASSERT(first >= 0 && first <= len); + return basic_substring(str + first, len - first); + } + + /** return [first,first+num[. If num==npos, return [first,len[ */ + C4_ALWAYS_INLINE C4_PURE basic_substring sub(size_t first, size_t num) const noexcept + { + C4_ASSERT(first >= 0 && first <= len); + C4_ASSERT((num >= 0 && num <= len) || (num == npos)); + size_t rnum = num != npos ? num : len - first; + C4_ASSERT((first >= 0 && first + rnum <= len) || (num == 0)); + return basic_substring(str + first, rnum); + } + + /** return [first,last[. If last==npos, return [first,len[ */ + C4_ALWAYS_INLINE C4_PURE basic_substring range(size_t first, size_t last=npos) const noexcept + { + C4_ASSERT(first >= 0 && first <= len); + last = last != npos ? last : len; + C4_ASSERT(first <= last); + C4_ASSERT(last >= 0 && last <= len); + return basic_substring(str + first, last - first); + } + + /** return the first @p num elements: [0,num[*/ + C4_ALWAYS_INLINE C4_PURE basic_substring first(size_t num) const noexcept + { + C4_ASSERT(num <= len || num == npos); + return basic_substring(str, num != npos ? num : len); + } + + /** return the last @num elements: [len-num,len[*/ + C4_ALWAYS_INLINE C4_PURE basic_substring last(size_t num) const noexcept + { + C4_ASSERT(num <= len || num == npos); + return num != npos ? + basic_substring(str + len - num, num) : + *this; + } + + /** offset from the ends: return [left,len-right[ ; ie, trim a + number of characters from the left and right. This is + equivalent to python's negative list indices. */ + C4_ALWAYS_INLINE C4_PURE basic_substring offs(size_t left, size_t right) const noexcept + { + C4_ASSERT(left >= 0 && left <= len); + C4_ASSERT(right >= 0 && right <= len); + C4_ASSERT(left <= len - right + 1); + return basic_substring(str + left, len - right - left); + } + + /** return [0, pos[ . Same as .first(pos), but provided for compatibility with .right_of() */ + C4_ALWAYS_INLINE C4_PURE basic_substring left_of(size_t pos) const noexcept + { + C4_ASSERT(pos <= len || pos == npos); + return (pos != npos) ? + basic_substring(str, pos) : + *this; + } + + /** return [0, pos+include_pos[ . Same as .first(pos+1), but provided for compatibility with .right_of() */ + C4_ALWAYS_INLINE C4_PURE basic_substring left_of(size_t pos, bool include_pos) const noexcept + { + C4_ASSERT(pos <= len || pos == npos); + return (pos != npos) ? + basic_substring(str, pos+include_pos) : + *this; + } + + /** return [pos+1, len[ */ + C4_ALWAYS_INLINE C4_PURE basic_substring right_of(size_t pos) const noexcept + { + C4_ASSERT(pos <= len || pos == npos); + return (pos != npos) ? + basic_substring(str + (pos + 1), len - (pos + 1)) : + basic_substring(str + len, size_t(0)); + } + + /** return [pos+!include_pos, len[ */ + C4_ALWAYS_INLINE C4_PURE basic_substring right_of(size_t pos, bool include_pos) const noexcept + { + C4_ASSERT(pos <= len || pos == npos); + return (pos != npos) ? 
+ basic_substring(str + (pos + !include_pos), len - (pos + !include_pos)) : + basic_substring(str + len, size_t(0)); + } + +public: + + /** given @p subs a substring of the current string, get the + * portion of the current string to the left of it */ + C4_ALWAYS_INLINE C4_PURE basic_substring left_of(ro_substr const subs) const noexcept + { + C4_ASSERT(is_super(subs) || subs.empty()); + auto ssb = subs.begin(); + auto b = begin(); + auto e = end(); + if(ssb >= b && ssb <= e) + return sub(0, static_cast(ssb - b)); + else + return sub(0, 0); + } + + /** given @p subs a substring of the current string, get the + * portion of the current string to the right of it */ + C4_ALWAYS_INLINE C4_PURE basic_substring right_of(ro_substr const subs) const noexcept + { + C4_ASSERT(is_super(subs) || subs.empty()); + auto sse = subs.end(); + auto b = begin(); + auto e = end(); + if(sse >= b && sse <= e) + return sub(static_cast(sse - b), static_cast(e - sse)); + else + return sub(0, 0); + } + + /** @} */ + +public: + + /** @name Removing characters (trim()) / patterns (strip()) from the tips of the string */ + /** @{ */ + + /** trim left */ + basic_substring triml(const C c) const + { + if( ! empty()) + { + size_t pos = first_not_of(c); + if(pos != npos) + return sub(pos); + } + return sub(0, 0); + } + /** trim left ANY of the characters. + * @see stripl() to remove a pattern from the left */ + basic_substring triml(ro_substr chars) const + { + if( ! empty()) + { + size_t pos = first_not_of(chars); + if(pos != npos) + return sub(pos); + } + return sub(0, 0); + } + + /** trim the character c from the right */ + basic_substring trimr(const C c) const + { + if( ! empty()) + { + size_t pos = last_not_of(c, npos); + if(pos != npos) + return sub(0, pos+1); + } + return sub(0, 0); + } + /** trim right ANY of the characters + * @see stripr() to remove a pattern from the right */ + basic_substring trimr(ro_substr chars) const + { + if( ! empty()) + { + size_t pos = last_not_of(chars, npos); + if(pos != npos) + return sub(0, pos+1); + } + return sub(0, 0); + } + + /** trim the character c left and right */ + basic_substring trim(const C c) const + { + return triml(c).trimr(c); + } + /** trim left and right ANY of the characters + * @see strip() to remove a pattern from the left and right */ + basic_substring trim(ro_substr const chars) const + { + return triml(chars).trimr(chars); + } + + /** remove a pattern from the left + * @see triml() to remove characters*/ + basic_substring stripl(ro_substr pattern) const + { + if( ! begins_with(pattern)) + return *this; + return sub(pattern.len < len ? pattern.len : len); + } + + /** remove a pattern from the right + * @see trimr() to remove characters*/ + basic_substring stripr(ro_substr pattern) const + { + if( ! ends_with(pattern)) + return *this; + return left_of(len - (pattern.len < len ? 
pattern.len : len)); + } + + /** @} */ + +public: + + /** @name Lookup methods */ + /** @{ */ + + inline size_t find(const C c, size_t start_pos=0) const + { + return first_of(c, start_pos); + } + inline size_t find(ro_substr pattern, size_t start_pos=0) const + { + C4_ASSERT(start_pos == npos || (start_pos >= 0 && start_pos <= len)); + if(len < pattern.len) return npos; + for(size_t i = start_pos, e = len - pattern.len + 1; i < e; ++i) + { + bool gotit = true; + for(size_t j = 0; j < pattern.len; ++j) + { + C4_ASSERT(i + j < len); + if(str[i + j] != pattern.str[j]) + { + gotit = false; + break; + } + } + if(gotit) + { + return i; + } + } + return npos; + } + +public: + + /** count the number of occurrences of c */ + inline size_t count(const C c, size_t pos=0) const + { + C4_ASSERT(pos >= 0 && pos <= len); + size_t num = 0; + pos = find(c, pos); + while(pos != npos) + { + ++num; + pos = find(c, pos + 1); + } + return num; + } + + /** count the number of occurrences of s */ + inline size_t count(ro_substr c, size_t pos=0) const + { + C4_ASSERT(pos >= 0 && pos <= len); + size_t num = 0; + pos = find(c, pos); + while(pos != npos) + { + ++num; + pos = find(c, pos + c.len); + } + return num; + } + + /** get the substr consisting of the first occurrence of @p c after @p pos, or an empty substr if none occurs */ + inline basic_substring select(const C c, size_t pos=0) const + { + pos = find(c, pos); + return pos != npos ? sub(pos, 1) : basic_substring(); + } + + /** get the substr consisting of the first occurrence of @p pattern after @p pos, or an empty substr if none occurs */ + inline basic_substring select(ro_substr pattern, size_t pos=0) const + { + pos = find(pattern, pos); + return pos != npos ? sub(pos, pattern.len) : basic_substring(); + } + +public: + + struct first_of_any_result + { + size_t which; + size_t pos; + inline operator bool() const { return which != NONE && pos != npos; } + }; + + first_of_any_result first_of_any(ro_substr s0, ro_substr s1) const + { + ro_substr s[2] = {s0, s1}; + return first_of_any_iter(&s[0], &s[0] + 2); + } + + first_of_any_result first_of_any(ro_substr s0, ro_substr s1, ro_substr s2) const + { + ro_substr s[3] = {s0, s1, s2}; + return first_of_any_iter(&s[0], &s[0] + 3); + } + + first_of_any_result first_of_any(ro_substr s0, ro_substr s1, ro_substr s2, ro_substr s3) const + { + ro_substr s[4] = {s0, s1, s2, s3}; + return first_of_any_iter(&s[0], &s[0] + 4); + } + + first_of_any_result first_of_any(ro_substr s0, ro_substr s1, ro_substr s2, ro_substr s3, ro_substr s4) const + { + ro_substr s[5] = {s0, s1, s2, s3, s4}; + return first_of_any_iter(&s[0], &s[0] + 5); + } + + template + first_of_any_result first_of_any_iter(It first_span, It last_span) const + { + for(size_t i = 0; i < len; ++i) + { + size_t curr = 0; + for(It it = first_span; it != last_span; ++curr, ++it) + { + auto const& chars = *it; + if((i + chars.len) > len) continue; + bool gotit = true; + for(size_t j = 0; j < chars.len; ++j) + { + C4_ASSERT(i + j < len); + if(str[i + j] != chars[j]) + { + gotit = false; + break; + } + } + if(gotit) + { + return {curr, i}; + } + } + } + return {NONE, npos}; + } + +public: + + /** true if the first character of the string is @p c */ + bool begins_with(const C c) const + { + return len > 0 ? 
str[0] == c : false; + } + + /** true if the first @p num characters of the string are @p c */ + bool begins_with(const C c, size_t num) const + { + if(len < num) + { + return false; + } + for(size_t i = 0; i < num; ++i) + { + if(str[i] != c) + { + return false; + } + } + return true; + } + + /** true if the string begins with the given @p pattern */ + bool begins_with(ro_substr pattern) const + { + if(len < pattern.len) + { + return false; + } + for(size_t i = 0; i < pattern.len; ++i) + { + if(str[i] != pattern[i]) + { + return false; + } + } + return true; + } + + /** true if the first character of the string is any of the given @p chars */ + bool begins_with_any(ro_substr chars) const + { + if(len == 0) + { + return false; + } + for(size_t i = 0; i < chars.len; ++i) + { + if(str[0] == chars.str[i]) + { + return true; + } + } + return false; + } + + /** true if the last character of the string is @p c */ + bool ends_with(const C c) const + { + return len > 0 ? str[len-1] == c : false; + } + + /** true if the last @p num characters of the string are @p c */ + bool ends_with(const C c, size_t num) const + { + if(len < num) + { + return false; + } + for(size_t i = len - num; i < len; ++i) + { + if(str[i] != c) + { + return false; + } + } + return true; + } + + /** true if the string ends with the given @p pattern */ + bool ends_with(ro_substr pattern) const + { + if(len < pattern.len) + { + return false; + } + for(size_t i = 0, s = len-pattern.len; i < pattern.len; ++i) + { + if(str[s+i] != pattern[i]) + { + return false; + } + } + return true; + } + + /** true if the last character of the string is any of the given @p chars */ + bool ends_with_any(ro_substr chars) const + { + if(len == 0) + { + return false; + } + for(size_t i = 0; i < chars.len; ++i) + { + if(str[len - 1] == chars[i]) + { + return true; + } + } + return false; + } + +public: + + /** @return the first position where c is found in the string, or npos if none is found */ + size_t first_of(const C c, size_t start=0) const + { + C4_ASSERT(start == npos || (start >= 0 && start <= len)); + for(size_t i = start; i < len; ++i) + { + if(str[i] == c) + return i; + } + return npos; + } + + /** @return the last position where c is found in the string, or npos if none is found */ + size_t last_of(const C c, size_t start=npos) const + { + C4_ASSERT(start == npos || (start >= 0 && start <= len)); + if(start == npos) + start = len; + for(size_t i = start-1; i != size_t(-1); --i) + { + if(str[i] == c) + return i; + } + return npos; + } + + /** @return the first position where ANY of the chars is found in the string, or npos if none is found */ + size_t first_of(ro_substr chars, size_t start=0) const + { + C4_ASSERT(start == npos || (start >= 0 && start <= len)); + for(size_t i = start; i < len; ++i) + { + for(size_t j = 0; j < chars.len; ++j) + { + if(str[i] == chars[j]) + return i; + } + } + return npos; + } + + /** @return the last position where ANY of the chars is found in the string, or npos if none is found */ + size_t last_of(ro_substr chars, size_t start=npos) const + { + C4_ASSERT(start == npos || (start >= 0 && start <= len)); + if(start == npos) + start = len; + for(size_t i = start-1; i != size_t(-1); --i) + { + for(size_t j = 0; j < chars.len; ++j) + { + if(str[i] == chars[j]) + return i; + } + } + return npos; + } + +public: + + size_t first_not_of(const C c, size_t start=0) const + { + C4_ASSERT((start >= 0 && start <= len) || (start == len && len == 0)); + for(size_t i = start; i < len; ++i) + { + if(str[i] != c) + return 
i; + } + return npos; + } + + size_t last_not_of(const C c, size_t start=npos) const + { + C4_ASSERT(start == npos || (start >= 0 && start <= len)); + if(start == npos) + start = len; + for(size_t i = start-1; i != size_t(-1); --i) + { + if(str[i] != c) + return i; + } + return npos; + } + + size_t first_not_of(ro_substr chars, size_t start=0) const + { + C4_ASSERT((start >= 0 && start <= len) || (start == len && len == 0)); + for(size_t i = start; i < len; ++i) + { + bool gotit = true; + for(size_t j = 0; j < chars.len; ++j) + { + if(str[i] == chars.str[j]) + { + gotit = false; + break; + } + } + if(gotit) + { + return i; + } + } + return npos; + } + + size_t last_not_of(ro_substr chars, size_t start=npos) const + { + C4_ASSERT(start == npos || (start >= 0 && start <= len)); + if(start == npos) + start = len; + for(size_t i = start-1; i != size_t(-1); --i) + { + bool gotit = true; + for(size_t j = 0; j < chars.len; ++j) + { + if(str[i] == chars.str[j]) + { + gotit = false; + break; + } + } + if(gotit) + { + return i; + } + } + return npos; + } + + /** @} */ + +public: + + /** @name Range lookup methods */ + /** @{ */ + + /** get the range delimited by an open-close pair of characters. + * @note There must be no nested pairs. + * @note No checks for escapes are performed. */ + basic_substring pair_range(CC open, CC close) const + { + size_t b = find(open); + if(b == npos) + return basic_substring(); + size_t e = find(close, b+1); + if(e == npos) + return basic_substring(); + basic_substring ret = range(b, e+1); + C4_ASSERT(ret.sub(1).find(open) == npos); + return ret; + } + + /** get the range delimited by a single open-close character (eg, quotes). + * @note The open-close character can be escaped. */ + basic_substring pair_range_esc(CC open_close, CC escape=CC('\\')) + { + size_t b = find(open_close); + if(b == npos) return basic_substring(); + for(size_t i = b+1; i < len; ++i) + { + CC c = str[i]; + if(c == open_close) + { + if(str[i-1] != escape) + { + return range(b, i+1); + } + } + } + return basic_substring(); + } + + /** get the range delimited by an open-close pair of characters, + * with possibly nested occurrences. No checks for escapes are + * performed. */ + basic_substring pair_range_nested(CC open, CC close) const + { + size_t b = find(open); + if(b == npos) return basic_substring(); + size_t e, curr = b+1, count = 0; + const char both[] = {open, close, '\0'}; + while((e = first_of(both, curr)) != npos) + { + if(str[e] == open) + { + ++count; + curr = e+1; + } + else if(str[e] == close) + { + if(count == 0) return range(b, e+1); + --count; + curr = e+1; + } + } + return basic_substring(); + } + + basic_substring unquoted() const + { + constexpr const C dq('"'), sq('\''); + if(len >= 2 && (str[len - 2] != C('\\')) && + ((begins_with(sq) && ends_with(sq)) + || + (begins_with(dq) && ends_with(dq)))) + { + return range(1, len -1); + } + return *this; + } + + /** @} */ + +public: + + /** @name Number-matching query methods */ + /** @{ */ + + /** @return true if the substring contents are a floating-point or integer number. + * @note any leading or trailing whitespace will return false. */ + bool is_number() const + { + if(empty() || (first_non_empty_span().empty())) + return false; + if(first_uint_span() == *this) + return true; + if(first_int_span() == *this) + return true; + if(first_real_span() == *this) + return true; + return false; + } + + /** @return true if the substring contents are a real number. + * @note any leading or trailing whitespace will return false. 
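+     * e.g. "-2.5" is a real number, but " 2.5" (leading whitespace) is not.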
*/ + bool is_real() const + { + if(empty() || (first_non_empty_span().empty())) + return false; + if(first_real_span() == *this) + return true; + return false; + } + + /** @return true if the substring contents are an integer number. + * @note any leading or trailing whitespace will return false. */ + bool is_integer() const + { + if(empty() || (first_non_empty_span().empty())) + return false; + if(first_uint_span() == *this) + return true; + if(first_int_span() == *this) + return true; + return false; + } + + /** @return true if the substring contents are an unsigned integer number. + * @note any leading or trailing whitespace will return false. */ + bool is_unsigned_integer() const + { + if(empty() || (first_non_empty_span().empty())) + return false; + if(first_uint_span() == *this) + return true; + return false; + } + + /** get the first span consisting exclusively of non-empty characters */ + basic_substring first_non_empty_span() const + { + constexpr const ro_substr empty_chars(" \n\r\t"); + size_t pos = first_not_of(empty_chars); + if(pos == npos) + return first(0); + auto ret = sub(pos); + pos = ret.first_of(empty_chars); + return ret.first(pos); + } + + /** get the first span which can be interpreted as an unsigned integer */ + basic_substring first_uint_span() const + { + basic_substring ne = first_non_empty_span(); + if(ne.empty()) + return ne; + if(ne.str[0] == '-') + return first(0); + size_t skip_start = (ne.str[0] == '+') ? 1 : 0; + return ne._first_integral_span(skip_start); + } + + /** get the first span which can be interpreted as a signed integer */ + basic_substring first_int_span() const + { + basic_substring ne = first_non_empty_span(); + if(ne.empty()) + return ne; + size_t skip_start = (ne.str[0] == '+' || ne.str[0] == '-') ? 1 : 0; + return ne._first_integral_span(skip_start); + } + + basic_substring _first_integral_span(size_t skip_start) const + { + C4_ASSERT(!empty()); + if(skip_start == len) + return first(0); + C4_ASSERT(skip_start < len); + if(len >= skip_start + 3) + { + if(str[skip_start] != '0') + { + for(size_t i = skip_start; i < len; ++i) + { + char c = str[i]; + if(c < '0' || c > '9') + return i > skip_start && _is_delim_char(c) ? first(i) : first(0); + } + } + else + { + char next = str[skip_start + 1]; + if(next == 'x' || next == 'X') + { + skip_start += 2; + for(size_t i = skip_start; i < len; ++i) + { + const char c = str[i]; + if( ! _is_hex_char(c)) + return i > skip_start && _is_delim_char(c) ? first(i) : first(0); + } + return *this; + } + else if(next == 'b' || next == 'B') + { + skip_start += 2; + for(size_t i = skip_start; i < len; ++i) + { + const char c = str[i]; + if(c != '0' && c != '1') + return i > skip_start && _is_delim_char(c) ? first(i) : first(0); + } + return *this; + } + else if(next == 'o' || next == 'O') + { + skip_start += 2; + for(size_t i = skip_start; i < len; ++i) + { + const char c = str[i]; + if(c < '0' || c > '7') + return i > skip_start && _is_delim_char(c) ? first(i) : first(0); + } + return *this; + } + } + } + // must be a decimal, or it is not a an number + for(size_t i = skip_start; i < len; ++i) + { + const char c = str[i]; + if(c < '0' || c > '9') + return i > skip_start && _is_delim_char(c) ? 
first(i) : first(0); + } + return *this; + } + + /** get the first span which can be interpreted as a real (floating-point) number */ + basic_substring first_real_span() const + { + basic_substring ne = first_non_empty_span(); + if(ne.empty()) + return ne; + size_t skip_start = (ne.str[0] == '+' || ne.str[0] == '-'); + C4_ASSERT(skip_start == 0 || skip_start == 1); + // if we have at least three digits after the leading sign, it + // can be decimal, or hex, or bin or oct. Ex: + // non-decimal: 0x0, 0b0, 0o0 + // decimal: 1.0, 10., 1e1, 100, inf, nan, infinity + if(ne.len >= skip_start+3) + { + // if it does not have leading 0, it must be decimal, or it is not a real + if(ne.str[skip_start] != '0') + { + if(ne.str[skip_start] == 'i') // is it infinity or inf? + { + basic_substring word = ne._word_follows(skip_start + 1, "nfinity"); + if(word.len) + return word; + return ne._word_follows(skip_start + 1, "nf"); + } + else if(ne.str[skip_start] == 'n') // is it nan? + { + return ne._word_follows(skip_start + 1, "an"); + } + else // must be a decimal, or it is not a real + { + return ne._first_real_span_dec(skip_start); + } + } + else // starts with 0. is it 0x, 0b or 0o? + { + const char next = ne.str[skip_start + 1]; + // hexadecimal + if(next == 'x' || next == 'X') + return ne._first_real_span_hex(skip_start + 2); + // binary + else if(next == 'b' || next == 'B') + return ne._first_real_span_bin(skip_start + 2); + // octal + else if(next == 'o' || next == 'O') + return ne._first_real_span_oct(skip_start + 2); + // none of the above. may still be a decimal. + else + return ne._first_real_span_dec(skip_start); // do not skip the 0. + } + } + // less than 3 chars after the leading sign. It is either a + // decimal or it is not a real. (cannot be any of 0x0, etc). + return ne._first_real_span_dec(skip_start); + } + + /** true if the character is a delimiter character *at the end* */ + static constexpr C4_ALWAYS_INLINE C4_CONST bool _is_delim_char(char c) noexcept + { + return c == ' ' || c == '\n' + || c == ']' || c == ')' || c == '}' + || c == ',' || c == ';' || c == '\r' || c == '\t' || c == '\0'; + } + + /** true if the character is in [0-9a-fA-F] */ + static constexpr C4_ALWAYS_INLINE C4_CONST bool _is_hex_char(char c) noexcept + { + return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); + } + + C4_NO_INLINE C4_PURE basic_substring _word_follows(size_t pos, csubstr word) const noexcept + { + size_t posend = pos + word.len; + if(len >= posend && sub(pos, word.len) == word) + if(len == posend || _is_delim_char(str[posend])) + return first(posend); + return first(0); + } + + // this function is declared inside the class to avoid a VS error with __declspec(dllimport) + C4_NO_INLINE C4_PURE basic_substring _first_real_span_dec(size_t pos) const noexcept + { + bool intchars = false; + bool fracchars = false; + bool powchars; + // integral part + for( ; pos < len; ++pos) + { + const char c = str[pos]; + if(c >= '0' && c <= '9') + { + intchars = true; + } + else if(c == '.') + { + ++pos; + goto fractional_part_dec; + } + else if(c == 'e' || c == 'E') + { + ++pos; + goto power_part_dec; + } + else if(_is_delim_char(c)) + { + return intchars ? first(pos) : first(0); + } + else + { + return first(0); + } + } + // no . or p were found; this is either an integral number + // or not a number at all + return intchars ? 
+ *this : + first(0); + fractional_part_dec: + C4_ASSERT(pos > 0); + C4_ASSERT(str[pos - 1] == '.'); + for( ; pos < len; ++pos) + { + const char c = str[pos]; + if(c >= '0' && c <= '9') + { + fracchars = true; + } + else if(c == 'e' || c == 'E') + { + ++pos; + goto power_part_dec; + } + else if(_is_delim_char(c)) + { + return intchars || fracchars ? first(pos) : first(0); + } + else + { + return first(0); + } + } + return intchars || fracchars ? + *this : + first(0); + power_part_dec: + C4_ASSERT(pos > 0); + C4_ASSERT(str[pos - 1] == 'e' || str[pos - 1] == 'E'); + // either a + or a - is expected here, followed by more chars. + // also, using (pos+1) in this check will cause an early + // return when no more chars follow the sign. + if(len <= (pos+1) || ((!intchars) && (!fracchars))) + return first(0); + ++pos; // this was the sign. + // ... so the (pos+1) ensures that we enter the loop and + // hence that there exist chars in the power part + powchars = false; + for( ; pos < len; ++pos) + { + const char c = str[pos]; + if(c >= '0' && c <= '9') + powchars = true; + else if(powchars && _is_delim_char(c)) + return first(pos); + else + return first(0); + } + return *this; + } + + // this function is declared inside the class to avoid a VS error with __declspec(dllimport) + C4_NO_INLINE C4_PURE basic_substring _first_real_span_hex(size_t pos) const noexcept + { + bool intchars = false; + bool fracchars = false; + bool powchars; + // integral part + for( ; pos < len; ++pos) + { + const char c = str[pos]; + if(_is_hex_char(c)) + { + intchars = true; + } + else if(c == '.') + { + ++pos; + goto fractional_part_hex; + } + else if(c == 'p' || c == 'P') + { + ++pos; + goto power_part_hex; + } + else if(_is_delim_char(c)) + { + return intchars ? first(pos) : first(0); + } + else + { + return first(0); + } + } + // no . or p were found; this is either an integral number + // or not a number at all + return intchars ? + *this : + first(0); + fractional_part_hex: + C4_ASSERT(pos > 0); + C4_ASSERT(str[pos - 1] == '.'); + for( ; pos < len; ++pos) + { + const char c = str[pos]; + if(_is_hex_char(c)) + { + fracchars = true; + } + else if(c == 'p' || c == 'P') + { + ++pos; + goto power_part_hex; + } + else if(_is_delim_char(c)) + { + return intchars || fracchars ? first(pos) : first(0); + } + else + { + return first(0); + } + } + return intchars || fracchars ? + *this : + first(0); + power_part_hex: + C4_ASSERT(pos > 0); + C4_ASSERT(str[pos - 1] == 'p' || str[pos - 1] == 'P'); + // either a + or a - is expected here, followed by more chars. + // also, using (pos+1) in this check will cause an early + // return when no more chars follow the sign. + if(len <= (pos+1) || (str[pos] != '+' && str[pos] != '-') || ((!intchars) && (!fracchars))) + return first(0); + ++pos; // this was the sign. + // ... 
so the (pos+1) ensures that we enter the loop and + // hence that there exist chars in the power part + powchars = false; + for( ; pos < len; ++pos) + { + const char c = str[pos]; + if(c >= '0' && c <= '9') + powchars = true; + else if(powchars && _is_delim_char(c)) + return first(pos); + else + return first(0); + } + return *this; + } + + // this function is declared inside the class to avoid a VS error with __declspec(dllimport) + C4_NO_INLINE C4_PURE basic_substring _first_real_span_bin(size_t pos) const noexcept + { + bool intchars = false; + bool fracchars = false; + bool powchars; + // integral part + for( ; pos < len; ++pos) + { + const char c = str[pos]; + if(c == '0' || c == '1') + { + intchars = true; + } + else if(c == '.') + { + ++pos; + goto fractional_part_bin; + } + else if(c == 'p' || c == 'P') + { + ++pos; + goto power_part_bin; + } + else if(_is_delim_char(c)) + { + return intchars ? first(pos) : first(0); + } + else + { + return first(0); + } + } + // no . or p were found; this is either an integral number + // or not a number at all + return intchars ? + *this : + first(0); + fractional_part_bin: + C4_ASSERT(pos > 0); + C4_ASSERT(str[pos - 1] == '.'); + for( ; pos < len; ++pos) + { + const char c = str[pos]; + if(c == '0' || c == '1') + { + fracchars = true; + } + else if(c == 'p' || c == 'P') + { + ++pos; + goto power_part_bin; + } + else if(_is_delim_char(c)) + { + return intchars || fracchars ? first(pos) : first(0); + } + else + { + return first(0); + } + } + return intchars || fracchars ? + *this : + first(0); + power_part_bin: + C4_ASSERT(pos > 0); + C4_ASSERT(str[pos - 1] == 'p' || str[pos - 1] == 'P'); + // either a + or a - is expected here, followed by more chars. + // also, using (pos+1) in this check will cause an early + // return when no more chars follow the sign. + if(len <= (pos+1) || (str[pos] != '+' && str[pos] != '-') || ((!intchars) && (!fracchars))) + return first(0); + ++pos; // this was the sign. + // ... so the (pos+1) ensures that we enter the loop and + // hence that there exist chars in the power part + powchars = false; + for( ; pos < len; ++pos) + { + const char c = str[pos]; + if(c >= '0' && c <= '9') + powchars = true; + else if(powchars && _is_delim_char(c)) + return first(pos); + else + return first(0); + } + return *this; + } + + // this function is declared inside the class to avoid a VS error with __declspec(dllimport) + C4_NO_INLINE C4_PURE basic_substring _first_real_span_oct(size_t pos) const noexcept + { + bool intchars = false; + bool fracchars = false; + bool powchars; + // integral part + for( ; pos < len; ++pos) + { + const char c = str[pos]; + if(c >= '0' && c <= '7') + { + intchars = true; + } + else if(c == '.') + { + ++pos; + goto fractional_part_oct; + } + else if(c == 'p' || c == 'P') + { + ++pos; + goto power_part_oct; + } + else if(_is_delim_char(c)) + { + return intchars ? first(pos) : first(0); + } + else + { + return first(0); + } + } + // no . or p were found; this is either an integral number + // or not a number at all + return intchars ? + *this : + first(0); + fractional_part_oct: + C4_ASSERT(pos > 0); + C4_ASSERT(str[pos - 1] == '.'); + for( ; pos < len; ++pos) + { + const char c = str[pos]; + if(c >= '0' && c <= '7') + { + fracchars = true; + } + else if(c == 'p' || c == 'P') + { + ++pos; + goto power_part_oct; + } + else if(_is_delim_char(c)) + { + return intchars || fracchars ? first(pos) : first(0); + } + else + { + return first(0); + } + } + return intchars || fracchars ? 
+ *this : + first(0); + power_part_oct: + C4_ASSERT(pos > 0); + C4_ASSERT(str[pos - 1] == 'p' || str[pos - 1] == 'P'); + // either a + or a - is expected here, followed by more chars. + // also, using (pos+1) in this check will cause an early + // return when no more chars follow the sign. + if(len <= (pos+1) || (str[pos] != '+' && str[pos] != '-') || ((!intchars) && (!fracchars))) + return first(0); + ++pos; // this was the sign. + // ... so the (pos+1) ensures that we enter the loop and + // hence that there exist chars in the power part + powchars = false; + for( ; pos < len; ++pos) + { + const char c = str[pos]; + if(c >= '0' && c <= '9') + powchars = true; + else if(powchars && _is_delim_char(c)) + return first(pos); + else + return first(0); + } + return *this; + } + + /** @} */ + +public: + + /** @name Splitting methods */ + /** @{ */ + + /** returns true if the string has not been exhausted yet, meaning + * it's ok to call next_split() again. When no instance of sep + * exists in the string, returns the full string. When the input + * is an empty string, the output string is the empty string. */ + bool next_split(C sep, size_t *C4_RESTRICT start_pos, basic_substring *C4_RESTRICT out) const + { + if(C4_LIKELY(*start_pos < len)) + { + for(size_t i = *start_pos, e = len; i < e; i++) + { + if(str[i] == sep) + { + out->assign(str + *start_pos, i - *start_pos); + *start_pos = i+1; + return true; + } + } + out->assign(str + *start_pos, len - *start_pos); + *start_pos = len + 1; + return true; + } + else + { + bool valid = len > 0 && (*start_pos == len); + if(valid && !empty() && str[len-1] == sep) + { + out->assign(str + len, (size_t)0); // the cast is needed to prevent overload ambiguity + } + else + { + out->assign(str + len + 1, (size_t)0); // the cast is needed to prevent overload ambiguity + } + *start_pos = len + 1; + return valid; + } + } + +private: + + struct split_proxy_impl + { + struct split_iterator_impl + { + split_proxy_impl const* m_proxy; + basic_substring m_str; + size_t m_pos; + NCC_ m_sep; + + split_iterator_impl(split_proxy_impl const* proxy, size_t pos, C sep) + : m_proxy(proxy), m_pos(pos), m_sep(sep) + { + _tick(); + } + + void _tick() + { + m_proxy->m_str.next_split(m_sep, &m_pos, &m_str); + } + + split_iterator_impl& operator++ () { _tick(); return *this; } + split_iterator_impl operator++ (int) { split_iterator_impl it = *this; _tick(); return it; } + + basic_substring& operator* () { return m_str; } + basic_substring* operator-> () { return &m_str; } + + bool operator!= (split_iterator_impl const& that) const + { + return !(this->operator==(that)); + } + bool operator== (split_iterator_impl const& that) const + { + C4_XASSERT((m_sep == that.m_sep) && "cannot compare split iterators with different separators"); + if(m_str.size() != that.m_str.size()) + return false; + if(m_str.data() != that.m_str.data()) + return false; + return m_pos == that.m_pos; + } + }; + + basic_substring m_str; + size_t m_start_pos; + C m_sep; + + split_proxy_impl(basic_substring str_, size_t start_pos, C sep) + : m_str(str_), m_start_pos(start_pos), m_sep(sep) + { + } + + split_iterator_impl begin() const + { + auto it = split_iterator_impl(this, m_start_pos, m_sep); + return it; + } + split_iterator_impl end() const + { + size_t pos = m_str.size() + 1; + auto it = split_iterator_impl(this, pos, m_sep); + return it; + } + }; + +public: + + using split_proxy = split_proxy_impl; + + /** a view into the splits */ + split_proxy split(C sep, size_t start_pos=0) const + { + 
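+        // Illustrative sketch of next_split()/split() (not part of the original
+        // c4core header): empty fields between adjacent separators are preserved.
+        //
+        //     csubstr line = "a,b,,c";
+        //     size_t pos = 0;
+        //     csubstr field;
+        //     while(line.next_split(',', &pos, &field))
+        //     {
+        //         // successive fields: "a", "b", "", "c"
+        //     }
+        //     for(csubstr f : line.split(','))
+        //     {
+        //         // same four fields, via the split_proxy range
+        //     }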
C4_XASSERT((start_pos >= 0 && start_pos < len) || empty()); + auto ss = sub(0, len); + auto it = split_proxy(ss, start_pos, sep); + return it; + } + +public: + + /** pop right: return the first split from the right. Use + * gpop_left() to get the reciprocal part. + */ + basic_substring pop_right(C sep=C('/'), bool skip_empty=false) const + { + if(C4_LIKELY(len > 1)) + { + auto pos = last_of(sep); + if(pos != npos) + { + if(pos + 1 < len) // does not end with sep + { + return sub(pos + 1); // return from sep to end + } + else // the string ends with sep + { + if( ! skip_empty) + { + return sub(pos + 1, 0); + } + auto ppos = last_not_of(sep); // skip repeated seps + if(ppos == npos) // the string is all made of seps + { + return sub(0, 0); + } + // find the previous sep + auto pos0 = last_of(sep, ppos); + if(pos0 == npos) // only the last sep exists + { + return sub(0); // return the full string (because skip_empty is true) + } + ++pos0; + return sub(pos0); + } + } + else // no sep was found, return the full string + { + return *this; + } + } + else if(len == 1) + { + if(begins_with(sep)) + { + return sub(0, 0); + } + return *this; + } + else // an empty string + { + return basic_substring(); + } + } + + /** return the first split from the left. Use gpop_right() to get + * the reciprocal part. */ + basic_substring pop_left(C sep = C('/'), bool skip_empty=false) const + { + if(C4_LIKELY(len > 1)) + { + auto pos = first_of(sep); + if(pos != npos) + { + if(pos > 0) // does not start with sep + { + return sub(0, pos); // return everything up to it + } + else // the string starts with sep + { + if( ! skip_empty) + { + return sub(0, 0); + } + auto ppos = first_not_of(sep); // skip repeated seps + if(ppos == npos) // the string is all made of seps + { + return sub(0, 0); + } + // find the next sep + auto pos0 = first_of(sep, ppos); + if(pos0 == npos) // only the first sep exists + { + return sub(0); // return the full string (because skip_empty is true) + } + C4_XASSERT(pos0 > 0); + // return everything up to the second sep + return sub(0, pos0); + } + } + else // no sep was found, return the full string + { + return sub(0); + } + } + else if(len == 1) + { + if(begins_with(sep)) + { + return sub(0, 0); + } + return sub(0); + } + else // an empty string + { + return basic_substring(); + } + } + +public: + + /** greedy pop left. eg, csubstr("a/b/c").gpop_left('/')="a/b" */ + basic_substring gpop_left(C sep = C('/'), bool skip_empty=false) const + { + auto ss = pop_right(sep, skip_empty); + ss = left_of(ss); + if(ss.find(sep) != npos) + { + if(ss.ends_with(sep)) + { + if(skip_empty) + { + ss = ss.trimr(sep); + } + else + { + ss = ss.sub(0, ss.len-1); // safe to subtract because ends_with(sep) is true + } + } + } + return ss; + } + + /** greedy pop right. eg, csubstr("a/b/c").gpop_right('/')="b/c" */ + basic_substring gpop_right(C sep = C('/'), bool skip_empty=false) const + { + auto ss = pop_left(sep, skip_empty); + ss = right_of(ss); + if(ss.find(sep) != npos) + { + if(ss.begins_with(sep)) + { + if(skip_empty) + { + ss = ss.triml(sep); + } + else + { + ss = ss.sub(1); + } + } + } + return ss; + } + + /** @} */ + +public: + + /** @name Path-like manipulation methods */ + /** @{ */ + + basic_substring basename(C sep=C('/')) const + { + auto ss = pop_right(sep, /*skip_empty*/true); + ss = ss.trimr(sep); + return ss; + } + + basic_substring dirname(C sep=C('/')) const + { + auto ss = basename(sep); + ss = ss.empty() ? 
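+    // Illustrative expectations for the pop/gpop and path helpers above (a sketch,
+    // not part of the original c4core header):
+    //
+    //     csubstr("a/b/c").pop_right('/')  == "c";
+    //     csubstr("a/b/c").pop_left('/')   == "a";
+    //     csubstr("a/b/c").gpop_left('/')  == "a/b";
+    //     csubstr("a/b/c").gpop_right('/') == "b/c";
+    //     csubstr("/path/to/file.txt").basename() == "file.txt";
+    //     csubstr("/path/to/file.txt").dirname()  == "/path/to/";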
*this : left_of(ss); + return ss; + } + + C4_ALWAYS_INLINE basic_substring name_wo_extshort() const + { + return gpop_left('.'); + } + + C4_ALWAYS_INLINE basic_substring name_wo_extlong() const + { + return pop_left('.'); + } + + C4_ALWAYS_INLINE basic_substring extshort() const + { + return pop_right('.'); + } + + C4_ALWAYS_INLINE basic_substring extlong() const + { + return gpop_right('.'); + } + + /** @} */ + +public: + + /** @name Content-modification methods (only for non-const C) */ + /** @{ */ + + /** convert the string to upper-case + * @note this method requires that the string memory is writeable and is SFINAEd out for const C */ + C4_REQUIRE_RW(void) toupper() + { + for(size_t i = 0; i < len; ++i) + { + str[i] = static_cast(::toupper(str[i])); + } + } + + /** convert the string to lower-case + * @note this method requires that the string memory is writeable and is SFINAEd out for const C */ + C4_REQUIRE_RW(void) tolower() + { + for(size_t i = 0; i < len; ++i) + { + str[i] = static_cast(::tolower(str[i])); + } + } + +public: + + /** fill the entire contents with the given @p val + * @note this method requires that the string memory is writeable and is SFINAEd out for const C */ + C4_REQUIRE_RW(void) fill(C val) + { + for(size_t i = 0; i < len; ++i) + { + str[i] = val; + } + } + +public: + + /** set the current substring to a copy of the given csubstr + * @note this method requires that the string memory is writeable and is SFINAEd out for const C */ + C4_REQUIRE_RW(void) copy_from(ro_substr that, size_t ifirst=0, size_t num=npos) + { + C4_ASSERT(ifirst >= 0 && ifirst <= len); + num = num != npos ? num : len - ifirst; + num = num < that.len ? num : that.len; + C4_ASSERT(ifirst + num >= 0 && ifirst + num <= len); + // calling memcpy with null strings is undefined behavior + // and will wreak havoc in calling code's branches. + // see https://github.com/biojppm/rapidyaml/pull/264#issuecomment-1262133637 + if(num) + memcpy(str + sizeof(C) * ifirst, that.str, sizeof(C) * num); + } + +public: + + /** reverse in place + * @note this method requires that the string memory is writeable and is SFINAEd out for const C */ + C4_REQUIRE_RW(void) reverse() + { + if(len == 0) return; + detail::_do_reverse(str, str + len - 1); + } + + /** revert a subpart in place + * @note this method requires that the string memory is writeable and is SFINAEd out for const C */ + C4_REQUIRE_RW(void) reverse_sub(size_t ifirst, size_t num) + { + C4_ASSERT(ifirst >= 0 && ifirst <= len); + C4_ASSERT(ifirst + num >= 0 && ifirst + num <= len); + if(num == 0) return; + detail::_do_reverse(str + ifirst, str + ifirst + num - 1); + } + + /** revert a range in place + * @note this method requires that the string memory is writeable and is SFINAEd out for const C */ + C4_REQUIRE_RW(void) reverse_range(size_t ifirst, size_t ilast) + { + C4_ASSERT(ifirst >= 0 && ifirst <= len); + C4_ASSERT(ilast >= 0 && ilast <= len); + if(ifirst == ilast) return; + detail::_do_reverse(str + ifirst, str + ilast - 1); + } + +public: + + /** erase part of the string. 
eg, with char s[] = "0123456789", + * substr(s).erase(3, 2) = "01256789", and s is now "0125678989" + * @note this method requires that the string memory is writeable and is SFINAEd out for const C */ + C4_REQUIRE_RW(basic_substring) erase(size_t pos, size_t num) + { + C4_ASSERT(pos >= 0 && pos+num <= len); + size_t num_to_move = len - pos - num; + memmove(str + pos, str + pos + num, sizeof(C) * num_to_move); + return basic_substring{str, len - num}; + } + + /** @note this method requires that the string memory is writeable and is SFINAEd out for const C */ + C4_REQUIRE_RW(basic_substring) erase_range(size_t first, size_t last) + { + C4_ASSERT(first <= last); + return erase(first, static_cast<size_t>(last-first)); + } + + /** erase a part of the string. + * @note @p sub must be a substring of this string + * @note this method requires that the string memory is writeable and is SFINAEd out for const C */ + C4_REQUIRE_RW(basic_substring) erase(ro_substr sub) + { + C4_ASSERT(is_super(sub)); + C4_ASSERT(sub.str >= str); + return erase(static_cast<size_t>(sub.str - str), sub.len); + } + +public: + + /** replace every occurrence of character @p value with the character @p repl + * @return the number of characters that were replaced + * @note this method requires that the string memory is writeable and is SFINAEd out for const C */ + C4_REQUIRE_RW(size_t) replace(C value, C repl, size_t pos=0) + { + C4_ASSERT((pos >= 0 && pos <= len) || pos == npos); + size_t did_it = 0; + while((pos = find(value, pos)) != npos) + { + str[pos++] = repl; + ++did_it; + } + return did_it; + } + + /** replace every occurrence of each character in @p chars with + * the character @p repl. + * @return the number of characters that were replaced + * @note this method requires that the string memory is writeable and is SFINAEd out for const C */ + C4_REQUIRE_RW(size_t) replace(ro_substr chars, C repl, size_t pos=0) + { + C4_ASSERT((pos >= 0 && pos <= len) || pos == npos); + size_t did_it = 0; + while((pos = first_of(chars, pos)) != npos) + { + str[pos++] = repl; + ++did_it; + } + return did_it; + } + + /** replace @p pattern with @p repl, and write the result into + * @p dst. pattern and repl don't need equal sizes. + * + * @return the required size for dst. No overflow occurs if + * dst.len is smaller than the required size; this can be used to + * determine the required size for an existing container. */ + size_t replace_all(rw_substr dst, ro_substr pattern, ro_substr repl, size_t pos=0) const + { + C4_ASSERT( ! pattern.empty()); //!< @todo relax this precondition + C4_ASSERT( ! this ->overlaps(dst)); //!< @todo relax this precondition + C4_ASSERT( ! pattern.overlaps(dst)); + C4_ASSERT( ! 
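+    // Illustrative sketch of the erase()/replace()/replace_all() methods above
+    // (not part of the original c4core header; `buf_` is an assumed writeable
+    // character buffer wrapped in a substr):
+    //
+    //     char buf_[] = "0123456789";
+    //     substr buf = buf_;
+    //     buf.erase(3, 2);            // returns "01256789"; buf_ now holds "0125678989"
+    //     buf.replace('8', '_');      // in place; returns the number of replacements made
+    //
+    //     // replace_all() writes into a separate destination; calling it with an
+    //     // undersized (even empty) dst performs a pure sizing pass:
+    //     csubstr src = "ab.cd.ef";
+    //     size_t needed = src.replace_all(substr{}, ".", "::");  // == strlen("ab::cd::ef"), nothing written
+    //     // obtain a buffer of `needed` chars, wrap it in a substr dst, then
+    //     // src.replace_all(dst, ".", "::") fills dst with "ab::cd::ef".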
repl .overlaps(dst)); + C4_ASSERT((pos >= 0 && pos <= len) || pos == npos); + C4_SUPPRESS_WARNING_GCC_PUSH + C4_SUPPRESS_WARNING_GCC("-Warray-bounds") // gcc11 has a false positive here + #if (!defined(__clang__)) && (defined(__GNUC__) && (__GNUC__ >= 7)) + C4_SUPPRESS_WARNING_GCC("-Wstringop-overflow") // gcc11 has a false positive here + #endif + #define _c4append(first, last) \ + { \ + C4_ASSERT((last) >= (first)); \ + size_t num = static_cast((last) - (first)); \ + if(num > 0 && sz + num <= dst.len) \ + { \ + memcpy(dst.str + sz, first, num * sizeof(C)); \ + } \ + sz += num; \ + } + size_t sz = 0; + size_t b = pos; + _c4append(str, str + pos); + do { + size_t e = find(pattern, b); + if(e == npos) + { + _c4append(str + b, str + len); + break; + } + _c4append(str + b, str + e); + _c4append(repl.begin(), repl.end()); + b = e + pattern.size(); + } while(b < len && b != npos); + return sz; + #undef _c4append + C4_SUPPRESS_WARNING_GCC_POP + } + + /** @} */ + +}; // template class basic_substring + + +#undef C4_REQUIRE_RW +#undef C4_REQUIRE_RO +#undef C4_NC2C + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** Because of a C++ limitation, substr cannot provide simultaneous + * overloads for constructing from a char[N] and a char*; the latter + * will always be chosen by the compiler. So this specialization is + * provided to simplify obtaining a substr from a char*. Being a + * function has the advantage of highlighting the strlen() cost. + * + * @see to_csubstr + * @see For a more detailed explanation on why the overloads cannot + * coexist, see http://cplusplus.bordoon.com/specializeForCharacterArrays.html */ +inline substr to_substr(char *s) +{ + return substr(s, s ? strlen(s) : 0); +} + +/** Because of a C++ limitation, substr cannot provide simultaneous + * overloads for constructing from a char[N] and a char*; the latter + * will always be chosen by the compiler. So this specialization is + * provided to simplify obtaining a substr from a char*. Being a + * function has the advantage of highlighting the strlen() cost. + * + * @see to_substr + * @see For a more detailed explanation on why the overloads cannot + * coexist, see http://cplusplus.bordoon.com/specializeForCharacterArrays.html */ +inline csubstr to_csubstr(char *s) +{ + return csubstr(s, s ? strlen(s) : 0); +} + +/** Because of a C++ limitation, substr cannot provide simultaneous + * overloads for constructing from a const char[N] and a const char*; + * the latter will always be chosen by the compiler. So this + * specialization is provided to simplify obtaining a substr from a + * char*. Being a function has the advantage of highlighting the + * strlen() cost. + * + * @overload to_csubstr + * @see to_substr + * @see For a more detailed explanation on why the overloads cannot + * coexist, see http://cplusplus.bordoon.com/specializeForCharacterArrays.html */ +inline csubstr to_csubstr(const char *s) +{ + return csubstr(s, s ? 
strlen(s) : 0); +} + + +/** neutral version for use in generic code */ +inline csubstr to_csubstr(csubstr s) +{ + return s; +} + +/** neutral version for use in generic code */ +inline csubstr to_csubstr(substr s) +{ + return s; +} + +/** neutral version for use in generic code */ +inline substr to_substr(substr s) +{ + return s; +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +template inline bool operator== (const C (&s)[N], basic_substring const that) { return that.compare(s) == 0; } +template inline bool operator!= (const C (&s)[N], basic_substring const that) { return that.compare(s) != 0; } +template inline bool operator< (const C (&s)[N], basic_substring const that) { return that.compare(s) > 0; } +template inline bool operator> (const C (&s)[N], basic_substring const that) { return that.compare(s) < 0; } +template inline bool operator<= (const C (&s)[N], basic_substring const that) { return that.compare(s) >= 0; } +template inline bool operator>= (const C (&s)[N], basic_substring const that) { return that.compare(s) <= 0; } + +template inline bool operator== (C const c, basic_substring const that) { return that.compare(c) == 0; } +template inline bool operator!= (C const c, basic_substring const that) { return that.compare(c) != 0; } +template inline bool operator< (C const c, basic_substring const that) { return that.compare(c) > 0; } +template inline bool operator> (C const c, basic_substring const that) { return that.compare(c) < 0; } +template inline bool operator<= (C const c, basic_substring const that) { return that.compare(c) >= 0; } +template inline bool operator>= (C const c, basic_substring const that) { return that.compare(c) <= 0; } + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** @define C4_SUBSTR_NO_OSTREAM_LSHIFT doctest does not deal well with + * template operator<< + * @see https://github.com/onqtam/doctest/pull/431 */ +#ifndef C4_SUBSTR_NO_OSTREAM_LSHIFT +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wsign-conversion" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wsign-conversion" +#endif + +/** output the string to a stream */ +template +inline OStream& operator<< (OStream& os, basic_substring s) +{ + os.write(s.str, s.len); + return os; +} + +// this causes ambiguity +///** this is used by google test */ +//template +//inline void PrintTo(basic_substring s, OStream* os) +//{ +// os->write(s.str, s.len); +//} + +#ifdef __clang__ +# pragma clang diagnostic pop +#elif defined(__GNUC__) +# pragma GCC diagnostic pop +#endif +#endif // !C4_SUBSTR_NO_OSTREAM_LSHIFT + +} // namespace c4 + + +#ifdef __clang__ +# pragma clang diagnostic pop +#elif defined(__GNUC__) +# pragma GCC diagnostic pop +#endif + +#endif /* _C4_SUBSTR_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/substr.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/ext/fast_float.hpp +// https://github.com/biojppm/c4core/src/c4/ext/fast_float.hpp 
+//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_EXT_FAST_FLOAT_HPP_ +#define _C4_EXT_FAST_FLOAT_HPP_ + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable: 4996) // snprintf/scanf: this function or variable may be unsafe +#elif defined(__clang__) || defined(__APPLE_CC__) || defined(_LIBCPP_VERSION) +# pragma clang diagnostic push +# if (defined(__clang_major__) && _clang_major__ >= 9) || defined(__APPLE_CC__) +# pragma clang diagnostic ignored "-Wfortify-source" +# endif +# pragma clang diagnostic ignored "-Wshift-count-overflow" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wuseless-cast" +#endif + +// fast_float by Daniel Lemire +// fast_float by João Paulo Magalhaes + + +// with contributions from Eugene Golushkov +// with contributions from Maksim Kita +// with contributions from Marcin Wojdyr +// with contributions from Neal Richardson +// with contributions from Tim Paine +// with contributions from Fabio Pellacini + + +// Permission is hereby granted, free of charge, to any +// person obtaining a copy of this software and associated +// documentation files (the "Software"), to deal in the +// Software without restriction, including without +// limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of +// the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice +// shall be included in all copies or substantial portions +// of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +// SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. + + +#ifndef FASTFLOAT_FAST_FLOAT_H +#define FASTFLOAT_FAST_FLOAT_H + +#include + +namespace fast_float { +enum chars_format { + scientific = 1<<0, + fixed = 1<<2, + hex = 1<<3, + general = fixed | scientific +}; + + +struct from_chars_result { + const char *ptr; + std::errc ec; +}; + +struct parse_options { + constexpr explicit parse_options(chars_format fmt = chars_format::general, + char dot = '.') + : format(fmt), decimal_point(dot) {} + + /** Which number formats are accepted */ + chars_format format; + /** The character used as decimal point */ + char decimal_point; +}; + +/** + * This function parses the character sequence [first,last) for a number. It parses floating-point numbers expecting + * a locale-indepent format equivalent to what is used by std::strtod in the default ("C") locale. + * The resulting floating-point value is the closest floating-point values (using either float or double), + * using the "round to even" convention for values that would otherwise fall right in-between two values. + * That is, we provide exact parsing according to the IEEE standard. + * + * Given a successful parse, the pointer (`ptr`) in the returned value is set to point right after the + * parsed number, and the `value` referenced is set to the parsed value. 
In case of error, the returned + * `ec` contains a representative error, otherwise the default (`std::errc()`) value is stored. + * + * The implementation does not throw and does not allocate memory (e.g., with `new` or `malloc`). + * + * Like the C++17 standard, the `fast_float::from_chars` functions take an optional last argument of + * the type `fast_float::chars_format`. It is a bitset value: we check whether + * `fmt & fast_float::chars_format::fixed` and `fmt & fast_float::chars_format::scientific` are set + * to determine whether we allowe the fixed point and scientific notation respectively. + * The default is `fast_float::chars_format::general` which allows both `fixed` and `scientific`. + */ +template +from_chars_result from_chars(const char *first, const char *last, + T &value, chars_format fmt = chars_format::general) noexcept; + +/** + * Like from_chars, but accepts an `options` argument to govern number parsing. + */ +template +from_chars_result from_chars_advanced(const char *first, const char *last, + T &value, parse_options options) noexcept; + +} +#endif // FASTFLOAT_FAST_FLOAT_H + + +#ifndef FASTFLOAT_FLOAT_COMMON_H +#define FASTFLOAT_FLOAT_COMMON_H + +#include +//included above: +//#include +#include +//included above: +//#include + +#if (defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) \ + || defined(__amd64) || defined(__aarch64__) || defined(_M_ARM64) \ + || defined(__MINGW64__) \ + || defined(__s390x__) \ + || (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)) \ + || defined(__EMSCRIPTEN__)) +#define FASTFLOAT_64BIT +#elif (defined(__i386) || defined(__i386__) || defined(_M_IX86) \ + || defined(__arm__) || defined(_M_ARM) \ + || defined(__MINGW32__)) +#define FASTFLOAT_32BIT +#else + // Need to check incrementally, since SIZE_MAX is a size_t, avoid overflow. + // We can never tell the register width, but the SIZE_MAX is a good approximation. + // UINTPTR_MAX and INTPTR_MAX are optional, so avoid them for max portability. + #if SIZE_MAX == 0xffff + #error Unknown platform (16-bit, unsupported) + #elif SIZE_MAX == 0xffffffff + #define FASTFLOAT_32BIT + #elif SIZE_MAX == 0xffffffffffffffff + #define FASTFLOAT_64BIT + #else + #error Unknown platform (not 32-bit, not 64-bit?) 
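+// Illustrative use of the fast_float::from_chars() API declared above (a sketch,
+// not part of the original fast_float sources):
+//
+//     const char buf[] = "3.1416 xyz";
+//     double value;
+//     fast_float::from_chars_result res =
+//         fast_float::from_chars(buf, buf + sizeof(buf) - 1, value);
+//     // on success res.ec == std::errc() and res.ptr points at the ' ' just past
+//     // "3.1416"; value holds the closest double to 3.1416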
+ #endif +#endif + +#if ((defined(_WIN32) || defined(_WIN64)) && !defined(__clang__)) +//included above: +//#include +#endif + +#if defined(_MSC_VER) && !defined(__clang__) +#define FASTFLOAT_VISUAL_STUDIO 1 +#endif + +#ifdef _WIN32 +#define FASTFLOAT_IS_BIG_ENDIAN 0 +#else +#if defined(__APPLE__) || defined(__FreeBSD__) +#include +#elif defined(sun) || defined(__sun) +#include +#else +#include +#endif +# +#ifndef __BYTE_ORDER__ +// safe choice +#define FASTFLOAT_IS_BIG_ENDIAN 0 +#endif +# +#ifndef __ORDER_LITTLE_ENDIAN__ +// safe choice +#define FASTFLOAT_IS_BIG_ENDIAN 0 +#endif +# +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define FASTFLOAT_IS_BIG_ENDIAN 0 +#else +#define FASTFLOAT_IS_BIG_ENDIAN 1 +#endif +#endif + +#ifdef FASTFLOAT_VISUAL_STUDIO +#define fastfloat_really_inline __forceinline +#else +#define fastfloat_really_inline inline __attribute__((always_inline)) +#endif + +#ifndef FASTFLOAT_ASSERT +#define FASTFLOAT_ASSERT(x) { if (!(x)) abort(); } +#endif + +#ifndef FASTFLOAT_DEBUG_ASSERT +//included above: +//#include +#define FASTFLOAT_DEBUG_ASSERT(x) assert(x) +#endif + +// rust style `try!()` macro, or `?` operator +#define FASTFLOAT_TRY(x) { if (!(x)) return false; } + +namespace fast_float { + +// Compares two ASCII strings in a case insensitive manner. +inline bool fastfloat_strncasecmp(const char *input1, const char *input2, + size_t length) { + char running_diff{0}; + for (size_t i = 0; i < length; i++) { + running_diff |= (input1[i] ^ input2[i]); + } + return (running_diff == 0) || (running_diff == 32); +} + +#ifndef FLT_EVAL_METHOD +#error "FLT_EVAL_METHOD should be defined, please include cfloat." +#endif + +// a pointer and a length to a contiguous block of memory +template +struct span { + const T* ptr; + size_t length; + span(const T* _ptr, size_t _length) : ptr(_ptr), length(_length) {} + span() : ptr(nullptr), length(0) {} + + constexpr size_t len() const noexcept { + return length; + } + + const T& operator[](size_t index) const noexcept { + FASTFLOAT_DEBUG_ASSERT(index < length); + return ptr[index]; + } +}; + +struct value128 { + uint64_t low; + uint64_t high; + value128(uint64_t _low, uint64_t _high) : low(_low), high(_high) {} + value128() : low(0), high(0) {} +}; + +/* result might be undefined when input_num is zero */ +fastfloat_really_inline int leading_zeroes(uint64_t input_num) { + assert(input_num > 0); +#ifdef FASTFLOAT_VISUAL_STUDIO + #if defined(_M_X64) || defined(_M_ARM64) + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). 
+ _BitScanReverse64(&leading_zero, input_num); + return (int)(63 - leading_zero); + #else + int last_bit = 0; + if(input_num & uint64_t(0xffffffff00000000)) input_num >>= 32, last_bit |= 32; + if(input_num & uint64_t( 0xffff0000)) input_num >>= 16, last_bit |= 16; + if(input_num & uint64_t( 0xff00)) input_num >>= 8, last_bit |= 8; + if(input_num & uint64_t( 0xf0)) input_num >>= 4, last_bit |= 4; + if(input_num & uint64_t( 0xc)) input_num >>= 2, last_bit |= 2; + if(input_num & uint64_t( 0x2)) input_num >>= 1, last_bit |= 1; + return 63 - last_bit; + #endif +#else + return __builtin_clzll(input_num); +#endif +} + +#ifdef FASTFLOAT_32BIT + +// slow emulation routine for 32-bit +fastfloat_really_inline uint64_t emulu(uint32_t x, uint32_t y) { + return x * (uint64_t)y; +} + +// slow emulation routine for 32-bit +#if !defined(__MINGW64__) +fastfloat_really_inline uint64_t _umul128(uint64_t ab, uint64_t cd, + uint64_t *hi) { + uint64_t ad = emulu((uint32_t)(ab >> 32), (uint32_t)cd); + uint64_t bd = emulu((uint32_t)ab, (uint32_t)cd); + uint64_t adbc = ad + emulu((uint32_t)ab, (uint32_t)(cd >> 32)); + uint64_t adbc_carry = !!(adbc < ad); + uint64_t lo = bd + (adbc << 32); + *hi = emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) + + (adbc_carry << 32) + !!(lo < bd); + return lo; +} +#endif // !__MINGW64__ + +#endif // FASTFLOAT_32BIT + + +// compute 64-bit a*b +fastfloat_really_inline value128 full_multiplication(uint64_t a, + uint64_t b) { + value128 answer; +#ifdef _M_ARM64 + // ARM64 has native support for 64-bit multiplications, no need to emulate + answer.high = __umulh(a, b); + answer.low = a * b; +#elif defined(FASTFLOAT_32BIT) || (defined(_WIN64) && !defined(__clang__)) + answer.low = _umul128(a, b, &answer.high); // _umul128 not available on ARM64 +#elif defined(FASTFLOAT_64BIT) + __uint128_t r = ((__uint128_t)a) * b; + answer.low = uint64_t(r); + answer.high = uint64_t(r >> 64); +#else + #error Not implemented +#endif + return answer; +} + +struct adjusted_mantissa { + uint64_t mantissa{0}; + int32_t power2{0}; // a negative value indicates an invalid result + adjusted_mantissa() = default; + bool operator==(const adjusted_mantissa &o) const { + return mantissa == o.mantissa && power2 == o.power2; + } + bool operator!=(const adjusted_mantissa &o) const { + return mantissa != o.mantissa || power2 != o.power2; + } +}; + +// Bias so we can get the real exponent with an invalid adjusted_mantissa. 
+constexpr static int32_t invalid_am_bias = -0x8000; + +constexpr static double powers_of_ten_double[] = { + 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, + 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22}; +constexpr static float powers_of_ten_float[] = {1e0, 1e1, 1e2, 1e3, 1e4, 1e5, + 1e6, 1e7, 1e8, 1e9, 1e10}; + +template struct binary_format { + static inline constexpr int mantissa_explicit_bits(); + static inline constexpr int minimum_exponent(); + static inline constexpr int infinite_power(); + static inline constexpr int sign_index(); + static inline constexpr int min_exponent_fast_path(); + static inline constexpr int max_exponent_fast_path(); + static inline constexpr int max_exponent_round_to_even(); + static inline constexpr int min_exponent_round_to_even(); + static inline constexpr uint64_t max_mantissa_fast_path(); + static inline constexpr int largest_power_of_ten(); + static inline constexpr int smallest_power_of_ten(); + static inline constexpr T exact_power_of_ten(int64_t power); + static inline constexpr size_t max_digits(); +}; + +template <> inline constexpr int binary_format::mantissa_explicit_bits() { + return 52; +} +template <> inline constexpr int binary_format::mantissa_explicit_bits() { + return 23; +} + +template <> inline constexpr int binary_format::max_exponent_round_to_even() { + return 23; +} + +template <> inline constexpr int binary_format::max_exponent_round_to_even() { + return 10; +} + +template <> inline constexpr int binary_format::min_exponent_round_to_even() { + return -4; +} + +template <> inline constexpr int binary_format::min_exponent_round_to_even() { + return -17; +} + +template <> inline constexpr int binary_format::minimum_exponent() { + return -1023; +} +template <> inline constexpr int binary_format::minimum_exponent() { + return -127; +} + +template <> inline constexpr int binary_format::infinite_power() { + return 0x7FF; +} +template <> inline constexpr int binary_format::infinite_power() { + return 0xFF; +} + +template <> inline constexpr int binary_format::sign_index() { return 63; } +template <> inline constexpr int binary_format::sign_index() { return 31; } + +template <> inline constexpr int binary_format::min_exponent_fast_path() { +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + return 0; +#else + return -22; +#endif +} +template <> inline constexpr int binary_format::min_exponent_fast_path() { +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + return 0; +#else + return -10; +#endif +} + +template <> inline constexpr int binary_format::max_exponent_fast_path() { + return 22; +} +template <> inline constexpr int binary_format::max_exponent_fast_path() { + return 10; +} + +template <> inline constexpr uint64_t binary_format::max_mantissa_fast_path() { + return uint64_t(2) << mantissa_explicit_bits(); +} +template <> inline constexpr uint64_t binary_format::max_mantissa_fast_path() { + return uint64_t(2) << mantissa_explicit_bits(); +} + +template <> +inline constexpr double binary_format::exact_power_of_ten(int64_t power) { + return powers_of_ten_double[power]; +} +template <> +inline constexpr float binary_format::exact_power_of_ten(int64_t power) { + + return powers_of_ten_float[power]; +} + + +template <> +inline constexpr int binary_format::largest_power_of_ten() { + return 308; +} +template <> +inline constexpr int binary_format::largest_power_of_ten() { + return 38; +} + +template <> +inline constexpr int binary_format::smallest_power_of_ten() { + return -342; +} +template <> 
+inline constexpr int binary_format::smallest_power_of_ten() { + return -65; +} + +template <> inline constexpr size_t binary_format::max_digits() { + return 769; +} +template <> inline constexpr size_t binary_format::max_digits() { + return 114; +} + +template +fastfloat_really_inline void to_float(bool negative, adjusted_mantissa am, T &value) { + uint64_t word = am.mantissa; + word |= uint64_t(am.power2) << binary_format::mantissa_explicit_bits(); + word = negative + ? word | (uint64_t(1) << binary_format::sign_index()) : word; +#if FASTFLOAT_IS_BIG_ENDIAN == 1 + if (std::is_same::value) { + ::memcpy(&value, (char *)&word + 4, sizeof(T)); // extract value at offset 4-7 if float on big-endian + } else { + ::memcpy(&value, &word, sizeof(T)); + } +#else + // For little-endian systems: + ::memcpy(&value, &word, sizeof(T)); +#endif +} + +} // namespace fast_float + +#endif + + +#ifndef FASTFLOAT_ASCII_NUMBER_H +#define FASTFLOAT_ASCII_NUMBER_H + +//included above: +//#include +//included above: +//#include +//included above: +//#include +#include + + +namespace fast_float { + +// Next function can be micro-optimized, but compilers are entirely +// able to optimize it well. +fastfloat_really_inline bool is_integer(char c) noexcept { return c >= '0' && c <= '9'; } + +fastfloat_really_inline uint64_t byteswap(uint64_t val) { + return (val & 0xFF00000000000000) >> 56 + | (val & 0x00FF000000000000) >> 40 + | (val & 0x0000FF0000000000) >> 24 + | (val & 0x000000FF00000000) >> 8 + | (val & 0x00000000FF000000) << 8 + | (val & 0x0000000000FF0000) << 24 + | (val & 0x000000000000FF00) << 40 + | (val & 0x00000000000000FF) << 56; +} + +fastfloat_really_inline uint64_t read_u64(const char *chars) { + uint64_t val; + ::memcpy(&val, chars, sizeof(uint64_t)); +#if FASTFLOAT_IS_BIG_ENDIAN == 1 + // Need to read as-if the number was in little-endian order. + val = byteswap(val); +#endif + return val; +} + +fastfloat_really_inline void write_u64(uint8_t *chars, uint64_t val) { +#if FASTFLOAT_IS_BIG_ENDIAN == 1 + // Need to read as-if the number was in little-endian order. 
+ val = byteswap(val); +#endif + ::memcpy(chars, &val, sizeof(uint64_t)); +} + +// credit @aqrit +fastfloat_really_inline uint32_t parse_eight_digits_unrolled(uint64_t val) { + const uint64_t mask = 0x000000FF000000FF; + const uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32) + const uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32) + val -= 0x3030303030303030; + val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8; + val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32; + return uint32_t(val); +} + +fastfloat_really_inline uint32_t parse_eight_digits_unrolled(const char *chars) noexcept { + return parse_eight_digits_unrolled(read_u64(chars)); +} + +// credit @aqrit +fastfloat_really_inline bool is_made_of_eight_digits_fast(uint64_t val) noexcept { + return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) & + 0x8080808080808080)); +} + +fastfloat_really_inline bool is_made_of_eight_digits_fast(const char *chars) noexcept { + return is_made_of_eight_digits_fast(read_u64(chars)); +} + +typedef span byte_span; + +struct parsed_number_string { + int64_t exponent{0}; + uint64_t mantissa{0}; + const char *lastmatch{nullptr}; + bool negative{false}; + bool valid{false}; + bool too_many_digits{false}; + // contains the range of the significant digits + byte_span integer{}; // non-nullable + byte_span fraction{}; // nullable +}; + +// Assuming that you use no more than 19 digits, this will +// parse an ASCII string. +fastfloat_really_inline +parsed_number_string parse_number_string(const char *p, const char *pend, parse_options options) noexcept { + const chars_format fmt = options.format; + const char decimal_point = options.decimal_point; + + parsed_number_string answer; + answer.valid = false; + answer.too_many_digits = false; + answer.negative = (*p == '-'); + if (*p == '-') { // C++17 20.19.3.(7.1) explicitly forbids '+' sign here + ++p; + if (p == pend) { + return answer; + } + if (!is_integer(*p) && (*p != decimal_point)) { // a sign must be followed by an integer or the dot + return answer; + } + } + const char *const start_digits = p; + + uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad) + + while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) { + i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok + p += 8; + } + while ((p != pend) && is_integer(*p)) { + // a multiplication by 10 is cheaper than an arbitrary integer + // multiplication + i = 10 * i + + uint64_t(*p - '0'); // might overflow, we will handle the overflow later + ++p; + } + const char *const end_of_integer_part = p; + int64_t digit_count = int64_t(end_of_integer_part - start_digits); + answer.integer = byte_span(start_digits, size_t(digit_count)); + int64_t exponent = 0; + if ((p != pend) && (*p == decimal_point)) { + ++p; + const char* before = p; + // can occur at most twice without overflowing, but let it occur more, since + // for integers with many digits, digit parsing is the primary bottleneck. 
+ while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) { + i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok + p += 8; + } + while ((p != pend) && is_integer(*p)) { + uint8_t digit = uint8_t(*p - '0'); + ++p; + i = i * 10 + digit; // in rare cases, this will overflow, but that's ok + } + exponent = before - p; + answer.fraction = byte_span(before, size_t(p - before)); + digit_count -= exponent; + } + // we must have encountered at least one integer! + if (digit_count == 0) { + return answer; + } + int64_t exp_number = 0; // explicit exponential part + if ((fmt & chars_format::scientific) && (p != pend) && (('e' == *p) || ('E' == *p))) { + const char * location_of_e = p; + ++p; + bool neg_exp = false; + if ((p != pend) && ('-' == *p)) { + neg_exp = true; + ++p; + } else if ((p != pend) && ('+' == *p)) { // '+' on exponent is allowed by C++17 20.19.3.(7.1) + ++p; + } + if ((p == pend) || !is_integer(*p)) { + if(!(fmt & chars_format::fixed)) { + // We are in error. + return answer; + } + // Otherwise, we will be ignoring the 'e'. + p = location_of_e; + } else { + while ((p != pend) && is_integer(*p)) { + uint8_t digit = uint8_t(*p - '0'); + if (exp_number < 0x10000000) { + exp_number = 10 * exp_number + digit; + } + ++p; + } + if(neg_exp) { exp_number = - exp_number; } + exponent += exp_number; + } + } else { + // If it scientific and not fixed, we have to bail out. + if((fmt & chars_format::scientific) && !(fmt & chars_format::fixed)) { return answer; } + } + answer.lastmatch = p; + answer.valid = true; + + // If we frequently had to deal with long strings of digits, + // we could extend our code by using a 128-bit integer instead + // of a 64-bit integer. However, this is uncommon. + // + // We can deal with up to 19 digits. + if (digit_count > 19) { // this is uncommon + // It is possible that the integer had an overflow. + // We have to handle the case where we have 0.0000somenumber. + // We need to be mindful of the case where we only have zeroes... + // E.g., 0.000000000...000. + const char *start = start_digits; + while ((start != pend) && (*start == '0' || *start == decimal_point)) { + if(*start == '0') { digit_count --; } + start++; + } + if (digit_count > 19) { + answer.too_many_digits = true; + // Let us start again, this time, avoiding overflows. + // We don't need to check if is_integer, since we use the + // pre-tokenized spans from above. + i = 0; + p = answer.integer.ptr; + const char* int_end = p + answer.integer.len(); + const uint64_t minimal_nineteen_digit_integer{1000000000000000000}; + while((i < minimal_nineteen_digit_integer) && (p != int_end)) { + i = i * 10 + uint64_t(*p - '0'); + ++p; + } + if (i >= minimal_nineteen_digit_integer) { // We have a big integers + exponent = end_of_integer_part - p + exp_number; + } else { // We have a value with a fractional component. 
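+        // Worked illustration of parse_number_string() on a short input (a sketch,
+        // not part of the original fast_float sources): for "123.456e+7" the integer
+        // digits give i = 123, the fraction digits extend it to i = 123456 with
+        // exponent = -3, and the explicit exponent adds +7, so the function reports
+        // mantissa = 123456 and exponent = 4, i.e. 123456 x 10^4 == 123.456e7.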
+ p = answer.fraction.ptr; + const char* frac_end = p + answer.fraction.len(); + while((i < minimal_nineteen_digit_integer) && (p != frac_end)) { + i = i * 10 + uint64_t(*p - '0'); + ++p; + } + exponent = answer.fraction.ptr - p + exp_number; + } + // We have now corrected both exponent and i, to a truncated value + } + } + answer.exponent = exponent; + answer.mantissa = i; + return answer; +} + +} // namespace fast_float + +#endif + + +#ifndef FASTFLOAT_FAST_TABLE_H +#define FASTFLOAT_FAST_TABLE_H + +//included above: +//#include + +namespace fast_float { + +/** + * When mapping numbers from decimal to binary, + * we go from w * 10^q to m * 2^p but we have + * 10^q = 5^q * 2^q, so effectively + * we are trying to match + * w * 2^q * 5^q to m * 2^p. Thus the powers of two + * are not a concern since they can be represented + * exactly using the binary notation, only the powers of five + * affect the binary significand. + */ + +/** + * The smallest non-zero float (binary64) is 2^−1074. + * We take as input numbers of the form w x 10^q where w < 2^64. + * We have that w * 10^-343 < 2^(64-344) 5^-343 < 2^-1076. + * However, we have that + * (2^64-1) * 10^-342 = (2^64-1) * 2^-342 * 5^-342 > 2^−1074. + * Thus it is possible for a number of the form w * 10^-342 where + * w is a 64-bit value to be a non-zero floating-point number. + ********* + * Any number of form w * 10^309 where w>= 1 is going to be + * infinite in binary64 so we never need to worry about powers + * of 5 greater than 308. + */ +template +struct powers_template { + +constexpr static int smallest_power_of_five = binary_format::smallest_power_of_ten(); +constexpr static int largest_power_of_five = binary_format::largest_power_of_ten(); +constexpr static int number_of_entries = 2 * (largest_power_of_five - smallest_power_of_five + 1); +// Powers of five from 5^-342 all the way to 5^308 rounded toward one. 
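+// Worked illustration of the decimal-to-binary mapping described above (a sketch,
+// not part of the original fast_float sources): since 10^q = 5^q * 2^q, e.g.
+// 7 * 10^3 = 7 * 5^3 * 2^3 = 875 * 2^3; the 2^q factor only shifts the binary
+// exponent, so only the 5^q factor (tabulated below) affects the significand.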
+static const uint64_t power_of_five_128[number_of_entries]; +}; + +template +const uint64_t powers_template::power_of_five_128[number_of_entries] = { + 0xeef453d6923bd65a,0x113faa2906a13b3f, + 0x9558b4661b6565f8,0x4ac7ca59a424c507, + 0xbaaee17fa23ebf76,0x5d79bcf00d2df649, + 0xe95a99df8ace6f53,0xf4d82c2c107973dc, + 0x91d8a02bb6c10594,0x79071b9b8a4be869, + 0xb64ec836a47146f9,0x9748e2826cdee284, + 0xe3e27a444d8d98b7,0xfd1b1b2308169b25, + 0x8e6d8c6ab0787f72,0xfe30f0f5e50e20f7, + 0xb208ef855c969f4f,0xbdbd2d335e51a935, + 0xde8b2b66b3bc4723,0xad2c788035e61382, + 0x8b16fb203055ac76,0x4c3bcb5021afcc31, + 0xaddcb9e83c6b1793,0xdf4abe242a1bbf3d, + 0xd953e8624b85dd78,0xd71d6dad34a2af0d, + 0x87d4713d6f33aa6b,0x8672648c40e5ad68, + 0xa9c98d8ccb009506,0x680efdaf511f18c2, + 0xd43bf0effdc0ba48,0x212bd1b2566def2, + 0x84a57695fe98746d,0x14bb630f7604b57, + 0xa5ced43b7e3e9188,0x419ea3bd35385e2d, + 0xcf42894a5dce35ea,0x52064cac828675b9, + 0x818995ce7aa0e1b2,0x7343efebd1940993, + 0xa1ebfb4219491a1f,0x1014ebe6c5f90bf8, + 0xca66fa129f9b60a6,0xd41a26e077774ef6, + 0xfd00b897478238d0,0x8920b098955522b4, + 0x9e20735e8cb16382,0x55b46e5f5d5535b0, + 0xc5a890362fddbc62,0xeb2189f734aa831d, + 0xf712b443bbd52b7b,0xa5e9ec7501d523e4, + 0x9a6bb0aa55653b2d,0x47b233c92125366e, + 0xc1069cd4eabe89f8,0x999ec0bb696e840a, + 0xf148440a256e2c76,0xc00670ea43ca250d, + 0x96cd2a865764dbca,0x380406926a5e5728, + 0xbc807527ed3e12bc,0xc605083704f5ecf2, + 0xeba09271e88d976b,0xf7864a44c633682e, + 0x93445b8731587ea3,0x7ab3ee6afbe0211d, + 0xb8157268fdae9e4c,0x5960ea05bad82964, + 0xe61acf033d1a45df,0x6fb92487298e33bd, + 0x8fd0c16206306bab,0xa5d3b6d479f8e056, + 0xb3c4f1ba87bc8696,0x8f48a4899877186c, + 0xe0b62e2929aba83c,0x331acdabfe94de87, + 0x8c71dcd9ba0b4925,0x9ff0c08b7f1d0b14, + 0xaf8e5410288e1b6f,0x7ecf0ae5ee44dd9, + 0xdb71e91432b1a24a,0xc9e82cd9f69d6150, + 0x892731ac9faf056e,0xbe311c083a225cd2, + 0xab70fe17c79ac6ca,0x6dbd630a48aaf406, + 0xd64d3d9db981787d,0x92cbbccdad5b108, + 0x85f0468293f0eb4e,0x25bbf56008c58ea5, + 0xa76c582338ed2621,0xaf2af2b80af6f24e, + 0xd1476e2c07286faa,0x1af5af660db4aee1, + 0x82cca4db847945ca,0x50d98d9fc890ed4d, + 0xa37fce126597973c,0xe50ff107bab528a0, + 0xcc5fc196fefd7d0c,0x1e53ed49a96272c8, + 0xff77b1fcbebcdc4f,0x25e8e89c13bb0f7a, + 0x9faacf3df73609b1,0x77b191618c54e9ac, + 0xc795830d75038c1d,0xd59df5b9ef6a2417, + 0xf97ae3d0d2446f25,0x4b0573286b44ad1d, + 0x9becce62836ac577,0x4ee367f9430aec32, + 0xc2e801fb244576d5,0x229c41f793cda73f, + 0xf3a20279ed56d48a,0x6b43527578c1110f, + 0x9845418c345644d6,0x830a13896b78aaa9, + 0xbe5691ef416bd60c,0x23cc986bc656d553, + 0xedec366b11c6cb8f,0x2cbfbe86b7ec8aa8, + 0x94b3a202eb1c3f39,0x7bf7d71432f3d6a9, + 0xb9e08a83a5e34f07,0xdaf5ccd93fb0cc53, + 0xe858ad248f5c22c9,0xd1b3400f8f9cff68, + 0x91376c36d99995be,0x23100809b9c21fa1, + 0xb58547448ffffb2d,0xabd40a0c2832a78a, + 0xe2e69915b3fff9f9,0x16c90c8f323f516c, + 0x8dd01fad907ffc3b,0xae3da7d97f6792e3, + 0xb1442798f49ffb4a,0x99cd11cfdf41779c, + 0xdd95317f31c7fa1d,0x40405643d711d583, + 0x8a7d3eef7f1cfc52,0x482835ea666b2572, + 0xad1c8eab5ee43b66,0xda3243650005eecf, + 0xd863b256369d4a40,0x90bed43e40076a82, + 0x873e4f75e2224e68,0x5a7744a6e804a291, + 0xa90de3535aaae202,0x711515d0a205cb36, + 0xd3515c2831559a83,0xd5a5b44ca873e03, + 0x8412d9991ed58091,0xe858790afe9486c2, + 0xa5178fff668ae0b6,0x626e974dbe39a872, + 0xce5d73ff402d98e3,0xfb0a3d212dc8128f, + 0x80fa687f881c7f8e,0x7ce66634bc9d0b99, + 0xa139029f6a239f72,0x1c1fffc1ebc44e80, + 0xc987434744ac874e,0xa327ffb266b56220, + 0xfbe9141915d7a922,0x4bf1ff9f0062baa8, + 0x9d71ac8fada6c9b5,0x6f773fc3603db4a9, + 
0xc4ce17b399107c22,0xcb550fb4384d21d3, + 0xf6019da07f549b2b,0x7e2a53a146606a48, + 0x99c102844f94e0fb,0x2eda7444cbfc426d, + 0xc0314325637a1939,0xfa911155fefb5308, + 0xf03d93eebc589f88,0x793555ab7eba27ca, + 0x96267c7535b763b5,0x4bc1558b2f3458de, + 0xbbb01b9283253ca2,0x9eb1aaedfb016f16, + 0xea9c227723ee8bcb,0x465e15a979c1cadc, + 0x92a1958a7675175f,0xbfacd89ec191ec9, + 0xb749faed14125d36,0xcef980ec671f667b, + 0xe51c79a85916f484,0x82b7e12780e7401a, + 0x8f31cc0937ae58d2,0xd1b2ecb8b0908810, + 0xb2fe3f0b8599ef07,0x861fa7e6dcb4aa15, + 0xdfbdcece67006ac9,0x67a791e093e1d49a, + 0x8bd6a141006042bd,0xe0c8bb2c5c6d24e0, + 0xaecc49914078536d,0x58fae9f773886e18, + 0xda7f5bf590966848,0xaf39a475506a899e, + 0x888f99797a5e012d,0x6d8406c952429603, + 0xaab37fd7d8f58178,0xc8e5087ba6d33b83, + 0xd5605fcdcf32e1d6,0xfb1e4a9a90880a64, + 0x855c3be0a17fcd26,0x5cf2eea09a55067f, + 0xa6b34ad8c9dfc06f,0xf42faa48c0ea481e, + 0xd0601d8efc57b08b,0xf13b94daf124da26, + 0x823c12795db6ce57,0x76c53d08d6b70858, + 0xa2cb1717b52481ed,0x54768c4b0c64ca6e, + 0xcb7ddcdda26da268,0xa9942f5dcf7dfd09, + 0xfe5d54150b090b02,0xd3f93b35435d7c4c, + 0x9efa548d26e5a6e1,0xc47bc5014a1a6daf, + 0xc6b8e9b0709f109a,0x359ab6419ca1091b, + 0xf867241c8cc6d4c0,0xc30163d203c94b62, + 0x9b407691d7fc44f8,0x79e0de63425dcf1d, + 0xc21094364dfb5636,0x985915fc12f542e4, + 0xf294b943e17a2bc4,0x3e6f5b7b17b2939d, + 0x979cf3ca6cec5b5a,0xa705992ceecf9c42, + 0xbd8430bd08277231,0x50c6ff782a838353, + 0xece53cec4a314ebd,0xa4f8bf5635246428, + 0x940f4613ae5ed136,0x871b7795e136be99, + 0xb913179899f68584,0x28e2557b59846e3f, + 0xe757dd7ec07426e5,0x331aeada2fe589cf, + 0x9096ea6f3848984f,0x3ff0d2c85def7621, + 0xb4bca50b065abe63,0xfed077a756b53a9, + 0xe1ebce4dc7f16dfb,0xd3e8495912c62894, + 0x8d3360f09cf6e4bd,0x64712dd7abbbd95c, + 0xb080392cc4349dec,0xbd8d794d96aacfb3, + 0xdca04777f541c567,0xecf0d7a0fc5583a0, + 0x89e42caaf9491b60,0xf41686c49db57244, + 0xac5d37d5b79b6239,0x311c2875c522ced5, + 0xd77485cb25823ac7,0x7d633293366b828b, + 0x86a8d39ef77164bc,0xae5dff9c02033197, + 0xa8530886b54dbdeb,0xd9f57f830283fdfc, + 0xd267caa862a12d66,0xd072df63c324fd7b, + 0x8380dea93da4bc60,0x4247cb9e59f71e6d, + 0xa46116538d0deb78,0x52d9be85f074e608, + 0xcd795be870516656,0x67902e276c921f8b, + 0x806bd9714632dff6,0xba1cd8a3db53b6, + 0xa086cfcd97bf97f3,0x80e8a40eccd228a4, + 0xc8a883c0fdaf7df0,0x6122cd128006b2cd, + 0xfad2a4b13d1b5d6c,0x796b805720085f81, + 0x9cc3a6eec6311a63,0xcbe3303674053bb0, + 0xc3f490aa77bd60fc,0xbedbfc4411068a9c, + 0xf4f1b4d515acb93b,0xee92fb5515482d44, + 0x991711052d8bf3c5,0x751bdd152d4d1c4a, + 0xbf5cd54678eef0b6,0xd262d45a78a0635d, + 0xef340a98172aace4,0x86fb897116c87c34, + 0x9580869f0e7aac0e,0xd45d35e6ae3d4da0, + 0xbae0a846d2195712,0x8974836059cca109, + 0xe998d258869facd7,0x2bd1a438703fc94b, + 0x91ff83775423cc06,0x7b6306a34627ddcf, + 0xb67f6455292cbf08,0x1a3bc84c17b1d542, + 0xe41f3d6a7377eeca,0x20caba5f1d9e4a93, + 0x8e938662882af53e,0x547eb47b7282ee9c, + 0xb23867fb2a35b28d,0xe99e619a4f23aa43, + 0xdec681f9f4c31f31,0x6405fa00e2ec94d4, + 0x8b3c113c38f9f37e,0xde83bc408dd3dd04, + 0xae0b158b4738705e,0x9624ab50b148d445, + 0xd98ddaee19068c76,0x3badd624dd9b0957, + 0x87f8a8d4cfa417c9,0xe54ca5d70a80e5d6, + 0xa9f6d30a038d1dbc,0x5e9fcf4ccd211f4c, + 0xd47487cc8470652b,0x7647c3200069671f, + 0x84c8d4dfd2c63f3b,0x29ecd9f40041e073, + 0xa5fb0a17c777cf09,0xf468107100525890, + 0xcf79cc9db955c2cc,0x7182148d4066eeb4, + 0x81ac1fe293d599bf,0xc6f14cd848405530, + 0xa21727db38cb002f,0xb8ada00e5a506a7c, + 0xca9cf1d206fdc03b,0xa6d90811f0e4851c, + 0xfd442e4688bd304a,0x908f4a166d1da663, + 
0x9e4a9cec15763e2e,0x9a598e4e043287fe, + 0xc5dd44271ad3cdba,0x40eff1e1853f29fd, + 0xf7549530e188c128,0xd12bee59e68ef47c, + 0x9a94dd3e8cf578b9,0x82bb74f8301958ce, + 0xc13a148e3032d6e7,0xe36a52363c1faf01, + 0xf18899b1bc3f8ca1,0xdc44e6c3cb279ac1, + 0x96f5600f15a7b7e5,0x29ab103a5ef8c0b9, + 0xbcb2b812db11a5de,0x7415d448f6b6f0e7, + 0xebdf661791d60f56,0x111b495b3464ad21, + 0x936b9fcebb25c995,0xcab10dd900beec34, + 0xb84687c269ef3bfb,0x3d5d514f40eea742, + 0xe65829b3046b0afa,0xcb4a5a3112a5112, + 0x8ff71a0fe2c2e6dc,0x47f0e785eaba72ab, + 0xb3f4e093db73a093,0x59ed216765690f56, + 0xe0f218b8d25088b8,0x306869c13ec3532c, + 0x8c974f7383725573,0x1e414218c73a13fb, + 0xafbd2350644eeacf,0xe5d1929ef90898fa, + 0xdbac6c247d62a583,0xdf45f746b74abf39, + 0x894bc396ce5da772,0x6b8bba8c328eb783, + 0xab9eb47c81f5114f,0x66ea92f3f326564, + 0xd686619ba27255a2,0xc80a537b0efefebd, + 0x8613fd0145877585,0xbd06742ce95f5f36, + 0xa798fc4196e952e7,0x2c48113823b73704, + 0xd17f3b51fca3a7a0,0xf75a15862ca504c5, + 0x82ef85133de648c4,0x9a984d73dbe722fb, + 0xa3ab66580d5fdaf5,0xc13e60d0d2e0ebba, + 0xcc963fee10b7d1b3,0x318df905079926a8, + 0xffbbcfe994e5c61f,0xfdf17746497f7052, + 0x9fd561f1fd0f9bd3,0xfeb6ea8bedefa633, + 0xc7caba6e7c5382c8,0xfe64a52ee96b8fc0, + 0xf9bd690a1b68637b,0x3dfdce7aa3c673b0, + 0x9c1661a651213e2d,0x6bea10ca65c084e, + 0xc31bfa0fe5698db8,0x486e494fcff30a62, + 0xf3e2f893dec3f126,0x5a89dba3c3efccfa, + 0x986ddb5c6b3a76b7,0xf89629465a75e01c, + 0xbe89523386091465,0xf6bbb397f1135823, + 0xee2ba6c0678b597f,0x746aa07ded582e2c, + 0x94db483840b717ef,0xa8c2a44eb4571cdc, + 0xba121a4650e4ddeb,0x92f34d62616ce413, + 0xe896a0d7e51e1566,0x77b020baf9c81d17, + 0x915e2486ef32cd60,0xace1474dc1d122e, + 0xb5b5ada8aaff80b8,0xd819992132456ba, + 0xe3231912d5bf60e6,0x10e1fff697ed6c69, + 0x8df5efabc5979c8f,0xca8d3ffa1ef463c1, + 0xb1736b96b6fd83b3,0xbd308ff8a6b17cb2, + 0xddd0467c64bce4a0,0xac7cb3f6d05ddbde, + 0x8aa22c0dbef60ee4,0x6bcdf07a423aa96b, + 0xad4ab7112eb3929d,0x86c16c98d2c953c6, + 0xd89d64d57a607744,0xe871c7bf077ba8b7, + 0x87625f056c7c4a8b,0x11471cd764ad4972, + 0xa93af6c6c79b5d2d,0xd598e40d3dd89bcf, + 0xd389b47879823479,0x4aff1d108d4ec2c3, + 0x843610cb4bf160cb,0xcedf722a585139ba, + 0xa54394fe1eedb8fe,0xc2974eb4ee658828, + 0xce947a3da6a9273e,0x733d226229feea32, + 0x811ccc668829b887,0x806357d5a3f525f, + 0xa163ff802a3426a8,0xca07c2dcb0cf26f7, + 0xc9bcff6034c13052,0xfc89b393dd02f0b5, + 0xfc2c3f3841f17c67,0xbbac2078d443ace2, + 0x9d9ba7832936edc0,0xd54b944b84aa4c0d, + 0xc5029163f384a931,0xa9e795e65d4df11, + 0xf64335bcf065d37d,0x4d4617b5ff4a16d5, + 0x99ea0196163fa42e,0x504bced1bf8e4e45, + 0xc06481fb9bcf8d39,0xe45ec2862f71e1d6, + 0xf07da27a82c37088,0x5d767327bb4e5a4c, + 0x964e858c91ba2655,0x3a6a07f8d510f86f, + 0xbbe226efb628afea,0x890489f70a55368b, + 0xeadab0aba3b2dbe5,0x2b45ac74ccea842e, + 0x92c8ae6b464fc96f,0x3b0b8bc90012929d, + 0xb77ada0617e3bbcb,0x9ce6ebb40173744, + 0xe55990879ddcaabd,0xcc420a6a101d0515, + 0x8f57fa54c2a9eab6,0x9fa946824a12232d, + 0xb32df8e9f3546564,0x47939822dc96abf9, + 0xdff9772470297ebd,0x59787e2b93bc56f7, + 0x8bfbea76c619ef36,0x57eb4edb3c55b65a, + 0xaefae51477a06b03,0xede622920b6b23f1, + 0xdab99e59958885c4,0xe95fab368e45eced, + 0x88b402f7fd75539b,0x11dbcb0218ebb414, + 0xaae103b5fcd2a881,0xd652bdc29f26a119, + 0xd59944a37c0752a2,0x4be76d3346f0495f, + 0x857fcae62d8493a5,0x6f70a4400c562ddb, + 0xa6dfbd9fb8e5b88e,0xcb4ccd500f6bb952, + 0xd097ad07a71f26b2,0x7e2000a41346a7a7, + 0x825ecc24c873782f,0x8ed400668c0c28c8, + 0xa2f67f2dfa90563b,0x728900802f0f32fa, + 0xcbb41ef979346bca,0x4f2b40a03ad2ffb9, + 
0xfea126b7d78186bc,0xe2f610c84987bfa8, + 0x9f24b832e6b0f436,0xdd9ca7d2df4d7c9, + 0xc6ede63fa05d3143,0x91503d1c79720dbb, + 0xf8a95fcf88747d94,0x75a44c6397ce912a, + 0x9b69dbe1b548ce7c,0xc986afbe3ee11aba, + 0xc24452da229b021b,0xfbe85badce996168, + 0xf2d56790ab41c2a2,0xfae27299423fb9c3, + 0x97c560ba6b0919a5,0xdccd879fc967d41a, + 0xbdb6b8e905cb600f,0x5400e987bbc1c920, + 0xed246723473e3813,0x290123e9aab23b68, + 0x9436c0760c86e30b,0xf9a0b6720aaf6521, + 0xb94470938fa89bce,0xf808e40e8d5b3e69, + 0xe7958cb87392c2c2,0xb60b1d1230b20e04, + 0x90bd77f3483bb9b9,0xb1c6f22b5e6f48c2, + 0xb4ecd5f01a4aa828,0x1e38aeb6360b1af3, + 0xe2280b6c20dd5232,0x25c6da63c38de1b0, + 0x8d590723948a535f,0x579c487e5a38ad0e, + 0xb0af48ec79ace837,0x2d835a9df0c6d851, + 0xdcdb1b2798182244,0xf8e431456cf88e65, + 0x8a08f0f8bf0f156b,0x1b8e9ecb641b58ff, + 0xac8b2d36eed2dac5,0xe272467e3d222f3f, + 0xd7adf884aa879177,0x5b0ed81dcc6abb0f, + 0x86ccbb52ea94baea,0x98e947129fc2b4e9, + 0xa87fea27a539e9a5,0x3f2398d747b36224, + 0xd29fe4b18e88640e,0x8eec7f0d19a03aad, + 0x83a3eeeef9153e89,0x1953cf68300424ac, + 0xa48ceaaab75a8e2b,0x5fa8c3423c052dd7, + 0xcdb02555653131b6,0x3792f412cb06794d, + 0x808e17555f3ebf11,0xe2bbd88bbee40bd0, + 0xa0b19d2ab70e6ed6,0x5b6aceaeae9d0ec4, + 0xc8de047564d20a8b,0xf245825a5a445275, + 0xfb158592be068d2e,0xeed6e2f0f0d56712, + 0x9ced737bb6c4183d,0x55464dd69685606b, + 0xc428d05aa4751e4c,0xaa97e14c3c26b886, + 0xf53304714d9265df,0xd53dd99f4b3066a8, + 0x993fe2c6d07b7fab,0xe546a8038efe4029, + 0xbf8fdb78849a5f96,0xde98520472bdd033, + 0xef73d256a5c0f77c,0x963e66858f6d4440, + 0x95a8637627989aad,0xdde7001379a44aa8, + 0xbb127c53b17ec159,0x5560c018580d5d52, + 0xe9d71b689dde71af,0xaab8f01e6e10b4a6, + 0x9226712162ab070d,0xcab3961304ca70e8, + 0xb6b00d69bb55c8d1,0x3d607b97c5fd0d22, + 0xe45c10c42a2b3b05,0x8cb89a7db77c506a, + 0x8eb98a7a9a5b04e3,0x77f3608e92adb242, + 0xb267ed1940f1c61c,0x55f038b237591ed3, + 0xdf01e85f912e37a3,0x6b6c46dec52f6688, + 0x8b61313bbabce2c6,0x2323ac4b3b3da015, + 0xae397d8aa96c1b77,0xabec975e0a0d081a, + 0xd9c7dced53c72255,0x96e7bd358c904a21, + 0x881cea14545c7575,0x7e50d64177da2e54, + 0xaa242499697392d2,0xdde50bd1d5d0b9e9, + 0xd4ad2dbfc3d07787,0x955e4ec64b44e864, + 0x84ec3c97da624ab4,0xbd5af13bef0b113e, + 0xa6274bbdd0fadd61,0xecb1ad8aeacdd58e, + 0xcfb11ead453994ba,0x67de18eda5814af2, + 0x81ceb32c4b43fcf4,0x80eacf948770ced7, + 0xa2425ff75e14fc31,0xa1258379a94d028d, + 0xcad2f7f5359a3b3e,0x96ee45813a04330, + 0xfd87b5f28300ca0d,0x8bca9d6e188853fc, + 0x9e74d1b791e07e48,0x775ea264cf55347e, + 0xc612062576589dda,0x95364afe032a819e, + 0xf79687aed3eec551,0x3a83ddbd83f52205, + 0x9abe14cd44753b52,0xc4926a9672793543, + 0xc16d9a0095928a27,0x75b7053c0f178294, + 0xf1c90080baf72cb1,0x5324c68b12dd6339, + 0x971da05074da7bee,0xd3f6fc16ebca5e04, + 0xbce5086492111aea,0x88f4bb1ca6bcf585, + 0xec1e4a7db69561a5,0x2b31e9e3d06c32e6, + 0x9392ee8e921d5d07,0x3aff322e62439fd0, + 0xb877aa3236a4b449,0x9befeb9fad487c3, + 0xe69594bec44de15b,0x4c2ebe687989a9b4, + 0x901d7cf73ab0acd9,0xf9d37014bf60a11, + 0xb424dc35095cd80f,0x538484c19ef38c95, + 0xe12e13424bb40e13,0x2865a5f206b06fba, + 0x8cbccc096f5088cb,0xf93f87b7442e45d4, + 0xafebff0bcb24aafe,0xf78f69a51539d749, + 0xdbe6fecebdedd5be,0xb573440e5a884d1c, + 0x89705f4136b4a597,0x31680a88f8953031, + 0xabcc77118461cefc,0xfdc20d2b36ba7c3e, + 0xd6bf94d5e57a42bc,0x3d32907604691b4d, + 0x8637bd05af6c69b5,0xa63f9a49c2c1b110, + 0xa7c5ac471b478423,0xfcf80dc33721d54, + 0xd1b71758e219652b,0xd3c36113404ea4a9, + 0x83126e978d4fdf3b,0x645a1cac083126ea, + 0xa3d70a3d70a3d70a,0x3d70a3d70a3d70a4, + 
0xcccccccccccccccc,0xcccccccccccccccd, + 0x8000000000000000,0x0, + 0xa000000000000000,0x0, + 0xc800000000000000,0x0, + 0xfa00000000000000,0x0, + 0x9c40000000000000,0x0, + 0xc350000000000000,0x0, + 0xf424000000000000,0x0, + 0x9896800000000000,0x0, + 0xbebc200000000000,0x0, + 0xee6b280000000000,0x0, + 0x9502f90000000000,0x0, + 0xba43b74000000000,0x0, + 0xe8d4a51000000000,0x0, + 0x9184e72a00000000,0x0, + 0xb5e620f480000000,0x0, + 0xe35fa931a0000000,0x0, + 0x8e1bc9bf04000000,0x0, + 0xb1a2bc2ec5000000,0x0, + 0xde0b6b3a76400000,0x0, + 0x8ac7230489e80000,0x0, + 0xad78ebc5ac620000,0x0, + 0xd8d726b7177a8000,0x0, + 0x878678326eac9000,0x0, + 0xa968163f0a57b400,0x0, + 0xd3c21bcecceda100,0x0, + 0x84595161401484a0,0x0, + 0xa56fa5b99019a5c8,0x0, + 0xcecb8f27f4200f3a,0x0, + 0x813f3978f8940984,0x4000000000000000, + 0xa18f07d736b90be5,0x5000000000000000, + 0xc9f2c9cd04674ede,0xa400000000000000, + 0xfc6f7c4045812296,0x4d00000000000000, + 0x9dc5ada82b70b59d,0xf020000000000000, + 0xc5371912364ce305,0x6c28000000000000, + 0xf684df56c3e01bc6,0xc732000000000000, + 0x9a130b963a6c115c,0x3c7f400000000000, + 0xc097ce7bc90715b3,0x4b9f100000000000, + 0xf0bdc21abb48db20,0x1e86d40000000000, + 0x96769950b50d88f4,0x1314448000000000, + 0xbc143fa4e250eb31,0x17d955a000000000, + 0xeb194f8e1ae525fd,0x5dcfab0800000000, + 0x92efd1b8d0cf37be,0x5aa1cae500000000, + 0xb7abc627050305ad,0xf14a3d9e40000000, + 0xe596b7b0c643c719,0x6d9ccd05d0000000, + 0x8f7e32ce7bea5c6f,0xe4820023a2000000, + 0xb35dbf821ae4f38b,0xdda2802c8a800000, + 0xe0352f62a19e306e,0xd50b2037ad200000, + 0x8c213d9da502de45,0x4526f422cc340000, + 0xaf298d050e4395d6,0x9670b12b7f410000, + 0xdaf3f04651d47b4c,0x3c0cdd765f114000, + 0x88d8762bf324cd0f,0xa5880a69fb6ac800, + 0xab0e93b6efee0053,0x8eea0d047a457a00, + 0xd5d238a4abe98068,0x72a4904598d6d880, + 0x85a36366eb71f041,0x47a6da2b7f864750, + 0xa70c3c40a64e6c51,0x999090b65f67d924, + 0xd0cf4b50cfe20765,0xfff4b4e3f741cf6d, + 0x82818f1281ed449f,0xbff8f10e7a8921a4, + 0xa321f2d7226895c7,0xaff72d52192b6a0d, + 0xcbea6f8ceb02bb39,0x9bf4f8a69f764490, + 0xfee50b7025c36a08,0x2f236d04753d5b4, + 0x9f4f2726179a2245,0x1d762422c946590, + 0xc722f0ef9d80aad6,0x424d3ad2b7b97ef5, + 0xf8ebad2b84e0d58b,0xd2e0898765a7deb2, + 0x9b934c3b330c8577,0x63cc55f49f88eb2f, + 0xc2781f49ffcfa6d5,0x3cbf6b71c76b25fb, + 0xf316271c7fc3908a,0x8bef464e3945ef7a, + 0x97edd871cfda3a56,0x97758bf0e3cbb5ac, + 0xbde94e8e43d0c8ec,0x3d52eeed1cbea317, + 0xed63a231d4c4fb27,0x4ca7aaa863ee4bdd, + 0x945e455f24fb1cf8,0x8fe8caa93e74ef6a, + 0xb975d6b6ee39e436,0xb3e2fd538e122b44, + 0xe7d34c64a9c85d44,0x60dbbca87196b616, + 0x90e40fbeea1d3a4a,0xbc8955e946fe31cd, + 0xb51d13aea4a488dd,0x6babab6398bdbe41, + 0xe264589a4dcdab14,0xc696963c7eed2dd1, + 0x8d7eb76070a08aec,0xfc1e1de5cf543ca2, + 0xb0de65388cc8ada8,0x3b25a55f43294bcb, + 0xdd15fe86affad912,0x49ef0eb713f39ebe, + 0x8a2dbf142dfcc7ab,0x6e3569326c784337, + 0xacb92ed9397bf996,0x49c2c37f07965404, + 0xd7e77a8f87daf7fb,0xdc33745ec97be906, + 0x86f0ac99b4e8dafd,0x69a028bb3ded71a3, + 0xa8acd7c0222311bc,0xc40832ea0d68ce0c, + 0xd2d80db02aabd62b,0xf50a3fa490c30190, + 0x83c7088e1aab65db,0x792667c6da79e0fa, + 0xa4b8cab1a1563f52,0x577001b891185938, + 0xcde6fd5e09abcf26,0xed4c0226b55e6f86, + 0x80b05e5ac60b6178,0x544f8158315b05b4, + 0xa0dc75f1778e39d6,0x696361ae3db1c721, + 0xc913936dd571c84c,0x3bc3a19cd1e38e9, + 0xfb5878494ace3a5f,0x4ab48a04065c723, + 0x9d174b2dcec0e47b,0x62eb0d64283f9c76, + 0xc45d1df942711d9a,0x3ba5d0bd324f8394, + 0xf5746577930d6500,0xca8f44ec7ee36479, + 0x9968bf6abbe85f20,0x7e998b13cf4e1ecb, + 0xbfc2ef456ae276e8,0x9e3fedd8c321a67e, + 
0xefb3ab16c59b14a2,0xc5cfe94ef3ea101e, + 0x95d04aee3b80ece5,0xbba1f1d158724a12, + 0xbb445da9ca61281f,0x2a8a6e45ae8edc97, + 0xea1575143cf97226,0xf52d09d71a3293bd, + 0x924d692ca61be758,0x593c2626705f9c56, + 0xb6e0c377cfa2e12e,0x6f8b2fb00c77836c, + 0xe498f455c38b997a,0xb6dfb9c0f956447, + 0x8edf98b59a373fec,0x4724bd4189bd5eac, + 0xb2977ee300c50fe7,0x58edec91ec2cb657, + 0xdf3d5e9bc0f653e1,0x2f2967b66737e3ed, + 0x8b865b215899f46c,0xbd79e0d20082ee74, + 0xae67f1e9aec07187,0xecd8590680a3aa11, + 0xda01ee641a708de9,0xe80e6f4820cc9495, + 0x884134fe908658b2,0x3109058d147fdcdd, + 0xaa51823e34a7eede,0xbd4b46f0599fd415, + 0xd4e5e2cdc1d1ea96,0x6c9e18ac7007c91a, + 0x850fadc09923329e,0x3e2cf6bc604ddb0, + 0xa6539930bf6bff45,0x84db8346b786151c, + 0xcfe87f7cef46ff16,0xe612641865679a63, + 0x81f14fae158c5f6e,0x4fcb7e8f3f60c07e, + 0xa26da3999aef7749,0xe3be5e330f38f09d, + 0xcb090c8001ab551c,0x5cadf5bfd3072cc5, + 0xfdcb4fa002162a63,0x73d9732fc7c8f7f6, + 0x9e9f11c4014dda7e,0x2867e7fddcdd9afa, + 0xc646d63501a1511d,0xb281e1fd541501b8, + 0xf7d88bc24209a565,0x1f225a7ca91a4226, + 0x9ae757596946075f,0x3375788de9b06958, + 0xc1a12d2fc3978937,0x52d6b1641c83ae, + 0xf209787bb47d6b84,0xc0678c5dbd23a49a, + 0x9745eb4d50ce6332,0xf840b7ba963646e0, + 0xbd176620a501fbff,0xb650e5a93bc3d898, + 0xec5d3fa8ce427aff,0xa3e51f138ab4cebe, + 0x93ba47c980e98cdf,0xc66f336c36b10137, + 0xb8a8d9bbe123f017,0xb80b0047445d4184, + 0xe6d3102ad96cec1d,0xa60dc059157491e5, + 0x9043ea1ac7e41392,0x87c89837ad68db2f, + 0xb454e4a179dd1877,0x29babe4598c311fb, + 0xe16a1dc9d8545e94,0xf4296dd6fef3d67a, + 0x8ce2529e2734bb1d,0x1899e4a65f58660c, + 0xb01ae745b101e9e4,0x5ec05dcff72e7f8f, + 0xdc21a1171d42645d,0x76707543f4fa1f73, + 0x899504ae72497eba,0x6a06494a791c53a8, + 0xabfa45da0edbde69,0x487db9d17636892, + 0xd6f8d7509292d603,0x45a9d2845d3c42b6, + 0x865b86925b9bc5c2,0xb8a2392ba45a9b2, + 0xa7f26836f282b732,0x8e6cac7768d7141e, + 0xd1ef0244af2364ff,0x3207d795430cd926, + 0x8335616aed761f1f,0x7f44e6bd49e807b8, + 0xa402b9c5a8d3a6e7,0x5f16206c9c6209a6, + 0xcd036837130890a1,0x36dba887c37a8c0f, + 0x802221226be55a64,0xc2494954da2c9789, + 0xa02aa96b06deb0fd,0xf2db9baa10b7bd6c, + 0xc83553c5c8965d3d,0x6f92829494e5acc7, + 0xfa42a8b73abbf48c,0xcb772339ba1f17f9, + 0x9c69a97284b578d7,0xff2a760414536efb, + 0xc38413cf25e2d70d,0xfef5138519684aba, + 0xf46518c2ef5b8cd1,0x7eb258665fc25d69, + 0x98bf2f79d5993802,0xef2f773ffbd97a61, + 0xbeeefb584aff8603,0xaafb550ffacfd8fa, + 0xeeaaba2e5dbf6784,0x95ba2a53f983cf38, + 0x952ab45cfa97a0b2,0xdd945a747bf26183, + 0xba756174393d88df,0x94f971119aeef9e4, + 0xe912b9d1478ceb17,0x7a37cd5601aab85d, + 0x91abb422ccb812ee,0xac62e055c10ab33a, + 0xb616a12b7fe617aa,0x577b986b314d6009, + 0xe39c49765fdf9d94,0xed5a7e85fda0b80b, + 0x8e41ade9fbebc27d,0x14588f13be847307, + 0xb1d219647ae6b31c,0x596eb2d8ae258fc8, + 0xde469fbd99a05fe3,0x6fca5f8ed9aef3bb, + 0x8aec23d680043bee,0x25de7bb9480d5854, + 0xada72ccc20054ae9,0xaf561aa79a10ae6a, + 0xd910f7ff28069da4,0x1b2ba1518094da04, + 0x87aa9aff79042286,0x90fb44d2f05d0842, + 0xa99541bf57452b28,0x353a1607ac744a53, + 0xd3fa922f2d1675f2,0x42889b8997915ce8, + 0x847c9b5d7c2e09b7,0x69956135febada11, + 0xa59bc234db398c25,0x43fab9837e699095, + 0xcf02b2c21207ef2e,0x94f967e45e03f4bb, + 0x8161afb94b44f57d,0x1d1be0eebac278f5, + 0xa1ba1ba79e1632dc,0x6462d92a69731732, + 0xca28a291859bbf93,0x7d7b8f7503cfdcfe, + 0xfcb2cb35e702af78,0x5cda735244c3d43e, + 0x9defbf01b061adab,0x3a0888136afa64a7, + 0xc56baec21c7a1916,0x88aaa1845b8fdd0, + 0xf6c69a72a3989f5b,0x8aad549e57273d45, + 0x9a3c2087a63f6399,0x36ac54e2f678864b, + 
0xc0cb28a98fcf3c7f,0x84576a1bb416a7dd, + 0xf0fdf2d3f3c30b9f,0x656d44a2a11c51d5, + 0x969eb7c47859e743,0x9f644ae5a4b1b325, + 0xbc4665b596706114,0x873d5d9f0dde1fee, + 0xeb57ff22fc0c7959,0xa90cb506d155a7ea, + 0x9316ff75dd87cbd8,0x9a7f12442d588f2, + 0xb7dcbf5354e9bece,0xc11ed6d538aeb2f, + 0xe5d3ef282a242e81,0x8f1668c8a86da5fa, + 0x8fa475791a569d10,0xf96e017d694487bc, + 0xb38d92d760ec4455,0x37c981dcc395a9ac, + 0xe070f78d3927556a,0x85bbe253f47b1417, + 0x8c469ab843b89562,0x93956d7478ccec8e, + 0xaf58416654a6babb,0x387ac8d1970027b2, + 0xdb2e51bfe9d0696a,0x6997b05fcc0319e, + 0x88fcf317f22241e2,0x441fece3bdf81f03, + 0xab3c2fddeeaad25a,0xd527e81cad7626c3, + 0xd60b3bd56a5586f1,0x8a71e223d8d3b074, + 0x85c7056562757456,0xf6872d5667844e49, + 0xa738c6bebb12d16c,0xb428f8ac016561db, + 0xd106f86e69d785c7,0xe13336d701beba52, + 0x82a45b450226b39c,0xecc0024661173473, + 0xa34d721642b06084,0x27f002d7f95d0190, + 0xcc20ce9bd35c78a5,0x31ec038df7b441f4, + 0xff290242c83396ce,0x7e67047175a15271, + 0x9f79a169bd203e41,0xf0062c6e984d386, + 0xc75809c42c684dd1,0x52c07b78a3e60868, + 0xf92e0c3537826145,0xa7709a56ccdf8a82, + 0x9bbcc7a142b17ccb,0x88a66076400bb691, + 0xc2abf989935ddbfe,0x6acff893d00ea435, + 0xf356f7ebf83552fe,0x583f6b8c4124d43, + 0x98165af37b2153de,0xc3727a337a8b704a, + 0xbe1bf1b059e9a8d6,0x744f18c0592e4c5c, + 0xeda2ee1c7064130c,0x1162def06f79df73, + 0x9485d4d1c63e8be7,0x8addcb5645ac2ba8, + 0xb9a74a0637ce2ee1,0x6d953e2bd7173692, + 0xe8111c87c5c1ba99,0xc8fa8db6ccdd0437, + 0x910ab1d4db9914a0,0x1d9c9892400a22a2, + 0xb54d5e4a127f59c8,0x2503beb6d00cab4b, + 0xe2a0b5dc971f303a,0x2e44ae64840fd61d, + 0x8da471a9de737e24,0x5ceaecfed289e5d2, + 0xb10d8e1456105dad,0x7425a83e872c5f47, + 0xdd50f1996b947518,0xd12f124e28f77719, + 0x8a5296ffe33cc92f,0x82bd6b70d99aaa6f, + 0xace73cbfdc0bfb7b,0x636cc64d1001550b, + 0xd8210befd30efa5a,0x3c47f7e05401aa4e, + 0x8714a775e3e95c78,0x65acfaec34810a71, + 0xa8d9d1535ce3b396,0x7f1839a741a14d0d, + 0xd31045a8341ca07c,0x1ede48111209a050, + 0x83ea2b892091e44d,0x934aed0aab460432, + 0xa4e4b66b68b65d60,0xf81da84d5617853f, + 0xce1de40642e3f4b9,0x36251260ab9d668e, + 0x80d2ae83e9ce78f3,0xc1d72b7c6b426019, + 0xa1075a24e4421730,0xb24cf65b8612f81f, + 0xc94930ae1d529cfc,0xdee033f26797b627, + 0xfb9b7cd9a4a7443c,0x169840ef017da3b1, + 0x9d412e0806e88aa5,0x8e1f289560ee864e, + 0xc491798a08a2ad4e,0xf1a6f2bab92a27e2, + 0xf5b5d7ec8acb58a2,0xae10af696774b1db, + 0x9991a6f3d6bf1765,0xacca6da1e0a8ef29, + 0xbff610b0cc6edd3f,0x17fd090a58d32af3, + 0xeff394dcff8a948e,0xddfc4b4cef07f5b0, + 0x95f83d0a1fb69cd9,0x4abdaf101564f98e, + 0xbb764c4ca7a4440f,0x9d6d1ad41abe37f1, + 0xea53df5fd18d5513,0x84c86189216dc5ed, + 0x92746b9be2f8552c,0x32fd3cf5b4e49bb4, + 0xb7118682dbb66a77,0x3fbc8c33221dc2a1, + 0xe4d5e82392a40515,0xfabaf3feaa5334a, + 0x8f05b1163ba6832d,0x29cb4d87f2a7400e, + 0xb2c71d5bca9023f8,0x743e20e9ef511012, + 0xdf78e4b2bd342cf6,0x914da9246b255416, + 0x8bab8eefb6409c1a,0x1ad089b6c2f7548e, + 0xae9672aba3d0c320,0xa184ac2473b529b1, + 0xda3c0f568cc4f3e8,0xc9e5d72d90a2741e, + 0x8865899617fb1871,0x7e2fa67c7a658892, + 0xaa7eebfb9df9de8d,0xddbb901b98feeab7, + 0xd51ea6fa85785631,0x552a74227f3ea565, + 0x8533285c936b35de,0xd53a88958f87275f, + 0xa67ff273b8460356,0x8a892abaf368f137, + 0xd01fef10a657842c,0x2d2b7569b0432d85, + 0x8213f56a67f6b29b,0x9c3b29620e29fc73, + 0xa298f2c501f45f42,0x8349f3ba91b47b8f, + 0xcb3f2f7642717713,0x241c70a936219a73, + 0xfe0efb53d30dd4d7,0xed238cd383aa0110, + 0x9ec95d1463e8a506,0xf4363804324a40aa, + 0xc67bb4597ce2ce48,0xb143c6053edcd0d5, + 0xf81aa16fdc1b81da,0xdd94b7868e94050a, + 
0x9b10a4e5e9913128,0xca7cf2b4191c8326, + 0xc1d4ce1f63f57d72,0xfd1c2f611f63a3f0, + 0xf24a01a73cf2dccf,0xbc633b39673c8cec, + 0x976e41088617ca01,0xd5be0503e085d813, + 0xbd49d14aa79dbc82,0x4b2d8644d8a74e18, + 0xec9c459d51852ba2,0xddf8e7d60ed1219e, + 0x93e1ab8252f33b45,0xcabb90e5c942b503, + 0xb8da1662e7b00a17,0x3d6a751f3b936243, + 0xe7109bfba19c0c9d,0xcc512670a783ad4, + 0x906a617d450187e2,0x27fb2b80668b24c5, + 0xb484f9dc9641e9da,0xb1f9f660802dedf6, + 0xe1a63853bbd26451,0x5e7873f8a0396973, + 0x8d07e33455637eb2,0xdb0b487b6423e1e8, + 0xb049dc016abc5e5f,0x91ce1a9a3d2cda62, + 0xdc5c5301c56b75f7,0x7641a140cc7810fb, + 0x89b9b3e11b6329ba,0xa9e904c87fcb0a9d, + 0xac2820d9623bf429,0x546345fa9fbdcd44, + 0xd732290fbacaf133,0xa97c177947ad4095, + 0x867f59a9d4bed6c0,0x49ed8eabcccc485d, + 0xa81f301449ee8c70,0x5c68f256bfff5a74, + 0xd226fc195c6a2f8c,0x73832eec6fff3111, + 0x83585d8fd9c25db7,0xc831fd53c5ff7eab, + 0xa42e74f3d032f525,0xba3e7ca8b77f5e55, + 0xcd3a1230c43fb26f,0x28ce1bd2e55f35eb, + 0x80444b5e7aa7cf85,0x7980d163cf5b81b3, + 0xa0555e361951c366,0xd7e105bcc332621f, + 0xc86ab5c39fa63440,0x8dd9472bf3fefaa7, + 0xfa856334878fc150,0xb14f98f6f0feb951, + 0x9c935e00d4b9d8d2,0x6ed1bf9a569f33d3, + 0xc3b8358109e84f07,0xa862f80ec4700c8, + 0xf4a642e14c6262c8,0xcd27bb612758c0fa, + 0x98e7e9cccfbd7dbd,0x8038d51cb897789c, + 0xbf21e44003acdd2c,0xe0470a63e6bd56c3, + 0xeeea5d5004981478,0x1858ccfce06cac74, + 0x95527a5202df0ccb,0xf37801e0c43ebc8, + 0xbaa718e68396cffd,0xd30560258f54e6ba, + 0xe950df20247c83fd,0x47c6b82ef32a2069, + 0x91d28b7416cdd27e,0x4cdc331d57fa5441, + 0xb6472e511c81471d,0xe0133fe4adf8e952, + 0xe3d8f9e563a198e5,0x58180fddd97723a6, + 0x8e679c2f5e44ff8f,0x570f09eaa7ea7648,}; +using powers = powers_template<>; + +} + +#endif + + +#ifndef FASTFLOAT_DECIMAL_TO_BINARY_H +#define FASTFLOAT_DECIMAL_TO_BINARY_H + +//included above: +//#include +#include +#include +//included above: +//#include +#include +//included above: +//#include + +namespace fast_float { + +// This will compute or rather approximate w * 5**q and return a pair of 64-bit words approximating +// the result, with the "high" part corresponding to the most significant bits and the +// low part corresponding to the least significant bits. +// +template +fastfloat_really_inline +value128 compute_product_approximation(int64_t q, uint64_t w) { + const int index = 2 * int(q - powers::smallest_power_of_five); + // For small values of q, e.g., q in [0,27], the answer is always exact because + // The line value128 firstproduct = full_multiplication(w, power_of_five_128[index]); + // gives the exact answer. + value128 firstproduct = full_multiplication(w, powers::power_of_five_128[index]); + static_assert((bit_precision >= 0) && (bit_precision <= 64), " precision should be in (0,64]"); + constexpr uint64_t precision_mask = (bit_precision < 64) ? + (uint64_t(0xFFFFFFFFFFFFFFFF) >> bit_precision) + : uint64_t(0xFFFFFFFFFFFFFFFF); + if((firstproduct.high & precision_mask) == precision_mask) { // could further guard with (lower + w < lower) + // regarding the second product, we only need secondproduct.high, but our expectation is that the compiler will optimize this extra work away if needed. 
+ value128 secondproduct = full_multiplication(w, powers::power_of_five_128[index + 1]); + firstproduct.low += secondproduct.high; + if(secondproduct.high > firstproduct.low) { + firstproduct.high++; + } + } + return firstproduct; +} + +namespace detail { +/** + * For q in (0,350), we have that + * f = (((152170 + 65536) * q ) >> 16); + * is equal to + * floor(p) + q + * where + * p = log(5**q)/log(2) = q * log(5)/log(2) + * + * For negative values of q in (-400,0), we have that + * f = (((152170 + 65536) * q ) >> 16); + * is equal to + * -ceil(p) + q + * where + * p = log(5**-q)/log(2) = -q * log(5)/log(2) + */ + constexpr fastfloat_really_inline int32_t power(int32_t q) noexcept { + return (((152170 + 65536) * q) >> 16) + 63; + } +} // namespace detail + +// create an adjusted mantissa, biased by the invalid power2 +// for significant digits already multiplied by 10 ** q. +template +fastfloat_really_inline +adjusted_mantissa compute_error_scaled(int64_t q, uint64_t w, int lz) noexcept { + int hilz = int(w >> 63) ^ 1; + adjusted_mantissa answer; + answer.mantissa = w << hilz; + int bias = binary::mantissa_explicit_bits() - binary::minimum_exponent(); + answer.power2 = int32_t(detail::power(int32_t(q)) + bias - hilz - lz - 62 + invalid_am_bias); + return answer; +} + +// w * 10 ** q, without rounding the representation up. +// the power2 in the exponent will be adjusted by invalid_am_bias. +template +fastfloat_really_inline +adjusted_mantissa compute_error(int64_t q, uint64_t w) noexcept { + int lz = leading_zeroes(w); + w <<= lz; + value128 product = compute_product_approximation(q, w); + return compute_error_scaled(q, product.high, lz); +} + +// w * 10 ** q +// The returned value should be a valid ieee64 number that simply need to be packed. +// However, in some very rare cases, the computation will fail. In such cases, we +// return an adjusted_mantissa with a negative power of 2: the caller should recompute +// in such cases. +template +fastfloat_really_inline +adjusted_mantissa compute_float(int64_t q, uint64_t w) noexcept { + adjusted_mantissa answer; + if ((w == 0) || (q < binary::smallest_power_of_ten())) { + answer.power2 = 0; + answer.mantissa = 0; + // result should be zero + return answer; + } + if (q > binary::largest_power_of_ten()) { + // we want to get infinity: + answer.power2 = binary::infinite_power(); + answer.mantissa = 0; + return answer; + } + // At this point in time q is in [powers::smallest_power_of_five, powers::largest_power_of_five]. + + // We want the most significant bit of i to be 1. Shift if needed. + int lz = leading_zeroes(w); + w <<= lz; + + // The required precision is binary::mantissa_explicit_bits() + 3 because + // 1. We need the implicit bit + // 2. We need an extra bit for rounding purposes + // 3. We might lose a bit due to the "upperbit" routine (result too small, requiring a shift) + + value128 product = compute_product_approximation(q, w); + if(product.low == 0xFFFFFFFFFFFFFFFF) { // could guard it further + // In some very rare cases, this could happen, in which case we might need a more accurate + // computation that what we can provide cheaply. This is very, very unlikely. + // + const bool inside_safe_exponent = (q >= -27) && (q <= 55); // always good because 5**q <2**128 when q>=0, + // and otherwise, for q<0, we have 5**-q<2**64 and the 128-bit reciprocal allows for exact computation. 
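// --- Illustrative sketch, not part of the fast_float sources. ---
// The "safe exponent" bounds quoted in the comment above can be checked with a
// little arithmetic: 5**q fits in 128 bits whenever q*log2(5) < 128, and 5**27
// fits in 64 bits, which is what keeps the reciprocal half of the table exact
// for q in [-27,0). The constant below is an assumed approximation of log2(5);
// the block is excluded from the build on purpose.
#if 0
static_assert(55 * 2.3219281 < 128.0, "5**55 < 2**128, so q <= 55 stays exact");
static_assert(27 * 2.3219281 < 64.0, "5**27 < 2**64, so q >= -27 stays exact");
static_assert(56 * 2.3219281 > 128.0, "q == 56 would no longer be guaranteed exact");
#endif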
+ if(!inside_safe_exponent) { + return compute_error_scaled(q, product.high, lz); + } + } + // The "compute_product_approximation" function can be slightly slower than a branchless approach: + // value128 product = compute_product(q, w); + // but in practice, we can win big with the compute_product_approximation if its additional branch + // is easily predicted. Which is best is data specific. + int upperbit = int(product.high >> 63); + + answer.mantissa = product.high >> (upperbit + 64 - binary::mantissa_explicit_bits() - 3); + + answer.power2 = int32_t(detail::power(int32_t(q)) + upperbit - lz - binary::minimum_exponent()); + if (answer.power2 <= 0) { // we have a subnormal? + // Here have that answer.power2 <= 0 so -answer.power2 >= 0 + if(-answer.power2 + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure. + answer.power2 = 0; + answer.mantissa = 0; + // result should be zero + return answer; + } + // next line is safe because -answer.power2 + 1 < 64 + answer.mantissa >>= -answer.power2 + 1; + // Thankfully, we can't have both "round-to-even" and subnormals because + // "round-to-even" only occurs for powers close to 0. + answer.mantissa += (answer.mantissa & 1); // round up + answer.mantissa >>= 1; + // There is a weird scenario where we don't have a subnormal but just. + // Suppose we start with 2.2250738585072013e-308, we end up + // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal + // whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round + // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer + // subnormal, but we can only know this after rounding. + // So we only declare a subnormal if we are smaller than the threshold. + answer.power2 = (answer.mantissa < (uint64_t(1) << binary::mantissa_explicit_bits())) ? 0 : 1; + return answer; + } + + // usually, we round *up*, but if we fall right in between and and we have an + // even basis, we need to round down + // We are only concerned with the cases where 5**q fits in single 64-bit word. + if ((product.low <= 1) && (q >= binary::min_exponent_round_to_even()) && (q <= binary::max_exponent_round_to_even()) && + ((answer.mantissa & 3) == 1) ) { // we may fall between two floats! + // To be in-between two floats we need that in doing + // answer.mantissa = product.high >> (upperbit + 64 - binary::mantissa_explicit_bits() - 3); + // ... we dropped out only zeroes. But if this happened, then we can go back!!! 
+ if((answer.mantissa << (upperbit + 64 - binary::mantissa_explicit_bits() - 3)) == product.high) { + answer.mantissa &= ~uint64_t(1); // flip it so that we do not round up + } + } + + answer.mantissa += (answer.mantissa & 1); // round up + answer.mantissa >>= 1; + if (answer.mantissa >= (uint64_t(2) << binary::mantissa_explicit_bits())) { + answer.mantissa = (uint64_t(1) << binary::mantissa_explicit_bits()); + answer.power2++; // undo previous addition + } + + answer.mantissa &= ~(uint64_t(1) << binary::mantissa_explicit_bits()); + if (answer.power2 >= binary::infinite_power()) { // infinity + answer.power2 = binary::infinite_power(); + answer.mantissa = 0; + } + return answer; +} + +} // namespace fast_float + +#endif + + +#ifndef FASTFLOAT_BIGINT_H +#define FASTFLOAT_BIGINT_H + +#include +//included above: +//#include +//included above: +//#include +//included above: +//#include + + +namespace fast_float { + +// the limb width: we want efficient multiplication of double the bits in +// limb, or for 64-bit limbs, at least 64-bit multiplication where we can +// extract the high and low parts efficiently. this is every 64-bit +// architecture except for sparc, which emulates 128-bit multiplication. +// we might have platforms where `CHAR_BIT` is not 8, so let's avoid +// doing `8 * sizeof(limb)`. +#if defined(FASTFLOAT_64BIT) && !defined(__sparc) +#define FASTFLOAT_64BIT_LIMB +typedef uint64_t limb; +constexpr size_t limb_bits = 64; +#else +#define FASTFLOAT_32BIT_LIMB +typedef uint32_t limb; +constexpr size_t limb_bits = 32; +#endif + +typedef span limb_span; + +// number of bits in a bigint. this needs to be at least the number +// of bits required to store the largest bigint, which is +// `log2(10**(digits + max_exp))`, or `log2(10**(767 + 342))`, or +// ~3600 bits, so we round to 4000. +constexpr size_t bigint_bits = 4000; +constexpr size_t bigint_limbs = bigint_bits / limb_bits; + +// vector-like type that is allocated on the stack. the entire +// buffer is pre-allocated, and only the length changes. +template +struct stackvec { + limb data[size]; + // we never need more than 150 limbs + uint16_t length{0}; + + stackvec() = default; + stackvec(const stackvec &) = delete; + stackvec &operator=(const stackvec &) = delete; + stackvec(stackvec &&) = delete; + stackvec &operator=(stackvec &&other) = delete; + + // create stack vector from existing limb span. + stackvec(limb_span s) { + FASTFLOAT_ASSERT(try_extend(s)); + } + + limb& operator[](size_t index) noexcept { + FASTFLOAT_DEBUG_ASSERT(index < length); + return data[index]; + } + const limb& operator[](size_t index) const noexcept { + FASTFLOAT_DEBUG_ASSERT(index < length); + return data[index]; + } + // index from the end of the container + const limb& rindex(size_t index) const noexcept { + FASTFLOAT_DEBUG_ASSERT(index < length); + size_t rindex = length - index - 1; + return data[rindex]; + } + + // set the length, without bounds checking. 
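// --- Illustrative sketch, not part of the fast_float sources. ---
// stackvec is a fixed-capacity, stack-allocated vector of limbs whose try_*
// operations report failure instead of allocating. (Its size template
// parameter, like several other angle-bracketed template arguments in this
// copy of the header, appears to have been stripped in transit.) A hypothetical
// usage sketch as it would look after the struct definition; excluded from the
// build:
#if 0
inline bool stackvec_usage() {
  stackvec<bigint_limbs> v;        // capacity fixed at compile time, length starts at 0
  bool ok = v.try_push(limb(42));  // returns false instead of growing once capacity is hit
  v.normalize();                   // drops most-significant zero limbs
  return ok;
}
#endif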
+ void set_len(size_t len) noexcept { + length = uint16_t(len); + } + constexpr size_t len() const noexcept { + return length; + } + constexpr bool is_empty() const noexcept { + return length == 0; + } + constexpr size_t capacity() const noexcept { + return size; + } + // append item to vector, without bounds checking + void push_unchecked(limb value) noexcept { + data[length] = value; + length++; + } + // append item to vector, returning if item was added + bool try_push(limb value) noexcept { + if (len() < capacity()) { + push_unchecked(value); + return true; + } else { + return false; + } + } + // add items to the vector, from a span, without bounds checking + void extend_unchecked(limb_span s) noexcept { + limb* ptr = data + length; + ::memcpy((void*)ptr, (const void*)s.ptr, sizeof(limb) * s.len()); + set_len(len() + s.len()); + } + // try to add items to the vector, returning if items were added + bool try_extend(limb_span s) noexcept { + if (len() + s.len() <= capacity()) { + extend_unchecked(s); + return true; + } else { + return false; + } + } + // resize the vector, without bounds checking + // if the new size is longer than the vector, assign value to each + // appended item. + void resize_unchecked(size_t new_len, limb value) noexcept { + if (new_len > len()) { + size_t count = new_len - len(); + limb* first = data + len(); + limb* last = first + count; + ::std::fill(first, last, value); + set_len(new_len); + } else { + set_len(new_len); + } + } + // try to resize the vector, returning if the vector was resized. + bool try_resize(size_t new_len, limb value) noexcept { + if (new_len > capacity()) { + return false; + } else { + resize_unchecked(new_len, value); + return true; + } + } + // check if any limbs are non-zero after the given index. + // this needs to be done in reverse order, since the index + // is relative to the most significant limbs. + bool nonzero(size_t index) const noexcept { + while (index < len()) { + if (rindex(index) != 0) { + return true; + } + index++; + } + return false; + } + // normalize the big integer, so most-significant zero limbs are removed. + void normalize() noexcept { + while (len() > 0 && rindex(0) == 0) { + length--; + } + } +}; + +fastfloat_really_inline +uint64_t empty_hi64(bool& truncated) noexcept { + truncated = false; + return 0; +} + +fastfloat_really_inline +uint64_t uint64_hi64(uint64_t r0, bool& truncated) noexcept { + truncated = false; + int shl = leading_zeroes(r0); + return r0 << shl; +} + +fastfloat_really_inline +uint64_t uint64_hi64(uint64_t r0, uint64_t r1, bool& truncated) noexcept { + int shl = leading_zeroes(r0); + if (shl == 0) { + truncated = r1 != 0; + return r0; + } else { + int shr = 64 - shl; + truncated = (r1 << shl) != 0; + return (r0 << shl) | (r1 >> shr); + } +} + +fastfloat_really_inline +uint64_t uint32_hi64(uint32_t r0, bool& truncated) noexcept { + return uint64_hi64(r0, truncated); +} + +fastfloat_really_inline +uint64_t uint32_hi64(uint32_t r0, uint32_t r1, bool& truncated) noexcept { + uint64_t x0 = r0; + uint64_t x1 = r1; + return uint64_hi64((x0 << 32) | x1, truncated); +} + +fastfloat_really_inline +uint64_t uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool& truncated) noexcept { + uint64_t x0 = r0; + uint64_t x1 = r1; + uint64_t x2 = r2; + return uint64_hi64(x0, (x1 << 32) | x2, truncated); +} + +// add two small integers, checking for overflow. +// we want an efficient operation. for msvc, where +// we don't have built-in intrinsics, this is still +// pretty fast. 
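// --- Illustrative sketch, not part of the fast_float sources. ---
// The portable fallback in scalar_add below relies on unsigned wraparound: the
// sum is smaller than an operand exactly when the addition overflowed. Tiny
// example with assumed values, excluded from the build:
#if 0
inline void wraparound_example() {
  uint64_t x = UINT64_MAX, y = 2;
  uint64_t z = x + y;     // wraps around to 1
  bool overflow = z < x;  // true: the mathematical sum needs a 65th bit
  (void)overflow;
}
#endif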
+fastfloat_really_inline +limb scalar_add(limb x, limb y, bool& overflow) noexcept { + limb z; + +// gcc and clang +#if defined(__has_builtin) + #if __has_builtin(__builtin_add_overflow) + overflow = __builtin_add_overflow(x, y, &z); + return z; + #endif +#endif + + // generic, this still optimizes correctly on MSVC. + z = x + y; + overflow = z < x; + return z; +} + +// multiply two small integers, getting both the high and low bits. +fastfloat_really_inline +limb scalar_mul(limb x, limb y, limb& carry) noexcept { +#ifdef FASTFLOAT_64BIT_LIMB + #if defined(__SIZEOF_INT128__) + // GCC and clang both define it as an extension. + __uint128_t z = __uint128_t(x) * __uint128_t(y) + __uint128_t(carry); + carry = limb(z >> limb_bits); + return limb(z); + #else + // fallback, no native 128-bit integer multiplication with carry. + // on msvc, this optimizes identically, somehow. + value128 z = full_multiplication(x, y); + bool overflow; + z.low = scalar_add(z.low, carry, overflow); + z.high += uint64_t(overflow); // cannot overflow + carry = z.high; + return z.low; + #endif +#else + uint64_t z = uint64_t(x) * uint64_t(y) + uint64_t(carry); + carry = limb(z >> limb_bits); + return limb(z); +#endif +} + +// add scalar value to bigint starting from offset. +// used in grade school multiplication +template +inline bool small_add_from(stackvec& vec, limb y, size_t start) noexcept { + size_t index = start; + limb carry = y; + bool overflow; + while (carry != 0 && index < vec.len()) { + vec[index] = scalar_add(vec[index], carry, overflow); + carry = limb(overflow); + index += 1; + } + if (carry != 0) { + FASTFLOAT_TRY(vec.try_push(carry)); + } + return true; +} + +// add scalar value to bigint. +template +fastfloat_really_inline bool small_add(stackvec& vec, limb y) noexcept { + return small_add_from(vec, y, 0); +} + +// multiply bigint by scalar value. +template +inline bool small_mul(stackvec& vec, limb y) noexcept { + limb carry = 0; + for (size_t index = 0; index < vec.len(); index++) { + vec[index] = scalar_mul(vec[index], y, carry); + } + if (carry != 0) { + FASTFLOAT_TRY(vec.try_push(carry)); + } + return true; +} + +// add bigint to bigint starting from index. +// used in grade school multiplication +template +bool large_add_from(stackvec& x, limb_span y, size_t start) noexcept { + // the effective x buffer is from `xstart..x.len()`, so exit early + // if we can't get that current range. + if (x.len() < start || y.len() > x.len() - start) { + FASTFLOAT_TRY(x.try_resize(y.len() + start, 0)); + } + + bool carry = false; + for (size_t index = 0; index < y.len(); index++) { + limb xi = x[index + start]; + limb yi = y[index]; + bool c1 = false; + bool c2 = false; + xi = scalar_add(xi, yi, c1); + if (carry) { + xi = scalar_add(xi, 1, c2); + } + x[index + start] = xi; + carry = c1 | c2; + } + + // handle overflow + if (carry) { + FASTFLOAT_TRY(small_add_from(x, 1, y.len() + start)); + } + return true; +} + +// add bigint to bigint. 
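// --- Illustrative sketch, not part of the fast_float sources. ---
// How the overflow flag from scalar_add() becomes the carry in the multi-limb
// addition above: adding two 2-limb little-endian numbers in place, exactly as
// large_add_from does for arbitrary lengths. Hypothetical helper, excluded
// from the build:
#if 0
inline void add_two_limb_numbers(limb x[2], const limb y[2]) {
  bool c0 = false, c1 = false, c2 = false;
  x[0] = scalar_add(x[0], y[0], c0);           // low limbs first
  x[1] = scalar_add(x[1], y[1], c1);           // then high limbs
  if (c0) { x[1] = scalar_add(x[1], 1, c2); }  // propagate the low-limb carry
  // (c1 | c2) would be the carry out of the 2-limb result.
}
#endif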
+template +fastfloat_really_inline bool large_add_from(stackvec& x, limb_span y) noexcept { + return large_add_from(x, y, 0); +} + +// grade-school multiplication algorithm +template +bool long_mul(stackvec& x, limb_span y) noexcept { + limb_span xs = limb_span(x.data, x.len()); + stackvec z(xs); + limb_span zs = limb_span(z.data, z.len()); + + if (y.len() != 0) { + limb y0 = y[0]; + FASTFLOAT_TRY(small_mul(x, y0)); + for (size_t index = 1; index < y.len(); index++) { + limb yi = y[index]; + stackvec zi; + if (yi != 0) { + // re-use the same buffer throughout + zi.set_len(0); + FASTFLOAT_TRY(zi.try_extend(zs)); + FASTFLOAT_TRY(small_mul(zi, yi)); + limb_span zis = limb_span(zi.data, zi.len()); + FASTFLOAT_TRY(large_add_from(x, zis, index)); + } + } + } + + x.normalize(); + return true; +} + +// grade-school multiplication algorithm +template +bool large_mul(stackvec& x, limb_span y) noexcept { + if (y.len() == 1) { + FASTFLOAT_TRY(small_mul(x, y[0])); + } else { + FASTFLOAT_TRY(long_mul(x, y)); + } + return true; +} + +// big integer type. implements a small subset of big integer +// arithmetic, using simple algorithms since asymptotically +// faster algorithms are slower for a small number of limbs. +// all operations assume the big-integer is normalized. +struct bigint { + // storage of the limbs, in little-endian order. + stackvec vec; + + bigint(): vec() {} + bigint(const bigint &) = delete; + bigint &operator=(const bigint &) = delete; + bigint(bigint &&) = delete; + bigint &operator=(bigint &&other) = delete; + + bigint(uint64_t value): vec() { +#ifdef FASTFLOAT_64BIT_LIMB + vec.push_unchecked(value); +#else + vec.push_unchecked(uint32_t(value)); + vec.push_unchecked(uint32_t(value >> 32)); +#endif + vec.normalize(); + } + + // get the high 64 bits from the vector, and if bits were truncated. + // this is to get the significant digits for the float. + uint64_t hi64(bool& truncated) const noexcept { +#ifdef FASTFLOAT_64BIT_LIMB + if (vec.len() == 0) { + return empty_hi64(truncated); + } else if (vec.len() == 1) { + return uint64_hi64(vec.rindex(0), truncated); + } else { + uint64_t result = uint64_hi64(vec.rindex(0), vec.rindex(1), truncated); + truncated |= vec.nonzero(2); + return result; + } +#else + if (vec.len() == 0) { + return empty_hi64(truncated); + } else if (vec.len() == 1) { + return uint32_hi64(vec.rindex(0), truncated); + } else if (vec.len() == 2) { + return uint32_hi64(vec.rindex(0), vec.rindex(1), truncated); + } else { + uint64_t result = uint32_hi64(vec.rindex(0), vec.rindex(1), vec.rindex(2), truncated); + truncated |= vec.nonzero(3); + return result; + } +#endif + } + + // compare two big integers, returning the large value. + // assumes both are normalized. if the return value is + // negative, other is larger, if the return value is + // positive, this is larger, otherwise they are equal. + // the limbs are stored in little-endian order, so we + // must compare the limbs in ever order. + int compare(const bigint& other) const noexcept { + if (vec.len() > other.vec.len()) { + return 1; + } else if (vec.len() < other.vec.len()) { + return -1; + } else { + for (size_t index = vec.len(); index > 0; index--) { + limb xi = vec[index - 1]; + limb yi = other.vec[index - 1]; + if (xi > yi) { + return 1; + } else if (xi < yi) { + return -1; + } + } + return 0; + } + } + + // shift left each limb n bits, carrying over to the new limb + // returns true if we were able to shift all the digits. 
+ bool shl_bits(size_t n) noexcept { + // Internally, for each item, we shift left by n, and add the previous + // right shifted limb-bits. + // For example, we transform (for u8) shifted left 2, to: + // b10100100 b01000010 + // b10 b10010001 b00001000 + FASTFLOAT_DEBUG_ASSERT(n != 0); + FASTFLOAT_DEBUG_ASSERT(n < sizeof(limb) * 8); + + size_t shl = n; + size_t shr = limb_bits - shl; + limb prev = 0; + for (size_t index = 0; index < vec.len(); index++) { + limb xi = vec[index]; + vec[index] = (xi << shl) | (prev >> shr); + prev = xi; + } + + limb carry = prev >> shr; + if (carry != 0) { + return vec.try_push(carry); + } + return true; + } + + // move the limbs left by `n` limbs. + bool shl_limbs(size_t n) noexcept { + FASTFLOAT_DEBUG_ASSERT(n != 0); + if (n + vec.len() > vec.capacity()) { + return false; + } else if (!vec.is_empty()) { + // move limbs + limb* dst = vec.data + n; + const limb* src = vec.data; + ::memmove(dst, src, sizeof(limb) * vec.len()); + // fill in empty limbs + limb* first = vec.data; + limb* last = first + n; + ::std::fill(first, last, 0); + vec.set_len(n + vec.len()); + return true; + } else { + return true; + } + } + + // move the limbs left by `n` bits. + bool shl(size_t n) noexcept { + size_t rem = n % limb_bits; + size_t div = n / limb_bits; + if (rem != 0) { + FASTFLOAT_TRY(shl_bits(rem)); + } + if (div != 0) { + FASTFLOAT_TRY(shl_limbs(div)); + } + return true; + } + + // get the number of leading zeros in the bigint. + int ctlz() const noexcept { + if (vec.is_empty()) { + return 0; + } else { +#ifdef FASTFLOAT_64BIT_LIMB + return leading_zeroes(vec.rindex(0)); +#else + // no use defining a specialized leading_zeroes for a 32-bit type. + uint64_t r0 = vec.rindex(0); + return leading_zeroes(r0 << 32); +#endif + } + } + + // get the number of bits in the bigint. + int bit_length() const noexcept { + int lz = ctlz(); + return int(limb_bits * vec.len()) - lz; + } + + bool mul(limb y) noexcept { + return small_mul(vec, y); + } + + bool add(limb y) noexcept { + return small_add(vec, y); + } + + // multiply as if by 2 raised to a power. + bool pow2(uint32_t exp) noexcept { + return shl(exp); + } + + // multiply as if by 5 raised to a power. 
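// --- Illustrative sketch, not part of the fast_float sources. ---
// pow5() below peels the exponent off in chunks: whole multiples of a "large
// step" (5**135, stored as a multi-limb constant), then whole multiples of the
// largest power of five that fits in a single limb (5**27 for 64-bit limbs),
// then one final lookup in small_power_of_5. pow10() then uses the identity
// 10**e == 5**e * 2**e, where the power of two is just a left shift. Worked
// decomposition with an assumed exponent, excluded from the build:
#if 0
inline void pow5_decomposition_example() {
  uint32_t exp = 300;
  uint32_t large_chunks = exp / 135;  // 2 multiplications by 5**135
  exp -= large_chunks * 135;          // 30 left
  uint32_t small_chunks = exp / 27;   // 1 multiplication by 5**27
  exp -= small_chunks * 27;           // 3 left
  // final step: one multiplication by small_power_of_5[3] == 125
  (void)large_chunks; (void)small_chunks; (void)exp;
}
#endif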
+ bool pow5(uint32_t exp) noexcept { + // multiply by a power of 5 + static constexpr uint32_t large_step = 135; + static constexpr uint64_t small_power_of_5[] = { + 1UL, 5UL, 25UL, 125UL, 625UL, 3125UL, 15625UL, 78125UL, 390625UL, + 1953125UL, 9765625UL, 48828125UL, 244140625UL, 1220703125UL, + 6103515625UL, 30517578125UL, 152587890625UL, 762939453125UL, + 3814697265625UL, 19073486328125UL, 95367431640625UL, 476837158203125UL, + 2384185791015625UL, 11920928955078125UL, 59604644775390625UL, + 298023223876953125UL, 1490116119384765625UL, 7450580596923828125UL, + }; +#ifdef FASTFLOAT_64BIT_LIMB + constexpr static limb large_power_of_5[] = { + 1414648277510068013UL, 9180637584431281687UL, 4539964771860779200UL, + 10482974169319127550UL, 198276706040285095UL}; +#else + constexpr static limb large_power_of_5[] = { + 4279965485U, 329373468U, 4020270615U, 2137533757U, 4287402176U, + 1057042919U, 1071430142U, 2440757623U, 381945767U, 46164893U}; +#endif + size_t large_length = sizeof(large_power_of_5) / sizeof(limb); + limb_span large = limb_span(large_power_of_5, large_length); + while (exp >= large_step) { + FASTFLOAT_TRY(large_mul(vec, large)); + exp -= large_step; + } +#ifdef FASTFLOAT_64BIT_LIMB + uint32_t small_step = 27; + limb max_native = 7450580596923828125UL; +#else + uint32_t small_step = 13; + limb max_native = 1220703125U; +#endif + while (exp >= small_step) { + FASTFLOAT_TRY(small_mul(vec, max_native)); + exp -= small_step; + } + if (exp != 0) { + FASTFLOAT_TRY(small_mul(vec, limb(small_power_of_5[exp]))); + } + + return true; + } + + // multiply as if by 10 raised to a power. + bool pow10(uint32_t exp) noexcept { + FASTFLOAT_TRY(pow5(exp)); + return pow2(exp); + } +}; + +} // namespace fast_float + +#endif + + +#ifndef FASTFLOAT_ASCII_NUMBER_H +#define FASTFLOAT_ASCII_NUMBER_H + +//included above: +//#include +//included above: +//#include +//included above: +//#include +//included above: +//#include + + +namespace fast_float { + +// Next function can be micro-optimized, but compilers are entirely +// able to optimize it well. +fastfloat_really_inline bool is_integer(char c) noexcept { return c >= '0' && c <= '9'; } + +fastfloat_really_inline uint64_t byteswap(uint64_t val) { + return (val & 0xFF00000000000000) >> 56 + | (val & 0x00FF000000000000) >> 40 + | (val & 0x0000FF0000000000) >> 24 + | (val & 0x000000FF00000000) >> 8 + | (val & 0x00000000FF000000) << 8 + | (val & 0x0000000000FF0000) << 24 + | (val & 0x000000000000FF00) << 40 + | (val & 0x00000000000000FF) << 56; +} + +fastfloat_really_inline uint64_t read_u64(const char *chars) { + uint64_t val; + ::memcpy(&val, chars, sizeof(uint64_t)); +#if FASTFLOAT_IS_BIG_ENDIAN == 1 + // Need to read as-if the number was in little-endian order. + val = byteswap(val); +#endif + return val; +} + +fastfloat_really_inline void write_u64(uint8_t *chars, uint64_t val) { +#if FASTFLOAT_IS_BIG_ENDIAN == 1 + // Need to read as-if the number was in little-endian order. 
+ val = byteswap(val); +#endif + ::memcpy(chars, &val, sizeof(uint64_t)); +} + +// credit @aqrit +fastfloat_really_inline uint32_t parse_eight_digits_unrolled(uint64_t val) { + const uint64_t mask = 0x000000FF000000FF; + const uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32) + const uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32) + val -= 0x3030303030303030; + val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8; + val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32; + return uint32_t(val); +} + +fastfloat_really_inline uint32_t parse_eight_digits_unrolled(const char *chars) noexcept { + return parse_eight_digits_unrolled(read_u64(chars)); +} + +// credit @aqrit +fastfloat_really_inline bool is_made_of_eight_digits_fast(uint64_t val) noexcept { + return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) & + 0x8080808080808080)); +} + +fastfloat_really_inline bool is_made_of_eight_digits_fast(const char *chars) noexcept { + return is_made_of_eight_digits_fast(read_u64(chars)); +} + +typedef span byte_span; + +struct parsed_number_string { + int64_t exponent{0}; + uint64_t mantissa{0}; + const char *lastmatch{nullptr}; + bool negative{false}; + bool valid{false}; + bool too_many_digits{false}; + // contains the range of the significant digits + byte_span integer{}; // non-nullable + byte_span fraction{}; // nullable +}; + +// Assuming that you use no more than 19 digits, this will +// parse an ASCII string. +fastfloat_really_inline +parsed_number_string parse_number_string(const char *p, const char *pend, parse_options options) noexcept { + const chars_format fmt = options.format; + const char decimal_point = options.decimal_point; + + parsed_number_string answer; + answer.valid = false; + answer.too_many_digits = false; + answer.negative = (*p == '-'); + if (*p == '-') { // C++17 20.19.3.(7.1) explicitly forbids '+' sign here + ++p; + if (p == pend) { + return answer; + } + if (!is_integer(*p) && (*p != decimal_point)) { // a sign must be followed by an integer or the dot + return answer; + } + } + const char *const start_digits = p; + + uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad) + + while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) { + i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok + p += 8; + } + while ((p != pend) && is_integer(*p)) { + // a multiplication by 10 is cheaper than an arbitrary integer + // multiplication + i = 10 * i + + uint64_t(*p - '0'); // might overflow, we will handle the overflow later + ++p; + } + const char *const end_of_integer_part = p; + int64_t digit_count = int64_t(end_of_integer_part - start_digits); + answer.integer = byte_span(start_digits, size_t(digit_count)); + int64_t exponent = 0; + if ((p != pend) && (*p == decimal_point)) { + ++p; + const char* before = p; + // can occur at most twice without overflowing, but let it occur more, since + // for integers with many digits, digit parsing is the primary bottleneck. 
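// --- Illustrative sketch, not part of the fast_float sources. ---
// The eight-digit loops below lean on parse_eight_digits_unrolled(), which
// converts eight ASCII digits with a handful of multiplies and shifts instead
// of eight single-digit iterations (is_made_of_eight_digits_fast() first
// checks that all eight bytes really are digits). Usage example with an
// assumed buffer, excluded from the build:
#if 0
inline void eight_digit_example() {
  const char digits[9] = "12345678";
  uint32_t value = parse_eight_digits_unrolled(digits);  // == 12345678
  (void)value;
}
#endif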
+ while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) { + i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok + p += 8; + } + while ((p != pend) && is_integer(*p)) { + uint8_t digit = uint8_t(*p - '0'); + ++p; + i = i * 10 + digit; // in rare cases, this will overflow, but that's ok + } + exponent = before - p; + answer.fraction = byte_span(before, size_t(p - before)); + digit_count -= exponent; + } + // we must have encountered at least one integer! + if (digit_count == 0) { + return answer; + } + int64_t exp_number = 0; // explicit exponential part + if ((fmt & chars_format::scientific) && (p != pend) && (('e' == *p) || ('E' == *p))) { + const char * location_of_e = p; + ++p; + bool neg_exp = false; + if ((p != pend) && ('-' == *p)) { + neg_exp = true; + ++p; + } else if ((p != pend) && ('+' == *p)) { // '+' on exponent is allowed by C++17 20.19.3.(7.1) + ++p; + } + if ((p == pend) || !is_integer(*p)) { + if(!(fmt & chars_format::fixed)) { + // We are in error. + return answer; + } + // Otherwise, we will be ignoring the 'e'. + p = location_of_e; + } else { + while ((p != pend) && is_integer(*p)) { + uint8_t digit = uint8_t(*p - '0'); + if (exp_number < 0x10000000) { + exp_number = 10 * exp_number + digit; + } + ++p; + } + if(neg_exp) { exp_number = - exp_number; } + exponent += exp_number; + } + } else { + // If it scientific and not fixed, we have to bail out. + if((fmt & chars_format::scientific) && !(fmt & chars_format::fixed)) { return answer; } + } + answer.lastmatch = p; + answer.valid = true; + + // If we frequently had to deal with long strings of digits, + // we could extend our code by using a 128-bit integer instead + // of a 64-bit integer. However, this is uncommon. + // + // We can deal with up to 19 digits. + if (digit_count > 19) { // this is uncommon + // It is possible that the integer had an overflow. + // We have to handle the case where we have 0.0000somenumber. + // We need to be mindful of the case where we only have zeroes... + // E.g., 0.000000000...000. + const char *start = start_digits; + while ((start != pend) && (*start == '0' || *start == decimal_point)) { + if(*start == '0') { digit_count --; } + start++; + } + if (digit_count > 19) { + answer.too_many_digits = true; + // Let us start again, this time, avoiding overflows. + // We don't need to check if is_integer, since we use the + // pre-tokenized spans from above. + i = 0; + p = answer.integer.ptr; + const char* int_end = p + answer.integer.len(); + const uint64_t minimal_nineteen_digit_integer{1000000000000000000}; + while((i < minimal_nineteen_digit_integer) && (p != int_end)) { + i = i * 10 + uint64_t(*p - '0'); + ++p; + } + if (i >= minimal_nineteen_digit_integer) { // We have a big integers + exponent = end_of_integer_part - p + exp_number; + } else { // We have a value with a fractional component. 
+ p = answer.fraction.ptr; + const char* frac_end = p + answer.fraction.len(); + while((i < minimal_nineteen_digit_integer) && (p != frac_end)) { + i = i * 10 + uint64_t(*p - '0'); + ++p; + } + exponent = answer.fraction.ptr - p + exp_number; + } + // We have now corrected both exponent and i, to a truncated value + } + } + answer.exponent = exponent; + answer.mantissa = i; + return answer; +} + +} // namespace fast_float + +#endif + + +#ifndef FASTFLOAT_DIGIT_COMPARISON_H +#define FASTFLOAT_DIGIT_COMPARISON_H + +//included above: +//#include +//included above: +//#include +//included above: +//#include +//included above: +//#include + + +namespace fast_float { + +// 1e0 to 1e19 +constexpr static uint64_t powers_of_ten_uint64[] = { + 1UL, 10UL, 100UL, 1000UL, 10000UL, 100000UL, 1000000UL, 10000000UL, 100000000UL, + 1000000000UL, 10000000000UL, 100000000000UL, 1000000000000UL, 10000000000000UL, + 100000000000000UL, 1000000000000000UL, 10000000000000000UL, 100000000000000000UL, + 1000000000000000000UL, 10000000000000000000UL}; + +// calculate the exponent, in scientific notation, of the number. +// this algorithm is not even close to optimized, but it has no practical +// effect on performance: in order to have a faster algorithm, we'd need +// to slow down performance for faster algorithms, and this is still fast. +fastfloat_really_inline int32_t scientific_exponent(parsed_number_string& num) noexcept { + uint64_t mantissa = num.mantissa; + int32_t exponent = int32_t(num.exponent); + while (mantissa >= 10000) { + mantissa /= 10000; + exponent += 4; + } + while (mantissa >= 100) { + mantissa /= 100; + exponent += 2; + } + while (mantissa >= 10) { + mantissa /= 10; + exponent += 1; + } + return exponent; +} + +// this converts a native floating-point number to an extended-precision float. +template +fastfloat_really_inline adjusted_mantissa to_extended(T value) noexcept { + adjusted_mantissa am; + int32_t bias = binary_format::mantissa_explicit_bits() - binary_format::minimum_exponent(); + if (std::is_same::value) { + constexpr uint32_t exponent_mask = 0x7F800000; + constexpr uint32_t mantissa_mask = 0x007FFFFF; + constexpr uint64_t hidden_bit_mask = 0x00800000; + uint32_t bits; + ::memcpy(&bits, &value, sizeof(T)); + if ((bits & exponent_mask) == 0) { + // denormal + am.power2 = 1 - bias; + am.mantissa = bits & mantissa_mask; + } else { + // normal + am.power2 = int32_t((bits & exponent_mask) >> binary_format::mantissa_explicit_bits()); + am.power2 -= bias; + am.mantissa = (bits & mantissa_mask) | hidden_bit_mask; + } + } else { + constexpr uint64_t exponent_mask = 0x7FF0000000000000; + constexpr uint64_t mantissa_mask = 0x000FFFFFFFFFFFFF; + constexpr uint64_t hidden_bit_mask = 0x0010000000000000; + uint64_t bits; + ::memcpy(&bits, &value, sizeof(T)); + if ((bits & exponent_mask) == 0) { + // denormal + am.power2 = 1 - bias; + am.mantissa = bits & mantissa_mask; + } else { + // normal + am.power2 = int32_t((bits & exponent_mask) >> binary_format::mantissa_explicit_bits()); + am.power2 -= bias; + am.mantissa = (bits & mantissa_mask) | hidden_bit_mask; + } + } + + return am; +} + +// get the extended precision value of the halfway point between b and b+u. +// we are given a native float that represents b, so we need to adjust it +// halfway between b and b+u. 
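// --- Illustrative sketch, not part of the fast_float sources. ---
// If b is represented as m * 2^p, the halfway point b + ulp/2 is
// (2m + 1) * 2^(p-1), which is exactly the shift-and-increment performed by
// to_extended_halfway() below. Concrete, assumed numbers; excluded from the
// build:
#if 0
inline void halfway_example() {
  uint64_t m = 6;  int32_t p = -1;  // b = 6 * 2^-1 = 3.0, ulp = 2^-1 = 0.5
  uint64_t hm = (m << 1) + 1;       // 13
  int32_t hp = p - 1;               // -2
  // halfway value = 13 * 2^-2 = 3.25 = b + ulp/2
  (void)hm; (void)hp;
}
#endif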
+template +fastfloat_really_inline adjusted_mantissa to_extended_halfway(T value) noexcept { + adjusted_mantissa am = to_extended(value); + am.mantissa <<= 1; + am.mantissa += 1; + am.power2 -= 1; + return am; +} + +// round an extended-precision float to the nearest machine float. +template +fastfloat_really_inline void round(adjusted_mantissa& am, callback cb) noexcept { + int32_t mantissa_shift = 64 - binary_format::mantissa_explicit_bits() - 1; + if (-am.power2 >= mantissa_shift) { + // have a denormal float + int32_t shift = -am.power2 + 1; + cb(am, std::min(shift, 64)); + // check for round-up: if rounding-nearest carried us to the hidden bit. + am.power2 = (am.mantissa < (uint64_t(1) << binary_format::mantissa_explicit_bits())) ? 0 : 1; + return; + } + + // have a normal float, use the default shift. + cb(am, mantissa_shift); + + // check for carry + if (am.mantissa >= (uint64_t(2) << binary_format::mantissa_explicit_bits())) { + am.mantissa = (uint64_t(1) << binary_format::mantissa_explicit_bits()); + am.power2++; + } + + // check for infinite: we could have carried to an infinite power + am.mantissa &= ~(uint64_t(1) << binary_format::mantissa_explicit_bits()); + if (am.power2 >= binary_format::infinite_power()) { + am.power2 = binary_format::infinite_power(); + am.mantissa = 0; + } +} + +template +fastfloat_really_inline +void round_nearest_tie_even(adjusted_mantissa& am, int32_t shift, callback cb) noexcept { + uint64_t mask; + uint64_t halfway; + if (shift == 64) { + mask = UINT64_MAX; + } else { + mask = (uint64_t(1) << shift) - 1; + } + if (shift == 0) { + halfway = 0; + } else { + halfway = uint64_t(1) << (shift - 1); + } + uint64_t truncated_bits = am.mantissa & mask; + uint64_t is_above = truncated_bits > halfway; + uint64_t is_halfway = truncated_bits == halfway; + + // shift digits into position + if (shift == 64) { + am.mantissa = 0; + } else { + am.mantissa >>= shift; + } + am.power2 += shift; + + bool is_odd = (am.mantissa & 1) == 1; + am.mantissa += uint64_t(cb(is_odd, is_halfway, is_above)); +} + +fastfloat_really_inline void round_down(adjusted_mantissa& am, int32_t shift) noexcept { + if (shift == 64) { + am.mantissa = 0; + } else { + am.mantissa >>= shift; + } + am.power2 += shift; +} + +fastfloat_really_inline void skip_zeros(const char*& first, const char* last) noexcept { + uint64_t val; + while (std::distance(first, last) >= 8) { + ::memcpy(&val, first, sizeof(uint64_t)); + if (val != 0x3030303030303030) { + break; + } + first += 8; + } + while (first != last) { + if (*first != '0') { + break; + } + first++; + } +} + +// determine if any non-zero digits were truncated. +// all characters must be valid digits. +fastfloat_really_inline bool is_truncated(const char* first, const char* last) noexcept { + // do 8-bit optimizations, can just compare to 8 literal 0s. 
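// --- Illustrative sketch, not part of the fast_float sources. ---
// 0x3030303030303030 is simply eight ASCII '0' bytes (0x30 == '0'), so one
// 64-bit comparison answers "are the next eight characters all zeros?" for
// both skip_zeros() above and the truncation check below. Hypothetical helper
// with an assumed buffer, excluded from the build:
#if 0
inline bool next_eight_are_zeros(const char* p) {
  uint64_t v;
  ::memcpy(&v, p, sizeof(v));
  return v == 0x3030303030303030;  // true only if every byte is '0'
}
#endif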
+ uint64_t val; + while (std::distance(first, last) >= 8) { + ::memcpy(&val, first, sizeof(uint64_t)); + if (val != 0x3030303030303030) { + return true; + } + first += 8; + } + while (first != last) { + if (*first != '0') { + return true; + } + first++; + } + return false; +} + +fastfloat_really_inline bool is_truncated(byte_span s) noexcept { + return is_truncated(s.ptr, s.ptr + s.len()); +} + +fastfloat_really_inline +void parse_eight_digits(const char*& p, limb& value, size_t& counter, size_t& count) noexcept { + value = value * 100000000 + parse_eight_digits_unrolled(p); + p += 8; + counter += 8; + count += 8; +} + +fastfloat_really_inline +void parse_one_digit(const char*& p, limb& value, size_t& counter, size_t& count) noexcept { + value = value * 10 + limb(*p - '0'); + p++; + counter++; + count++; +} + +fastfloat_really_inline +void add_native(bigint& big, limb power, limb value) noexcept { + big.mul(power); + big.add(value); +} + +fastfloat_really_inline void round_up_bigint(bigint& big, size_t& count) noexcept { + // need to round-up the digits, but need to avoid rounding + // ....9999 to ...10000, which could cause a false halfway point. + add_native(big, 10, 1); + count++; +} + +// parse the significant digits into a big integer +inline void parse_mantissa(bigint& result, parsed_number_string& num, size_t max_digits, size_t& digits) noexcept { + // try to minimize the number of big integer and scalar multiplication. + // therefore, try to parse 8 digits at a time, and multiply by the largest + // scalar value (9 or 19 digits) for each step. + size_t counter = 0; + digits = 0; + limb value = 0; +#ifdef FASTFLOAT_64BIT_LIMB + size_t step = 19; +#else + size_t step = 9; +#endif + + // process all integer digits. + const char* p = num.integer.ptr; + const char* pend = p + num.integer.len(); + skip_zeros(p, pend); + // process all digits, in increments of step per loop + while (p != pend) { + while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) { + parse_eight_digits(p, value, counter, digits); + } + while (counter < step && p != pend && digits < max_digits) { + parse_one_digit(p, value, counter, digits); + } + if (digits == max_digits) { + // add the temporary value, then check if we've truncated any digits + add_native(result, limb(powers_of_ten_uint64[counter]), value); + bool truncated = is_truncated(p, pend); + if (num.fraction.ptr != nullptr) { + truncated |= is_truncated(num.fraction); + } + if (truncated) { + round_up_bigint(result, digits); + } + return; + } else { + add_native(result, limb(powers_of_ten_uint64[counter]), value); + counter = 0; + value = 0; + } + } + + // add our fraction digits, if they're available. 
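// --- Illustrative sketch, not part of the fast_float sources. ---
// parse_mantissa() batches up to `step` decimal digits (19 per 64-bit limb)
// into a native integer `value`, then folds the batch into the big integer
// with a single multiply-and-add: result <- result * 10^counter + value (see
// add_native above). The same invariant on plain uint64_t values, with
// assumed inputs and excluded from the build:
#if 0
inline void batching_example() {
  uint64_t result = 123;            // digits consumed so far: "123"
  uint64_t value = 4567;            // next batch of digits: "4567"
  result = result * 10000 + value;  // 10^counter with counter == 4 -> 1234567
  (void)result;
}
#endif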
+ if (num.fraction.ptr != nullptr) { + p = num.fraction.ptr; + pend = p + num.fraction.len(); + if (digits == 0) { + skip_zeros(p, pend); + } + // process all digits, in increments of step per loop + while (p != pend) { + while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) { + parse_eight_digits(p, value, counter, digits); + } + while (counter < step && p != pend && digits < max_digits) { + parse_one_digit(p, value, counter, digits); + } + if (digits == max_digits) { + // add the temporary value, then check if we've truncated any digits + add_native(result, limb(powers_of_ten_uint64[counter]), value); + bool truncated = is_truncated(p, pend); + if (truncated) { + round_up_bigint(result, digits); + } + return; + } else { + add_native(result, limb(powers_of_ten_uint64[counter]), value); + counter = 0; + value = 0; + } + } + } + + if (counter != 0) { + add_native(result, limb(powers_of_ten_uint64[counter]), value); + } +} + +template +inline adjusted_mantissa positive_digit_comp(bigint& bigmant, int32_t exponent) noexcept { + FASTFLOAT_ASSERT(bigmant.pow10(uint32_t(exponent))); + adjusted_mantissa answer; + bool truncated; + answer.mantissa = bigmant.hi64(truncated); + int bias = binary_format::mantissa_explicit_bits() - binary_format::minimum_exponent(); + answer.power2 = bigmant.bit_length() - 64 + bias; + + round(answer, [truncated](adjusted_mantissa& a, int32_t shift) { + round_nearest_tie_even(a, shift, [truncated](bool is_odd, bool is_halfway, bool is_above) -> bool { + return is_above || (is_halfway && truncated) || (is_odd && is_halfway); + }); + }); + + return answer; +} + +// the scaling here is quite simple: we have, for the real digits `m * 10^e`, +// and for the theoretical digits `n * 2^f`. Since `e` is always negative, +// to scale them identically, we do `n * 2^f * 5^-f`, so we now have `m * 2^e`. +// we then need to scale by `2^(f- e)`, and then the two significant digits +// are of the same magnitude. +template +inline adjusted_mantissa negative_digit_comp(bigint& bigmant, adjusted_mantissa am, int32_t exponent) noexcept { + bigint& real_digits = bigmant; + int32_t real_exp = exponent; + + // get the value of `b`, rounded down, and get a bigint representation of b+h + adjusted_mantissa am_b = am; + // gcc7 buf: use a lambda to remove the noexcept qualifier bug with -Wnoexcept-type. + round(am_b, [](adjusted_mantissa&a, int32_t shift) { round_down(a, shift); }); + T b; + to_float(false, am_b, b); + adjusted_mantissa theor = to_extended_halfway(b); + bigint theor_digits(theor.mantissa); + int32_t theor_exp = theor.power2; + + // scale real digits and theor digits to be same power. 
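// --- Illustrative sketch, not part of the fast_float sources. ---
// The scaling below compares the real digits m * 10^e (with e < 0) against the
// theoretical halfway value n * 2^f without any division: multiply the
// theoretical value by 5^-e and then shift it by 2^(f - e), or shift the real
// digits by 2^(e - f) when that difference is negative. Worked example with
// small, assumed numbers; excluded from the build:
#if 0
inline void scaling_example() {
  // real digits:         m = 1234, e = -2  -> value 12.34
  // theoretical halfway: n = 3,    f =  2  -> value 12
  uint64_t theor = 3;
  theor *= 25;                  // * 5^-e = 5^2
  theor <<= 4;                  // * 2^(f - e) = 2^4, giving 1200
  uint64_t real_digits = 1234;
  // real_digits > theor, i.e. 12.34 lies above the halfway point -> round up.
  (void)theor; (void)real_digits;
}
#endif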
+ int32_t pow2_exp = theor_exp - real_exp; + uint32_t pow5_exp = uint32_t(-real_exp); + if (pow5_exp != 0) { + FASTFLOAT_ASSERT(theor_digits.pow5(pow5_exp)); + } + if (pow2_exp > 0) { + FASTFLOAT_ASSERT(theor_digits.pow2(uint32_t(pow2_exp))); + } else if (pow2_exp < 0) { + FASTFLOAT_ASSERT(real_digits.pow2(uint32_t(-pow2_exp))); + } + + // compare digits, and use it to director rounding + int ord = real_digits.compare(theor_digits); + adjusted_mantissa answer = am; + round(answer, [ord](adjusted_mantissa& a, int32_t shift) { + round_nearest_tie_even(a, shift, [ord](bool is_odd, bool _, bool __) -> bool { + (void)_; // not needed, since we've done our comparison + (void)__; // not needed, since we've done our comparison + if (ord > 0) { + return true; + } else if (ord < 0) { + return false; + } else { + return is_odd; + } + }); + }); + + return answer; +} + +// parse the significant digits as a big integer to unambiguously round the +// the significant digits. here, we are trying to determine how to round +// an extended float representation close to `b+h`, halfway between `b` +// (the float rounded-down) and `b+u`, the next positive float. this +// algorithm is always correct, and uses one of two approaches. when +// the exponent is positive relative to the significant digits (such as +// 1234), we create a big-integer representation, get the high 64-bits, +// determine if any lower bits are truncated, and use that to direct +// rounding. in case of a negative exponent relative to the significant +// digits (such as 1.2345), we create a theoretical representation of +// `b` as a big-integer type, scaled to the same binary exponent as +// the actual digits. we then compare the big integer representations +// of both, and use that to direct rounding. +template +inline adjusted_mantissa digit_comp(parsed_number_string& num, adjusted_mantissa am) noexcept { + // remove the invalid exponent bias + am.power2 -= invalid_am_bias; + + int32_t sci_exp = scientific_exponent(num); + size_t max_digits = binary_format::max_digits(); + size_t digits = 0; + bigint bigmant; + parse_mantissa(bigmant, num, max_digits, digits); + // can't underflow, since digits is at most max_digits. + int32_t exponent = sci_exp + 1 - int32_t(digits); + if (exponent >= 0) { + return positive_digit_comp(bigmant, exponent); + } else { + return negative_digit_comp(bigmant, am, exponent); + } +} + +} // namespace fast_float + +#endif + + +#ifndef FASTFLOAT_PARSE_NUMBER_H +#define FASTFLOAT_PARSE_NUMBER_H + + +//included above: +//#include +//included above: +//#include +//included above: +//#include +//included above: +//#include + +namespace fast_float { + + +namespace detail { +/** + * Special case +inf, -inf, nan, infinity, -infinity. + * The case comparisons could be made much faster given that we know that the + * strings a null-free and fixed. + **/ +template +from_chars_result parse_infnan(const char *first, const char *last, T &value) noexcept { + from_chars_result answer; + answer.ptr = first; + answer.ec = std::errc(); // be optimistic + bool minusSign = false; + if (*first == '-') { // assume first < last, so dereference without checks; C++17 20.19.3.(7.1) explicitly forbids '+' here + minusSign = true; + ++first; + } + if (last - first >= 3) { + if (fastfloat_strncasecmp(first, "nan", 3)) { + answer.ptr = (first += 3); + value = minusSign ? -std::numeric_limits::quiet_NaN() : std::numeric_limits::quiet_NaN(); + // Check for possible nan(n-char-seq-opt), C++17 20.19.3.7, C11 7.20.1.3.3. 
At least MSVC produces nan(ind) and nan(snan). + if(first != last && *first == '(') { + for(const char* ptr = first + 1; ptr != last; ++ptr) { + if (*ptr == ')') { + answer.ptr = ptr + 1; // valid nan(n-char-seq-opt) + break; + } + else if(!(('a' <= *ptr && *ptr <= 'z') || ('A' <= *ptr && *ptr <= 'Z') || ('0' <= *ptr && *ptr <= '9') || *ptr == '_')) + break; // forbidden char, not nan(n-char-seq-opt) + } + } + return answer; + } + if (fastfloat_strncasecmp(first, "inf", 3)) { + if ((last - first >= 8) && fastfloat_strncasecmp(first + 3, "inity", 5)) { + answer.ptr = first + 8; + } else { + answer.ptr = first + 3; + } + value = minusSign ? -std::numeric_limits::infinity() : std::numeric_limits::infinity(); + return answer; + } + } + answer.ec = std::errc::invalid_argument; + return answer; +} + +} // namespace detail + +template +from_chars_result from_chars(const char *first, const char *last, + T &value, chars_format fmt /*= chars_format::general*/) noexcept { + return from_chars_advanced(first, last, value, parse_options{fmt}); +} + +template +from_chars_result from_chars_advanced(const char *first, const char *last, + T &value, parse_options options) noexcept { + + static_assert (std::is_same::value || std::is_same::value, "only float and double are supported"); + + + from_chars_result answer; + if (first == last) { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + return answer; + } + parsed_number_string pns = parse_number_string(first, last, options); + if (!pns.valid) { + return detail::parse_infnan(first, last, value); + } + answer.ec = std::errc(); // be optimistic + answer.ptr = pns.lastmatch; + // Next is Clinger's fast path. + if (binary_format::min_exponent_fast_path() <= pns.exponent && pns.exponent <= binary_format::max_exponent_fast_path() && pns.mantissa <=binary_format::max_mantissa_fast_path() && !pns.too_many_digits) { + value = T(pns.mantissa); + if (pns.exponent < 0) { value = value / binary_format::exact_power_of_ten(-pns.exponent); } + else { value = value * binary_format::exact_power_of_ten(pns.exponent); } + if (pns.negative) { value = -value; } + return answer; + } + adjusted_mantissa am = compute_float>(pns.exponent, pns.mantissa); + if(pns.too_many_digits && am.power2 >= 0) { + if(am != compute_float>(pns.exponent, pns.mantissa + 1)) { + am = compute_error>(pns.exponent, pns.mantissa); + } + } + // If we called compute_float>(pns.exponent, pns.mantissa) and we have an invalid power (am.power2 < 0), + // then we need to go the long way around again. This is very uncommon. 
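+  // Editorial note (not part of fast_float): from_chars_advanced() thus escalates through
+  // three strategies -- Clinger's exact-power-of-ten fast path, the extended-precision
+  // compute_float path (recomputed at mantissa+1 when digits were truncated, to detect a
+  // rounding ambiguity), and, only when that still reports an invalid power below, the
+  // big-integer digit_comp() fallback. A minimal usage sketch of the public entry point
+  // declared above (illustrative only):
+  //
+  //   double v{};
+  //   const char s[] = "3.14159265358979323846";
+  //   auto r = fast_float::from_chars(s, s + sizeof(s) - 1, v);
+  //   if (r.ec == std::errc()) { /* v holds the correctly rounded double */ }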
+ if(am.power2 < 0) { am = digit_comp(pns, am); } + to_float(pns.negative, am, value); + return answer; +} + +} // namespace fast_float + +#endif + +#ifdef _MSC_VER +# pragma warning(pop) +#elif defined(__clang__) || defined(__APPLE_CC__) +# pragma clang diagnostic pop +#elif defined(__GNUC__) +# pragma GCC diagnostic pop +#endif + +#endif // _C4_EXT_FAST_FLOAT_HPP_ + + +// (end https://github.com/biojppm/c4core/src/c4/ext/fast_float.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/std/vector_fwd.hpp +// https://github.com/biojppm/c4core/src/c4/std/vector_fwd.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_STD_VECTOR_FWD_HPP_ +#define _C4_STD_VECTOR_FWD_HPP_ + +/** @file vector_fwd.hpp */ + +//included above: +//#include + +// forward declarations for std::vector +#if defined(__GLIBCXX__) || defined(__GLIBCPP__) || defined(_MSC_VER) +#if defined(_MSC_VER) +__pragma(warning(push)) +__pragma(warning(disable : 4643)) +#endif +namespace std { +template class allocator; +template class vector; +} // namespace std +#if defined(_MSC_VER) +__pragma(warning(pop)) +#endif +#elif defined(_LIBCPP_ABI_NAMESPACE) +namespace std { +inline namespace _LIBCPP_ABI_NAMESPACE { +template class allocator; +template class vector; +} // namespace _LIBCPP_ABI_NAMESPACE +} // namespace std +#else +#error "unknown standard library" +#endif + +#ifndef C4CORE_SINGLE_HEADER +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/substr_fwd.hpp +//#include "c4/substr_fwd.hpp" +#if !defined(C4_SUBSTR_FWD_HPP_) && !defined(_C4_SUBSTR_FWD_HPP_) +#error "amalgamate: file c4/substr_fwd.hpp must have been included at this point" +#endif /* C4_SUBSTR_FWD_HPP_ */ + +#endif + +namespace c4 { + +template c4::substr to_substr(std::vector &vec); +template c4::csubstr to_csubstr(std::vector const& vec); + +template bool operator!= (c4::csubstr ss, std::vector const& s); +template bool operator== (c4::csubstr ss, std::vector const& s); +template bool operator>= (c4::csubstr ss, std::vector const& s); +template bool operator> (c4::csubstr ss, std::vector const& s); +template bool operator<= (c4::csubstr ss, std::vector const& s); +template bool operator< (c4::csubstr ss, std::vector const& s); + +template bool operator!= (std::vector const& s, c4::csubstr ss); +template bool operator== (std::vector const& s, c4::csubstr ss); +template bool operator>= (std::vector const& s, c4::csubstr ss); +template bool operator> (std::vector const& s, c4::csubstr ss); +template bool operator<= (std::vector const& s, c4::csubstr ss); +template bool operator< (std::vector const& s, c4::csubstr ss); + +template size_t to_chars(c4::substr buf, std::vector const& s); +template bool from_chars(c4::csubstr buf, std::vector * s); + +} // namespace c4 + +#endif // _C4_STD_VECTOR_FWD_HPP_ + + +// (end https://github.com/biojppm/c4core/src/c4/std/vector_fwd.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/std/string_fwd.hpp +// https://github.com/biojppm/c4core/src/c4/std/string_fwd.hpp +//-------------------------------------------------------------------------------- 
+//******************************************************************************** + +#ifndef _C4_STD_STRING_FWD_HPP_ +#define _C4_STD_STRING_FWD_HPP_ + +/** @file string_fwd.hpp */ + +#ifndef DOXYGEN + +#ifndef C4CORE_SINGLE_HEADER +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/substr_fwd.hpp +//#include "c4/substr_fwd.hpp" +#if !defined(C4_SUBSTR_FWD_HPP_) && !defined(_C4_SUBSTR_FWD_HPP_) +#error "amalgamate: file c4/substr_fwd.hpp must have been included at this point" +#endif /* C4_SUBSTR_FWD_HPP_ */ + +#endif + +//included above: +//#include + +// forward declarations for std::string +#if defined(__GLIBCXX__) || defined(__GLIBCPP__) +#include // use the fwd header in glibcxx +#elif defined(_LIBCPP_VERSION) || defined(__APPLE_CC__) +#include // use the fwd header in stdlibc++ +#elif defined(_MSC_VER) +//! @todo is there a fwd header in msvc? +namespace std { +template struct char_traits; +template class allocator; +template class basic_string; +using string = basic_string, allocator>; +} /* namespace std */ +#else +#error "unknown standard library" +#endif + +namespace c4 { + +C4_ALWAYS_INLINE c4::substr to_substr(std::string &s) noexcept; +C4_ALWAYS_INLINE c4::csubstr to_csubstr(std::string const& s) noexcept; + +bool operator== (c4::csubstr ss, std::string const& s); +bool operator!= (c4::csubstr ss, std::string const& s); +bool operator>= (c4::csubstr ss, std::string const& s); +bool operator> (c4::csubstr ss, std::string const& s); +bool operator<= (c4::csubstr ss, std::string const& s); +bool operator< (c4::csubstr ss, std::string const& s); + +bool operator== (std::string const& s, c4::csubstr ss); +bool operator!= (std::string const& s, c4::csubstr ss); +bool operator>= (std::string const& s, c4::csubstr ss); +bool operator> (std::string const& s, c4::csubstr ss); +bool operator<= (std::string const& s, c4::csubstr ss); +bool operator< (std::string const& s, c4::csubstr ss); + +size_t to_chars(c4::substr buf, std::string const& s); +bool from_chars(c4::csubstr buf, std::string * s); + +} // namespace c4 + +#endif // DOXYGEN +#endif // _C4_STD_STRING_FWD_HPP_ + + +// (end https://github.com/biojppm/c4core/src/c4/std/string_fwd.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/std/std_fwd.hpp +// https://github.com/biojppm/c4core/src/c4/std/std_fwd.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_STD_STD_FWD_HPP_ +#define _C4_STD_STD_FWD_HPP_ + +/** @file std_fwd.hpp includes all c4-std interop fwd files */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/std/vector_fwd.hpp +//#include "c4/std/vector_fwd.hpp" +#if !defined(C4_STD_VECTOR_FWD_HPP_) && !defined(_C4_STD_VECTOR_FWD_HPP_) +#error "amalgamate: file c4/std/vector_fwd.hpp must have been included at this point" +#endif /* C4_STD_VECTOR_FWD_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/std/string_fwd.hpp +//#include "c4/std/string_fwd.hpp" +#if !defined(C4_STD_STRING_FWD_HPP_) && !defined(_C4_STD_STRING_FWD_HPP_) +#error "amalgamate: file c4/std/string_fwd.hpp must have been included at this point" +#endif /* C4_STD_STRING_FWD_HPP_ */ + +//#include "c4/std/tuple_fwd.hpp" + +#endif // _C4_STD_STD_FWD_HPP_ + + +// (end 
https://github.com/biojppm/c4core/src/c4/std/std_fwd.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/charconv.hpp +// https://github.com/biojppm/c4core/src/c4/charconv.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_CHARCONV_HPP_ +#define _C4_CHARCONV_HPP_ + +/** @file charconv.hpp Lightweight generic type-safe wrappers for + * converting individual values to/from strings. + * + * These are the main functions: + * + * @code{.cpp} + * // Convert the given value, writing into the string. + * // The resulting string will NOT be null-terminated. + * // Return the number of characters needed. + * // This function is safe to call when the string is too small - + * // no writes will occur beyond the string's last character. + * template size_t c4::to_chars(substr buf, T const& C4_RESTRICT val); + * + * + * // Convert the given value to a string using to_chars(), and + * // return the resulting string, up to and including the last + * // written character. + * template substr c4::to_chars_sub(substr buf, T const& C4_RESTRICT val); + * + * + * // Read a value from the string, which must be + * // trimmed to the value (ie, no leading/trailing whitespace). + * // return true if the conversion succeeded. + * // There is no check for overflow; the value wraps around in a way similar + * // to the standard C/C++ overflow behavior. For example, + * // from_chars("128", &val) returns true and val will be + * // set tot 0. + * template bool c4::from_chars(csubstr buf, T * C4_RESTRICT val); + * + * + * // Read the first valid sequence of characters from the string, + * // skipping leading whitespace, and convert it using from_chars(). + * // Return the number of characters read for converting. 
+ * template size_t c4::from_chars_first(csubstr buf, T * C4_RESTRICT val); + * @endcode + */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/language.hpp +//#include "c4/language.hpp" +#if !defined(C4_LANGUAGE_HPP_) && !defined(_C4_LANGUAGE_HPP_) +#error "amalgamate: file c4/language.hpp must have been included at this point" +#endif /* C4_LANGUAGE_HPP_ */ + +//included above: +//#include +//included above: +//#include +//included above: +//#include +//included above: +//#include +//included above: +//#include + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/config.hpp +//#include "c4/config.hpp" +#if !defined(C4_CONFIG_HPP_) && !defined(_C4_CONFIG_HPP_) +#error "amalgamate: file c4/config.hpp must have been included at this point" +#endif /* C4_CONFIG_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/substr.hpp +//#include "c4/substr.hpp" +#if !defined(C4_SUBSTR_HPP_) && !defined(_C4_SUBSTR_HPP_) +#error "amalgamate: file c4/substr.hpp must have been included at this point" +#endif /* C4_SUBSTR_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/std/std_fwd.hpp +//#include "c4/std/std_fwd.hpp" +#if !defined(C4_STD_STD_FWD_HPP_) && !defined(_C4_STD_STD_FWD_HPP_) +#error "amalgamate: file c4/std/std_fwd.hpp must have been included at this point" +#endif /* C4_STD_STD_FWD_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/memory_util.hpp +//#include "c4/memory_util.hpp" +#if !defined(C4_MEMORY_UTIL_HPP_) && !defined(_C4_MEMORY_UTIL_HPP_) +#error "amalgamate: file c4/memory_util.hpp must have been included at this point" +#endif /* C4_MEMORY_UTIL_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/szconv.hpp +//#include "c4/szconv.hpp" +#if !defined(C4_SZCONV_HPP_) && !defined(_C4_SZCONV_HPP_) +#error "amalgamate: file c4/szconv.hpp must have been included at this point" +#endif /* C4_SZCONV_HPP_ */ + + +#ifndef C4CORE_NO_FAST_FLOAT +# if (C4_CPP >= 17) +# if defined(_MSC_VER) +# if (C4_MSVC_VERSION >= C4_MSVC_VERSION_2019) // VS2017 and lower do not have these macros +# include +# define C4CORE_HAVE_STD_TOCHARS 1 +# define C4CORE_HAVE_STD_FROMCHARS 0 // prefer fast_float with MSVC +# define C4CORE_HAVE_FAST_FLOAT 1 +# else +# define C4CORE_HAVE_STD_TOCHARS 0 +# define C4CORE_HAVE_STD_FROMCHARS 0 +# define C4CORE_HAVE_FAST_FLOAT 1 +# endif +# else +# if __has_include() +//included above: +//# include +# if defined(__cpp_lib_to_chars) +# define C4CORE_HAVE_STD_TOCHARS 1 +# define C4CORE_HAVE_STD_FROMCHARS 0 // glibc uses fast_float internally +# define C4CORE_HAVE_FAST_FLOAT 1 +# else +# define C4CORE_HAVE_STD_TOCHARS 0 +# define C4CORE_HAVE_STD_FROMCHARS 0 +# define C4CORE_HAVE_FAST_FLOAT 1 +# endif +# else +# define C4CORE_HAVE_STD_TOCHARS 0 +# define C4CORE_HAVE_STD_FROMCHARS 0 +# define C4CORE_HAVE_FAST_FLOAT 1 +# endif +# endif +# else +# define C4CORE_HAVE_STD_TOCHARS 0 +# define C4CORE_HAVE_STD_FROMCHARS 0 +# define C4CORE_HAVE_FAST_FLOAT 1 +# endif +# if C4CORE_HAVE_FAST_FLOAT + C4_SUPPRESS_WARNING_GCC_WITH_PUSH("-Wsign-conversion") + C4_SUPPRESS_WARNING_GCC("-Warray-bounds") +# if __GNUC__ >= 5 + C4_SUPPRESS_WARNING_GCC("-Wshift-count-overflow") +# endif +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/ext/fast_float.hpp +//# include "c4/ext/fast_float.hpp" +#if !defined(C4_EXT_FAST_FLOAT_HPP_) && !defined(_C4_EXT_FAST_FLOAT_HPP_) 
+#error "amalgamate: file c4/ext/fast_float.hpp must have been included at this point" +#endif /* C4_EXT_FAST_FLOAT_HPP_ */ + + C4_SUPPRESS_WARNING_GCC_POP +# endif +#elif (C4_CPP >= 17) +# define C4CORE_HAVE_FAST_FLOAT 0 +# if defined(_MSC_VER) +# if (C4_MSVC_VERSION >= C4_MSVC_VERSION_2019) // VS2017 and lower do not have these macros +//included above: +//# include +# define C4CORE_HAVE_STD_TOCHARS 1 +# define C4CORE_HAVE_STD_FROMCHARS 1 +# else +# define C4CORE_HAVE_STD_TOCHARS 0 +# define C4CORE_HAVE_STD_FROMCHARS 0 +# endif +# else +# if __has_include() +//included above: +//# include +# if defined(__cpp_lib_to_chars) +# define C4CORE_HAVE_STD_TOCHARS 1 +# define C4CORE_HAVE_STD_FROMCHARS 1 // glibc uses fast_float internally +# else +# define C4CORE_HAVE_STD_TOCHARS 0 +# define C4CORE_HAVE_STD_FROMCHARS 0 +# endif +# else +# define C4CORE_HAVE_STD_TOCHARS 0 +# define C4CORE_HAVE_STD_FROMCHARS 0 +# endif +# endif +#else +# define C4CORE_HAVE_STD_TOCHARS 0 +# define C4CORE_HAVE_STD_FROMCHARS 0 +# define C4CORE_HAVE_FAST_FLOAT 0 +#endif + + +#if !C4CORE_HAVE_STD_FROMCHARS +#include +#endif + + +#ifdef _MSC_VER +# pragma warning(push) +# if C4_MSVC_VERSION != C4_MSVC_VERSION_2017 +# pragma warning(disable: 4800) //'int': forcing value to bool 'true' or 'false' (performance warning) +# endif +# pragma warning(disable: 4996) // snprintf/scanf: this function or variable may be unsafe +#elif defined(__clang__) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wtautological-constant-out-of-range-compare" +# pragma clang diagnostic ignored "-Wformat-nonliteral" +# pragma clang diagnostic ignored "-Wdouble-promotion" // implicit conversion increases floating-point precision +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wformat-nonliteral" +# pragma GCC diagnostic ignored "-Wdouble-promotion" // implicit conversion increases floating-point precision +# pragma GCC diagnostic ignored "-Wuseless-cast" +#endif + + +namespace c4 { + +#if C4CORE_HAVE_STD_TOCHARS +/** @warning Use only the symbol. Do not rely on the type or naked value of this enum. */ +typedef enum : std::underlying_type::type { + /** print the real number in floating point format (like %f) */ + FTOA_FLOAT = static_cast::type>(std::chars_format::fixed), + /** print the real number in scientific format (like %e) */ + FTOA_SCIENT = static_cast::type>(std::chars_format::scientific), + /** print the real number in flexible format (like %g) */ + FTOA_FLEX = static_cast::type>(std::chars_format::general), + /** print the real number in hexadecimal format (like %a) */ + FTOA_HEXA = static_cast::type>(std::chars_format::hex), +} RealFormat_e; +#else +/** @warning Use only the symbol. Do not rely on the type or naked value of this enum. 
*/ +typedef enum : char { + /** print the real number in floating point format (like %f) */ + FTOA_FLOAT = 'f', + /** print the real number in scientific format (like %e) */ + FTOA_SCIENT = 'e', + /** print the real number in flexible format (like %g) */ + FTOA_FLEX = 'g', + /** print the real number in hexadecimal format (like %a) */ + FTOA_HEXA = 'a', +} RealFormat_e; +#endif + + +/** in some platforms, int,unsigned int + * are not any of int8_t...int64_t and + * long,unsigned long are not any of uint8_t...uint64_t */ +template +struct is_fixed_length +{ + enum : bool { + /** true if T is one of the fixed length signed types */ + value_i = (std::is_integral::value + && (std::is_same::value + || std::is_same::value + || std::is_same::value + || std::is_same::value)), + /** true if T is one of the fixed length unsigned types */ + value_u = (std::is_integral::value + && (std::is_same::value + || std::is_same::value + || std::is_same::value + || std::is_same::value)), + /** true if T is one of the fixed length signed or unsigned types */ + value = value_i || value_u + }; +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +#ifdef _MSC_VER +# pragma warning(push) +#elif defined(__clang__) +# pragma clang diagnostic push +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wconversion" +# if __GNUC__ >= 6 +# pragma GCC diagnostic ignored "-Wnull-dereference" +# endif +#endif + +namespace detail { + +/* python command to get the values below: +def dec(v): + return str(v) +for bits in (8, 16, 32, 64): + imin, imax, umax = (-(1 << (bits - 1))), (1 << (bits - 1)) - 1, (1 << bits) - 1 + for vname, v in (("imin", imin), ("imax", imax), ("umax", umax)): + for f in (bin, oct, dec, hex): + print(f"{bits}b: {vname}={v} {f.__name__}: len={len(f(v)):2d}: {v} {f(v)}") +*/ + +// do not use the type as the template argument because in some +// platforms long!=int32 and long!=int64. Just use the numbytes +// which is more generic and spares lengthy SFINAE code. +template struct charconv_digits_; +template using charconv_digits = charconv_digits_::value>; + +template<> struct charconv_digits_<1u, true> // int8_t +{ + enum : size_t { + maxdigits_bin = 1 + 2 + 8, // -128==-0b10000000 + maxdigits_oct = 1 + 2 + 3, // -128==-0o200 + maxdigits_dec = 1 + 3, // -128 + maxdigits_hex = 1 + 2 + 2, // -128==-0x80 + maxdigits_bin_nopfx = 8, // -128==-0b10000000 + maxdigits_oct_nopfx = 3, // -128==-0o200 + maxdigits_dec_nopfx = 3, // -128 + maxdigits_hex_nopfx = 2, // -128==-0x80 + }; + // min values without sign! 
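+    // Editorial note (not part of c4core): these strings omit the '-' because the magnitude
+    // of the minimum value cannot be represented in the signed type itself, so itoa()
+    // special-cases numeric_limits<T>::min() and emits the sign plus one of these precomputed
+    // digit strings instead of negating. Illustrative only (using a 64-bit type, where the
+    // negation is outright undefined rather than merely out of range):
+    //
+    //   int64_t v = std::numeric_limits<int64_t>::min();
+    //   int64_t n = -v; // undefined behaviour: 2^63 is not representable in int64_t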
+ static constexpr csubstr min_value_dec() noexcept { return csubstr("128"); } + static constexpr csubstr min_value_hex() noexcept { return csubstr("80"); } + static constexpr csubstr min_value_oct() noexcept { return csubstr("200"); } + static constexpr csubstr min_value_bin() noexcept { return csubstr("10000000"); } + static constexpr csubstr max_value_dec() noexcept { return csubstr("127"); } + static constexpr bool is_oct_overflow(csubstr str) noexcept { return !((str.len < 3) || (str.len == 3 && str[0] <= '1')); } +}; +template<> struct charconv_digits_<1u, false> // uint8_t +{ + enum : size_t { + maxdigits_bin = 2 + 8, // 255 0b11111111 + maxdigits_oct = 2 + 3, // 255 0o377 + maxdigits_dec = 3, // 255 + maxdigits_hex = 2 + 2, // 255 0xff + maxdigits_bin_nopfx = 8, // 255 0b11111111 + maxdigits_oct_nopfx = 3, // 255 0o377 + maxdigits_dec_nopfx = 3, // 255 + maxdigits_hex_nopfx = 2, // 255 0xff + }; + static constexpr csubstr max_value_dec() noexcept { return csubstr("255"); } + static constexpr bool is_oct_overflow(csubstr str) noexcept { return !((str.len < 3) || (str.len == 3 && str[0] <= '3')); } +}; +template<> struct charconv_digits_<2u, true> // int16_t +{ + enum : size_t { + maxdigits_bin = 1 + 2 + 16, // -32768 -0b1000000000000000 + maxdigits_oct = 1 + 2 + 6, // -32768 -0o100000 + maxdigits_dec = 1 + 5, // -32768 -32768 + maxdigits_hex = 1 + 2 + 4, // -32768 -0x8000 + maxdigits_bin_nopfx = 16, // -32768 -0b1000000000000000 + maxdigits_oct_nopfx = 6, // -32768 -0o100000 + maxdigits_dec_nopfx = 5, // -32768 -32768 + maxdigits_hex_nopfx = 4, // -32768 -0x8000 + }; + // min values without sign! + static constexpr csubstr min_value_dec() noexcept { return csubstr("32768"); } + static constexpr csubstr min_value_hex() noexcept { return csubstr("8000"); } + static constexpr csubstr min_value_oct() noexcept { return csubstr("100000"); } + static constexpr csubstr min_value_bin() noexcept { return csubstr("1000000000000000"); } + static constexpr csubstr max_value_dec() noexcept { return csubstr("32767"); } + static constexpr bool is_oct_overflow(csubstr str) noexcept { return !((str.len < 6)); } +}; +template<> struct charconv_digits_<2u, false> // uint16_t +{ + enum : size_t { + maxdigits_bin = 2 + 16, // 65535 0b1111111111111111 + maxdigits_oct = 2 + 6, // 65535 0o177777 + maxdigits_dec = 6, // 65535 65535 + maxdigits_hex = 2 + 4, // 65535 0xffff + maxdigits_bin_nopfx = 16, // 65535 0b1111111111111111 + maxdigits_oct_nopfx = 6, // 65535 0o177777 + maxdigits_dec_nopfx = 6, // 65535 65535 + maxdigits_hex_nopfx = 4, // 65535 0xffff + }; + static constexpr csubstr max_value_dec() noexcept { return csubstr("65535"); } + static constexpr bool is_oct_overflow(csubstr str) noexcept { return !((str.len < 6) || (str.len == 6 && str[0] <= '1')); } +}; +template<> struct charconv_digits_<4u, true> // int32_t +{ + enum : size_t { + maxdigits_bin = 1 + 2 + 32, // len=35: -2147483648 -0b10000000000000000000000000000000 + maxdigits_oct = 1 + 2 + 11, // len=14: -2147483648 -0o20000000000 + maxdigits_dec = 1 + 10, // len=11: -2147483648 -2147483648 + maxdigits_hex = 1 + 2 + 8, // len=11: -2147483648 -0x80000000 + maxdigits_bin_nopfx = 32, // len=35: -2147483648 -0b10000000000000000000000000000000 + maxdigits_oct_nopfx = 11, // len=14: -2147483648 -0o20000000000 + maxdigits_dec_nopfx = 10, // len=11: -2147483648 -2147483648 + maxdigits_hex_nopfx = 8, // len=11: -2147483648 -0x80000000 + }; + // min values without sign! 
+ static constexpr csubstr min_value_dec() noexcept { return csubstr("2147483648"); } + static constexpr csubstr min_value_hex() noexcept { return csubstr("80000000"); } + static constexpr csubstr min_value_oct() noexcept { return csubstr("20000000000"); } + static constexpr csubstr min_value_bin() noexcept { return csubstr("10000000000000000000000000000000"); } + static constexpr csubstr max_value_dec() noexcept { return csubstr("2147483647"); } + static constexpr bool is_oct_overflow(csubstr str) noexcept { return !((str.len < 11) || (str.len == 11 && str[0] <= '1')); } +}; +template<> struct charconv_digits_<4u, false> // uint32_t +{ + enum : size_t { + maxdigits_bin = 2 + 32, // len=34: 4294967295 0b11111111111111111111111111111111 + maxdigits_oct = 2 + 11, // len=13: 4294967295 0o37777777777 + maxdigits_dec = 10, // len=10: 4294967295 4294967295 + maxdigits_hex = 2 + 8, // len=10: 4294967295 0xffffffff + maxdigits_bin_nopfx = 32, // len=34: 4294967295 0b11111111111111111111111111111111 + maxdigits_oct_nopfx = 11, // len=13: 4294967295 0o37777777777 + maxdigits_dec_nopfx = 10, // len=10: 4294967295 4294967295 + maxdigits_hex_nopfx = 8, // len=10: 4294967295 0xffffffff + }; + static constexpr csubstr max_value_dec() noexcept { return csubstr("4294967295"); } + static constexpr bool is_oct_overflow(csubstr str) noexcept { return !((str.len < 11) || (str.len == 11 && str[0] <= '3')); } +}; +template<> struct charconv_digits_<8u, true> // int32_t +{ + enum : size_t { + maxdigits_bin = 1 + 2 + 64, // len=67: -9223372036854775808 -0b1000000000000000000000000000000000000000000000000000000000000000 + maxdigits_oct = 1 + 2 + 22, // len=25: -9223372036854775808 -0o1000000000000000000000 + maxdigits_dec = 1 + 19, // len=20: -9223372036854775808 -9223372036854775808 + maxdigits_hex = 1 + 2 + 16, // len=19: -9223372036854775808 -0x8000000000000000 + maxdigits_bin_nopfx = 64, // len=67: -9223372036854775808 -0b1000000000000000000000000000000000000000000000000000000000000000 + maxdigits_oct_nopfx = 22, // len=25: -9223372036854775808 -0o1000000000000000000000 + maxdigits_dec_nopfx = 19, // len=20: -9223372036854775808 -9223372036854775808 + maxdigits_hex_nopfx = 16, // len=19: -9223372036854775808 -0x8000000000000000 + }; + static constexpr csubstr min_value_dec() noexcept { return csubstr("9223372036854775808"); } + static constexpr csubstr min_value_hex() noexcept { return csubstr("8000000000000000"); } + static constexpr csubstr min_value_oct() noexcept { return csubstr("1000000000000000000000"); } + static constexpr csubstr min_value_bin() noexcept { return csubstr("1000000000000000000000000000000000000000000000000000000000000000"); } + static constexpr csubstr max_value_dec() noexcept { return csubstr("9223372036854775807"); } + static constexpr bool is_oct_overflow(csubstr str) noexcept { return !((str.len < 22)); } +}; +template<> struct charconv_digits_<8u, false> +{ + enum : size_t { + maxdigits_bin = 2 + 64, // len=66: 18446744073709551615 0b1111111111111111111111111111111111111111111111111111111111111111 + maxdigits_oct = 2 + 22, // len=24: 18446744073709551615 0o1777777777777777777777 + maxdigits_dec = 20, // len=20: 18446744073709551615 18446744073709551615 + maxdigits_hex = 2 + 16, // len=18: 18446744073709551615 0xffffffffffffffff + maxdigits_bin_nopfx = 64, // len=66: 18446744073709551615 0b1111111111111111111111111111111111111111111111111111111111111111 + maxdigits_oct_nopfx = 22, // len=24: 18446744073709551615 0o1777777777777777777777 + maxdigits_dec_nopfx = 20, // len=20: 
18446744073709551615 18446744073709551615 + maxdigits_hex_nopfx = 16, // len=18: 18446744073709551615 0xffffffffffffffff + }; + static constexpr csubstr max_value_dec() noexcept { return csubstr("18446744073709551615"); } + static constexpr bool is_oct_overflow(csubstr str) noexcept { return !((str.len < 22) || (str.len == 22 && str[0] <= '1')); } +}; +} // namespace detail + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +// Helper macros, undefined below +#define _c4append(c) { if(C4_LIKELY(pos < buf.len)) { buf.str[pos++] = static_cast(c); } else { ++pos; } } +#define _c4appendhex(i) { if(C4_LIKELY(pos < buf.len)) { buf.str[pos++] = hexchars[i]; } else { ++pos; } } + +/** @name digits_dec return the number of digits required to encode a + * decimal number. + * + * @note At first sight this code may look heavily branchy and + * therefore inefficient. However, measurements revealed this to be + * the fastest among the alternatives. + * + * @see https://github.com/biojppm/c4core/pull/77 */ +/** @{ */ + +template +C4_CONSTEXPR14 C4_ALWAYS_INLINE +auto digits_dec(T v) noexcept + -> typename std::enable_if::type +{ + C4_STATIC_ASSERT(std::is_integral::value); + C4_ASSERT(v >= 0); + return ((v >= 100) ? 3u : ((v >= 10) ? 2u : 1u)); +} + +template +C4_CONSTEXPR14 C4_ALWAYS_INLINE +auto digits_dec(T v) noexcept + -> typename std::enable_if::type +{ + C4_STATIC_ASSERT(std::is_integral::value); + C4_ASSERT(v >= 0); + return ((v >= 10000) ? 5u : (v >= 1000) ? 4u : (v >= 100) ? 3u : (v >= 10) ? 2u : 1u); +} + +template +C4_CONSTEXPR14 C4_ALWAYS_INLINE +auto digits_dec(T v) noexcept + -> typename std::enable_if::type +{ + C4_STATIC_ASSERT(std::is_integral::value); + C4_ASSERT(v >= 0); + return ((v >= 1000000000) ? 10u : (v >= 100000000) ? 9u : (v >= 10000000) ? 8u : + (v >= 1000000) ? 7u : (v >= 100000) ? 6u : (v >= 10000) ? 5u : + (v >= 1000) ? 4u : (v >= 100) ? 3u : (v >= 10) ? 2u : 1u); +} + +template +C4_CONSTEXPR14 C4_ALWAYS_INLINE +auto digits_dec(T v) noexcept + -> typename std::enable_if::type +{ + // thanks @fargies!!! + // https://github.com/biojppm/c4core/pull/77#issuecomment-1063753568 + C4_STATIC_ASSERT(std::is_integral::value); + C4_ASSERT(v >= 0); + if(v >= 1000000000) // 10 + { + if(v >= 100000000000000) // 15 [15-20] range + { + if(v >= 100000000000000000) // 18 (15 + (20 - 15) / 2) + { + if((typename std::make_unsigned::type)v >= 10000000000000000000u) // 20 + return 20u; + else + return (v >= 1000000000000000000) ? 19u : 18u; + } + else if(v >= 10000000000000000) // 17 + return 17u; + else + return(v >= 1000000000000000) ? 16u : 15u; + } + else if(v >= 1000000000000) // 13 + return (v >= 10000000000000) ? 14u : 13u; + else if(v >= 100000000000) // 12 + return 12; + else + return(v >= 10000000000) ? 11u : 10u; + } + else if(v >= 10000) // 5 [5-9] range + { + if(v >= 10000000) // 8 + return (v >= 100000000) ? 9u : 8u; + else if(v >= 1000000) // 7 + return 7; + else + return (v >= 100000) ? 6u : 5u; + } + else if(v >= 100) + return (v >= 1000) ? 4u : 3u; + else + return (v >= 10) ? 2u : 1u; +} + +/** @} */ + + +template +C4_CONSTEXPR14 C4_ALWAYS_INLINE unsigned digits_hex(T v) noexcept +{ + C4_STATIC_ASSERT(std::is_integral::value); + C4_ASSERT(v >= 0); + return v ? 
1u + (msb((typename std::make_unsigned::type)v) >> 2u) : 1u; +} + +template +C4_CONSTEXPR14 C4_ALWAYS_INLINE unsigned digits_bin(T v) noexcept +{ + C4_STATIC_ASSERT(std::is_integral::value); + C4_ASSERT(v >= 0); + return v ? 1u + msb((typename std::make_unsigned::type)v) : 1u; +} + +template +C4_CONSTEXPR14 C4_ALWAYS_INLINE unsigned digits_oct(T v_) noexcept +{ + // TODO: is there a better way? + C4_STATIC_ASSERT(std::is_integral::value); + C4_ASSERT(v_ >= 0); + using U = typename + std::conditional::type>::type; + U v = (U) v_; // safe because we require v_ >= 0 + unsigned __n = 1; + const unsigned __b2 = 64u; + const unsigned __b3 = __b2 * 8u; + const unsigned long __b4 = __b3 * 8u; + while(true) + { + if(v < 8u) + return __n; + if(v < __b2) + return __n + 1; + if(v < __b3) + return __n + 2; + if(v < __b4) + return __n + 3; + v /= (U) __b4; + __n += 4; + } +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +namespace detail { +C4_INLINE_CONSTEXPR const char hexchars[] = "0123456789abcdef"; +C4_INLINE_CONSTEXPR const char digits0099[] = + "0001020304050607080910111213141516171819" + "2021222324252627282930313233343536373839" + "4041424344454647484950515253545556575859" + "6061626364656667686970717273747576777879" + "8081828384858687888990919293949596979899"; +} // namespace detail + +C4_SUPPRESS_WARNING_GCC_PUSH +C4_SUPPRESS_WARNING_GCC("-Warray-bounds") // gcc has false positives here +#if (defined(__GNUC__) && (__GNUC__ >= 7)) +C4_SUPPRESS_WARNING_GCC("-Wstringop-overflow") // gcc has false positives here +#endif + +template +C4_HOT C4_ALWAYS_INLINE +void write_dec_unchecked(substr buf, T v, unsigned digits_v) noexcept +{ + C4_STATIC_ASSERT(std::is_integral::value); + C4_ASSERT(v >= 0); + C4_ASSERT(buf.len >= digits_v); + C4_XASSERT(digits_v == digits_dec(v)); + // in bm_xtoa: checkoncelog_singlediv_write2 + while(v >= T(100)) + { + const T quo = v / T(100); + const auto num = (v - quo * T(100)) << 1u; + v = quo; + buf.str[--digits_v] = detail::digits0099[num + 1]; + buf.str[--digits_v] = detail::digits0099[num]; + } + if(v >= T(10)) + { + C4_ASSERT(digits_v == 2); + const auto num = v << 1u; + buf.str[1] = detail::digits0099[num + 1]; + buf.str[0] = detail::digits0099[num]; + } + else + { + C4_ASSERT(digits_v == 1); + buf.str[0] = (char)('0' + v); + } +} + + +template +C4_HOT C4_ALWAYS_INLINE +void write_hex_unchecked(substr buf, T v, unsigned digits_v) noexcept +{ + C4_STATIC_ASSERT(std::is_integral::value); + C4_ASSERT(v >= 0); + C4_ASSERT(buf.len >= digits_v); + C4_XASSERT(digits_v == digits_hex(v)); + do { + buf.str[--digits_v] = detail::hexchars[v & T(15)]; + v >>= 4; + } while(v); + C4_ASSERT(digits_v == 0); +} + + +template +C4_HOT C4_ALWAYS_INLINE +void write_oct_unchecked(substr buf, T v, unsigned digits_v) noexcept +{ + C4_STATIC_ASSERT(std::is_integral::value); + C4_ASSERT(v >= 0); + C4_ASSERT(buf.len >= digits_v); + C4_XASSERT(digits_v == digits_oct(v)); + do { + buf.str[--digits_v] = (char)('0' + (v & T(7))); + v >>= 3; + } while(v); + C4_ASSERT(digits_v == 0); +} + + +template +C4_HOT C4_ALWAYS_INLINE +void write_bin_unchecked(substr buf, T v, unsigned digits_v) noexcept +{ + C4_STATIC_ASSERT(std::is_integral::value); + C4_ASSERT(v >= 0); + C4_ASSERT(buf.len >= digits_v); + C4_XASSERT(digits_v == digits_bin(v)); + do { + buf.str[--digits_v] = (char)('0' + (v & T(1))); + v 
>>= 1; + } while(v); + C4_ASSERT(digits_v == 0); +} + + +/** write an integer to a string in decimal format. This is the + * lowest level (and the fastest) function to do this task. + * @note does not accept negative numbers + * @note the resulting string is NOT zero-terminated. + * @note it is ok to call this with an empty or too-small buffer; + * no writes will occur, and the required size will be returned + * @return the number of characters required for the buffer. */ +template +C4_ALWAYS_INLINE size_t write_dec(substr buf, T v) noexcept +{ + C4_STATIC_ASSERT(std::is_integral::value); + C4_ASSERT(v >= 0); + unsigned digits = digits_dec(v); + if(C4_LIKELY(buf.len >= digits)) + write_dec_unchecked(buf, v, digits); + return digits; +} + +/** write an integer to a string in hexadecimal format. This is the + * lowest level (and the fastest) function to do this task. + * @note does not accept negative numbers + * @note does not prefix with 0x + * @note the resulting string is NOT zero-terminated. + * @note it is ok to call this with an empty or too-small buffer; + * no writes will occur, and the required size will be returned + * @return the number of characters required for the buffer. */ +template +C4_ALWAYS_INLINE size_t write_hex(substr buf, T v) noexcept +{ + C4_STATIC_ASSERT(std::is_integral::value); + C4_ASSERT(v >= 0); + unsigned digits = digits_hex(v); + if(C4_LIKELY(buf.len >= digits)) + write_hex_unchecked(buf, v, digits); + return digits; +} + +/** write an integer to a string in octal format. This is the + * lowest level (and the fastest) function to do this task. + * @note does not accept negative numbers + * @note does not prefix with 0o + * @note the resulting string is NOT zero-terminated. + * @note it is ok to call this with an empty or too-small buffer; + * no writes will occur, and the required size will be returned + * @return the number of characters required for the buffer. */ +template +C4_ALWAYS_INLINE size_t write_oct(substr buf, T v) noexcept +{ + C4_STATIC_ASSERT(std::is_integral::value); + C4_ASSERT(v >= 0); + unsigned digits = digits_oct(v); + if(C4_LIKELY(buf.len >= digits)) + write_oct_unchecked(buf, v, digits); + return digits; +} + +/** write an integer to a string in binary format. This is the + * lowest level (and the fastest) function to do this task. + * @note does not accept negative numbers + * @note does not prefix with 0b + * @note the resulting string is NOT zero-terminated. + * @note it is ok to call this with an empty or too-small buffer; + * no writes will occur, and the required size will be returned + * @return the number of characters required for the buffer. 
*/ +template +C4_ALWAYS_INLINE size_t write_bin(substr buf, T v) noexcept +{ + C4_STATIC_ASSERT(std::is_integral::value); + C4_ASSERT(v >= 0); + unsigned digits = digits_bin(v); + C4_ASSERT(digits > 0); + if(C4_LIKELY(buf.len >= digits)) + write_bin_unchecked(buf, v, digits); + return digits; +} + + +namespace detail { +template using NumberWriter = size_t (*)(substr, U); +template writer> +size_t write_num_digits(substr buf, T v, size_t num_digits) noexcept +{ + C4_STATIC_ASSERT(std::is_integral::value); + size_t ret = writer(buf, v); + if(ret >= num_digits) + return ret; + else if(ret >= buf.len || num_digits > buf.len) + return num_digits; + C4_ASSERT(num_digits >= ret); + size_t delta = static_cast(num_digits - ret); + memmove(buf.str + delta, buf.str, ret); + memset(buf.str, '0', delta); + return num_digits; +} +} // namespace detail + + +/** same as c4::write_dec(), but pad with zeroes on the left + * such that the resulting string is @p num_digits wide. + * If the given number is requires more than num_digits, then the number prevails. */ +template +C4_ALWAYS_INLINE size_t write_dec(substr buf, T val, size_t num_digits) noexcept +{ + return detail::write_num_digits>(buf, val, num_digits); +} + +/** same as c4::write_hex(), but pad with zeroes on the left + * such that the resulting string is @p num_digits wide. + * If the given number is requires more than num_digits, then the number prevails. */ +template +C4_ALWAYS_INLINE size_t write_hex(substr buf, T val, size_t num_digits) noexcept +{ + return detail::write_num_digits>(buf, val, num_digits); +} + +/** same as c4::write_bin(), but pad with zeroes on the left + * such that the resulting string is @p num_digits wide. + * If the given number is requires more than num_digits, then the number prevails. */ +template +C4_ALWAYS_INLINE size_t write_bin(substr buf, T val, size_t num_digits) noexcept +{ + return detail::write_num_digits>(buf, val, num_digits); +} + +/** same as c4::write_oct(), but pad with zeroes on the left + * such that the resulting string is @p num_digits wide. + * If the given number is requires more than num_digits, then the number prevails. */ +template +C4_ALWAYS_INLINE size_t write_oct(substr buf, T val, size_t num_digits) noexcept +{ + return detail::write_num_digits>(buf, val, num_digits); +} + +C4_SUPPRESS_WARNING_GCC_POP + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** read a decimal integer from a string. This is the + * lowest level (and the fastest) function to do this task. + * @note does not accept negative numbers + * @note The string must be trimmed. Whitespace is not accepted. + * @note the string must not be empty + * @note there is no check for overflow; the value wraps around + * in a way similar to the standard C/C++ overflow behavior. + * For example, `read_dec("128", &val)` returns true + * and val will be set to 0 because 127 is the max i8 value. 
+ * @see overflows() to find out if a number string overflows a type range + * @return true if the conversion was successful (no overflow check) */ +template +C4_ALWAYS_INLINE bool read_dec(csubstr s, I *C4_RESTRICT v) noexcept +{ + C4_STATIC_ASSERT(std::is_integral::value); + C4_ASSERT(!s.empty()); + *v = 0; + for(char c : s) + { + if(C4_UNLIKELY(c < '0' || c > '9')) + return false; + *v = (*v) * I(10) + (I(c) - I('0')); + } + return true; +} + +/** read an hexadecimal integer from a string. This is the + * lowest level (and the fastest) function to do this task. + * @note does not accept negative numbers + * @note does not accept leading 0x or 0X + * @note the string must not be empty + * @note the string must be trimmed. Whitespace is not accepted. + * @note there is no check for overflow; the value wraps around + * in a way similar to the standard C/C++ overflow behavior. + * For example, `read_hex("80", &val)` returns true + * and val will be set to 0 because 7f is the max i8 value. + * @see overflows() to find out if a number string overflows a type range + * @return true if the conversion was successful (no overflow check) */ +template +C4_ALWAYS_INLINE bool read_hex(csubstr s, I *C4_RESTRICT v) noexcept +{ + C4_STATIC_ASSERT(std::is_integral::value); + C4_ASSERT(!s.empty()); + *v = 0; + for(char c : s) + { + I cv; + if(c >= '0' && c <= '9') + cv = I(c) - I('0'); + else if(c >= 'a' && c <= 'f') + cv = I(10) + (I(c) - I('a')); + else if(c >= 'A' && c <= 'F') + cv = I(10) + (I(c) - I('A')); + else + return false; + *v = (*v) * I(16) + cv; + } + return true; +} + +/** read a binary integer from a string. This is the + * lowest level (and the fastest) function to do this task. + * @note does not accept negative numbers + * @note does not accept leading 0b or 0B + * @note the string must not be empty + * @note the string must be trimmed. Whitespace is not accepted. + * @note there is no check for overflow; the value wraps around + * in a way similar to the standard C/C++ overflow behavior. + * For example, `read_bin("10000000", &val)` returns true + * and val will be set to 0 because 1111111 is the max i8 value. + * @see overflows() to find out if a number string overflows a type range + * @return true if the conversion was successful (no overflow check) */ +template +C4_ALWAYS_INLINE bool read_bin(csubstr s, I *C4_RESTRICT v) noexcept +{ + C4_STATIC_ASSERT(std::is_integral::value); + C4_ASSERT(!s.empty()); + *v = 0; + for(char c : s) + { + *v <<= 1; + if(c == '1') + *v |= 1; + else if(c != '0') + return false; + } + return true; +} + +/** read an octal integer from a string. This is the + * lowest level (and the fastest) function to do this task. + * @note does not accept negative numbers + * @note does not accept leading 0o or 0O + * @note the string must not be empty + * @note the string must be trimmed. Whitespace is not accepted. + * @note there is no check for overflow; the value wraps around + * in a way similar to the standard C/C++ overflow behavior. + * For example, `read_oct("200", &val)` returns true + * and val will be set to 0 because 177 is the max i8 value. 
+ * @see overflows() to find out if a number string overflows a type range + * @return true if the conversion was successful (no overflow check) */ +template +C4_ALWAYS_INLINE bool read_oct(csubstr s, I *C4_RESTRICT v) noexcept +{ + C4_STATIC_ASSERT(std::is_integral::value); + C4_ASSERT(!s.empty()); + *v = 0; + for(char c : s) + { + if(C4_UNLIKELY(c < '0' || c > '7')) + return false; + *v = (*v) * I(8) + (I(c) - I('0')); + } + return true; +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +namespace detail { +inline size_t _itoa2buf(substr buf, size_t pos, csubstr val) noexcept +{ + C4_ASSERT(pos + val.len <= buf.len); + memcpy(buf.str + pos, val.str, val.len); + return pos + val.len; +} +inline size_t _itoa2bufwithdigits(substr buf, size_t pos, size_t num_digits, csubstr val) noexcept +{ + num_digits = num_digits > val.len ? num_digits - val.len : 0; + C4_ASSERT(num_digits + val.len <= buf.len); + for(size_t i = 0; i < num_digits; ++i) + _c4append('0'); + return detail::_itoa2buf(buf, pos, val); +} +template +C4_NO_INLINE size_t _itoadec2buf(substr buf) noexcept +{ + using digits_type = detail::charconv_digits; + if(C4_UNLIKELY(buf.len < digits_type::maxdigits_dec)) + return digits_type::maxdigits_dec; + buf.str[0] = '-'; + return detail::_itoa2buf(buf, 1, digits_type::min_value_dec()); +} +template +C4_NO_INLINE size_t _itoa2buf(substr buf, I radix) noexcept +{ + using digits_type = detail::charconv_digits; + size_t pos = 0; + if(C4_LIKELY(buf.len > 0)) + buf.str[pos++] = '-'; + switch(radix) + { + case I(10): + if(C4_UNLIKELY(buf.len < digits_type::maxdigits_dec)) + return digits_type::maxdigits_dec; + pos =_itoa2buf(buf, pos, digits_type::min_value_dec()); + break; + case I(16): + if(C4_UNLIKELY(buf.len < digits_type::maxdigits_hex)) + return digits_type::maxdigits_hex; + buf.str[pos++] = '0'; + buf.str[pos++] = 'x'; + pos = _itoa2buf(buf, pos, digits_type::min_value_hex()); + break; + case I( 2): + if(C4_UNLIKELY(buf.len < digits_type::maxdigits_bin)) + return digits_type::maxdigits_bin; + buf.str[pos++] = '0'; + buf.str[pos++] = 'b'; + pos = _itoa2buf(buf, pos, digits_type::min_value_bin()); + break; + case I( 8): + if(C4_UNLIKELY(buf.len < digits_type::maxdigits_oct)) + return digits_type::maxdigits_oct; + buf.str[pos++] = '0'; + buf.str[pos++] = 'o'; + pos = _itoa2buf(buf, pos, digits_type::min_value_oct()); + break; + } + return pos; +} +template +C4_NO_INLINE size_t _itoa2buf(substr buf, I radix, size_t num_digits) noexcept +{ + using digits_type = detail::charconv_digits; + size_t pos = 0; + size_t needed_digits = 0; + if(C4_LIKELY(buf.len > 0)) + buf.str[pos++] = '-'; + switch(radix) + { + case I(10): + // add 1 to account for - + needed_digits = num_digits+1 > digits_type::maxdigits_dec ? num_digits+1 : digits_type::maxdigits_dec; + if(C4_UNLIKELY(buf.len < needed_digits)) + return needed_digits; + pos = _itoa2bufwithdigits(buf, pos, num_digits, digits_type::min_value_dec()); + break; + case I(16): + // add 3 to account for -0x + needed_digits = num_digits+3 > digits_type::maxdigits_hex ? 
num_digits+3 : digits_type::maxdigits_hex; + if(C4_UNLIKELY(buf.len < needed_digits)) + return needed_digits; + buf.str[pos++] = '0'; + buf.str[pos++] = 'x'; + pos = _itoa2bufwithdigits(buf, pos, num_digits, digits_type::min_value_hex()); + break; + case I( 2): + // add 3 to account for -0b + needed_digits = num_digits+3 > digits_type::maxdigits_bin ? num_digits+3 : digits_type::maxdigits_bin; + if(C4_UNLIKELY(buf.len < needed_digits)) + return needed_digits; + C4_ASSERT(buf.len >= digits_type::maxdigits_bin); + buf.str[pos++] = '0'; + buf.str[pos++] = 'b'; + pos = _itoa2bufwithdigits(buf, pos, num_digits, digits_type::min_value_bin()); + break; + case I( 8): + // add 3 to account for -0o + needed_digits = num_digits+3 > digits_type::maxdigits_oct ? num_digits+3 : digits_type::maxdigits_oct; + if(C4_UNLIKELY(buf.len < needed_digits)) + return needed_digits; + C4_ASSERT(buf.len >= digits_type::maxdigits_oct); + buf.str[pos++] = '0'; + buf.str[pos++] = 'o'; + pos = _itoa2bufwithdigits(buf, pos, num_digits, digits_type::min_value_oct()); + break; + } + return pos; +} +} // namespace detail + + +/** convert an integral signed decimal to a string. + * @note the resulting string is NOT zero-terminated. + * @note it is ok to call this with an empty or too-small buffer; + * no writes will occur, and the needed size will be returned + * @return the number of characters required for the buffer. */ +template +C4_ALWAYS_INLINE size_t itoa(substr buf, T v) noexcept +{ + C4_STATIC_ASSERT(std::is_signed::value); + if(v >= T(0)) + { + // write_dec() checks the buffer size, so no need to check here + return write_dec(buf, v); + } + // when T is the min value (eg i8: -128), negating it + // will overflow, so treat the min as a special case + else if(C4_LIKELY(v != std::numeric_limits::min())) + { + v = -v; + unsigned digits = digits_dec(v); + if(C4_LIKELY(buf.len >= digits + 1u)) + { + buf.str[0] = '-'; + write_dec_unchecked(buf.sub(1), v, digits); + } + return digits + 1u; + } + return detail::_itoadec2buf(buf); +} + +/** convert an integral signed integer to a string, using a specific + * radix. The radix must be 2, 8, 10 or 16. + * + * @note the resulting string is NOT zero-terminated. + * @note it is ok to call this with an empty or too-small buffer; + * no writes will occur, and the needed size will be returned + * @return the number of characters required for the buffer. 
*/ +template +C4_ALWAYS_INLINE size_t itoa(substr buf, T v, T radix) noexcept +{ + C4_STATIC_ASSERT(std::is_signed::value); + C4_ASSERT(radix == 2 || radix == 8 || radix == 10 || radix == 16); + C4_SUPPRESS_WARNING_GCC_PUSH + #if (defined(__GNUC__) && (__GNUC__ >= 7)) + C4_SUPPRESS_WARNING_GCC("-Wstringop-overflow") // gcc has a false positive here + #endif + // when T is the min value (eg i8: -128), negating it + // will overflow, so treat the min as a special case + if(C4_LIKELY(v != std::numeric_limits::min())) + { + unsigned pos = 0; + if(v < 0) + { + v = -v; + if(C4_LIKELY(buf.len > 0)) + buf.str[pos] = '-'; + ++pos; + } + unsigned digits = 0; + switch(radix) + { + case T(10): + digits = digits_dec(v); + if(C4_LIKELY(buf.len >= pos + digits)) + write_dec_unchecked(buf.sub(pos), v, digits); + break; + case T(16): + digits = digits_hex(v); + if(C4_LIKELY(buf.len >= pos + 2u + digits)) + { + buf.str[pos + 0] = '0'; + buf.str[pos + 1] = 'x'; + write_hex_unchecked(buf.sub(pos + 2), v, digits); + } + digits += 2u; + break; + case T(2): + digits = digits_bin(v); + if(C4_LIKELY(buf.len >= pos + 2u + digits)) + { + buf.str[pos + 0] = '0'; + buf.str[pos + 1] = 'b'; + write_bin_unchecked(buf.sub(pos + 2), v, digits); + } + digits += 2u; + break; + case T(8): + digits = digits_oct(v); + if(C4_LIKELY(buf.len >= pos + 2u + digits)) + { + buf.str[pos + 0] = '0'; + buf.str[pos + 1] = 'o'; + write_oct_unchecked(buf.sub(pos + 2), v, digits); + } + digits += 2u; + break; + } + return pos + digits; + } + C4_SUPPRESS_WARNING_GCC_POP + // when T is the min value (eg i8: -128), negating it + // will overflow + return detail::_itoa2buf(buf, radix); +} + + +/** same as c4::itoa(), but pad with zeroes on the left such that the + * resulting string is @p num_digits wide, not accounting for radix + * prefix (0x,0o,0b). The @p radix must be 2, 8, 10 or 16. + * + * @note the resulting string is NOT zero-terminated. + * @note it is ok to call this with an empty or too-small buffer; + * no writes will occur, and the needed size will be returned + * @return the number of characters required for the buffer. */ +template +C4_ALWAYS_INLINE size_t itoa(substr buf, T v, T radix, size_t num_digits) noexcept +{ + C4_STATIC_ASSERT(std::is_signed::value); + C4_ASSERT(radix == 2 || radix == 8 || radix == 10 || radix == 16); + C4_SUPPRESS_WARNING_GCC_PUSH + #if (defined(__GNUC__) && (__GNUC__ >= 7)) + C4_SUPPRESS_WARNING_GCC("-Wstringop-overflow") // gcc has a false positive here + #endif + // when T is the min value (eg i8: -128), negating it + // will overflow, so treat the min as a special case + if(C4_LIKELY(v != std::numeric_limits::min())) + { + unsigned pos = 0; + if(v < 0) + { + v = -v; + if(C4_LIKELY(buf.len > 0)) + buf.str[pos] = '-'; + ++pos; + } + unsigned total_digits = 0; + switch(radix) + { + case T(10): + total_digits = digits_dec(v); + total_digits = pos + (unsigned)(num_digits > total_digits ? num_digits : total_digits); + if(C4_LIKELY(buf.len >= total_digits)) + write_dec(buf.sub(pos), v, num_digits); + break; + case T(16): + total_digits = digits_hex(v); + total_digits = pos + 2u + (unsigned)(num_digits > total_digits ? num_digits : total_digits); + if(C4_LIKELY(buf.len >= total_digits)) + { + buf.str[pos + 0] = '0'; + buf.str[pos + 1] = 'x'; + write_hex(buf.sub(pos + 2), v, num_digits); + } + break; + case T(2): + total_digits = digits_bin(v); + total_digits = pos + 2u + (unsigned)(num_digits > total_digits ? 
num_digits : total_digits); + if(C4_LIKELY(buf.len >= total_digits)) + { + buf.str[pos + 0] = '0'; + buf.str[pos + 1] = 'b'; + write_bin(buf.sub(pos + 2), v, num_digits); + } + break; + case T(8): + total_digits = digits_oct(v); + total_digits = pos + 2u + (unsigned)(num_digits > total_digits ? num_digits : total_digits); + if(C4_LIKELY(buf.len >= total_digits)) + { + buf.str[pos + 0] = '0'; + buf.str[pos + 1] = 'o'; + write_oct(buf.sub(pos + 2), v, num_digits); + } + break; + } + return total_digits; + } + C4_SUPPRESS_WARNING_GCC_POP + // when T is the min value (eg i8: -128), negating it + // will overflow + return detail::_itoa2buf(buf, radix, num_digits); +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** convert an integral unsigned decimal to a string. + * + * @note the resulting string is NOT zero-terminated. + * @note it is ok to call this with an empty or too-small buffer; + * no writes will occur, and the needed size will be returned + * @return the number of characters required for the buffer. */ +template +C4_ALWAYS_INLINE size_t utoa(substr buf, T v) noexcept +{ + C4_STATIC_ASSERT(std::is_unsigned::value); + // write_dec() does the buffer length check, so no need to check here + return write_dec(buf, v); +} + +/** convert an integral unsigned integer to a string, using a specific + * radix. The radix must be 2, 8, 10 or 16. + * + * @note the resulting string is NOT zero-terminated. + * @note it is ok to call this with an empty or too-small buffer; + * no writes will occur, and the needed size will be returned + * @return the number of characters required for the buffer. */ +template +C4_ALWAYS_INLINE size_t utoa(substr buf, T v, T radix) noexcept +{ + C4_STATIC_ASSERT(std::is_unsigned::value); + C4_ASSERT(radix == 10 || radix == 16 || radix == 2 || radix == 8); + unsigned digits = 0; + switch(radix) + { + case T(10): + digits = digits_dec(v); + if(C4_LIKELY(buf.len >= digits)) + write_dec_unchecked(buf, v, digits); + break; + case T(16): + digits = digits_hex(v); + if(C4_LIKELY(buf.len >= digits+2u)) + { + buf.str[0] = '0'; + buf.str[1] = 'x'; + write_hex_unchecked(buf.sub(2), v, digits); + } + digits += 2u; + break; + case T(2): + digits = digits_bin(v); + if(C4_LIKELY(buf.len >= digits+2u)) + { + buf.str[0] = '0'; + buf.str[1] = 'b'; + write_bin_unchecked(buf.sub(2), v, digits); + } + digits += 2u; + break; + case T(8): + digits = digits_oct(v); + if(C4_LIKELY(buf.len >= digits+2u)) + { + buf.str[0] = '0'; + buf.str[1] = 'o'; + write_oct_unchecked(buf.sub(2), v, digits); + } + digits += 2u; + break; + } + return digits; +} + +/** same as c4::utoa(), but pad with zeroes on the left such that the + * resulting string is @p num_digits wide. The @p radix must be 2, + * 8, 10 or 16. + * + * @note the resulting string is NOT zero-terminated. + * @note it is ok to call this with an empty or too-small buffer; + * no writes will occur, and the needed size will be returned + * @return the number of characters required for the buffer. 
*/ +template +C4_ALWAYS_INLINE size_t utoa(substr buf, T v, T radix, size_t num_digits) noexcept +{ + C4_STATIC_ASSERT(std::is_unsigned::value); + C4_ASSERT(radix == 10 || radix == 16 || radix == 2 || radix == 8); + unsigned total_digits = 0; + switch(radix) + { + case T(10): + total_digits = digits_dec(v); + total_digits = (unsigned)(num_digits > total_digits ? num_digits : total_digits); + if(C4_LIKELY(buf.len >= total_digits)) + write_dec(buf, v, num_digits); + break; + case T(16): + total_digits = digits_hex(v); + total_digits = 2u + (unsigned)(num_digits > total_digits ? num_digits : total_digits); + if(C4_LIKELY(buf.len >= total_digits)) + { + buf.str[0] = '0'; + buf.str[1] = 'x'; + write_hex(buf.sub(2), v, num_digits); + } + break; + case T(2): + total_digits = digits_bin(v); + total_digits = 2u + (unsigned)(num_digits > total_digits ? num_digits : total_digits); + if(C4_LIKELY(buf.len >= total_digits)) + { + buf.str[0] = '0'; + buf.str[1] = 'b'; + write_bin(buf.sub(2), v, num_digits); + } + break; + case T(8): + total_digits = digits_oct(v); + total_digits = 2u + (unsigned)(num_digits > total_digits ? num_digits : total_digits); + if(C4_LIKELY(buf.len >= total_digits)) + { + buf.str[0] = '0'; + buf.str[1] = 'o'; + write_oct(buf.sub(2), v, num_digits); + } + break; + } + return total_digits; +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** Convert a trimmed string to a signed integral value. The input + * string can be formatted as decimal, binary (prefix 0b or 0B), octal + * (prefix 0o or 0O) or hexadecimal (prefix 0x or 0X). Strings with + * leading zeroes are considered as decimal and not octal (unlike the + * C/C++ convention). Every character in the input string is read for + * the conversion; the input string must not contain any leading or + * trailing whitespace. + * + * @return true if the conversion was successful. + * + * @note overflow is not detected: the return status is true even if + * the conversion would return a value outside of the type's range, in + * which case the result will wrap around the type's range. + * This is similar to native behavior. + * + * @note a positive sign is not accepted. ie, the string must not + * start with '+' + * + * @see atoi_first() if the string is not trimmed to the value to read. */ +template +C4_ALWAYS_INLINE bool atoi(csubstr str, T * C4_RESTRICT v) noexcept +{ + C4_STATIC_ASSERT(std::is_integral::value); + C4_STATIC_ASSERT(std::is_signed::value); + + if(C4_UNLIKELY(str.len == 0)) + return false; + + C4_ASSERT(str.str[0] != '+'); + + T sign = 1; + size_t start = 0; + if(str.str[0] == '-') + { + if(C4_UNLIKELY(str.len == ++start)) + return false; + sign = -1; + } + + bool parsed_ok = true; + if(str.str[start] != '0') // this should be the common case, so put it first + { + parsed_ok = read_dec(str.sub(start), v); + } + else if(str.len > start + 1) + { + // starts with 0: is it 0x, 0o, 0b? 
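+ // (the character right after the leading 0 selects the radix; any other character falls back to decimal, so leading zeroes do not imply octal)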
+ const char pfx = str.str[start + 1]; + if(pfx == 'x' || pfx == 'X') + parsed_ok = str.len > start + 2 && read_hex(str.sub(start + 2), v); + else if(pfx == 'b' || pfx == 'B') + parsed_ok = str.len > start + 2 && read_bin(str.sub(start + 2), v); + else if(pfx == 'o' || pfx == 'O') + parsed_ok = str.len > start + 2 && read_oct(str.sub(start + 2), v); + else + parsed_ok = read_dec(str.sub(start + 1), v); + } + else + { + parsed_ok = read_dec(str.sub(start), v); + } + if(C4_LIKELY(parsed_ok)) + *v *= sign; + return parsed_ok; +} + + +/** Select the next range of characters in the string that can be parsed + * as a signed integral value, and convert it using atoi(). Leading + * whitespace (space, newline, tabs) is skipped. + * @return the number of characters read for conversion, or csubstr::npos if the conversion failed + * @see atoi() if the string is already trimmed to the value to read. + * @see csubstr::first_int_span() */ +template<class T> +C4_ALWAYS_INLINE size_t atoi_first(csubstr str, T * C4_RESTRICT v) +{ + csubstr trimmed = str.first_int_span(); + if(trimmed.len == 0) + return csubstr::npos; + if(atoi(trimmed, v)) + return static_cast<size_t>(trimmed.end() - str.begin()); + return csubstr::npos; +} + + +//----------------------------------------------------------------------------- + +/** Convert a trimmed string to an unsigned integral value. The string can be + * formatted as decimal, binary (prefix 0b or 0B), octal (prefix 0o or 0O) + * or hexadecimal (prefix 0x or 0X). Every character in the input string is read + * for the conversion; it must not contain any leading or trailing whitespace. + * + * @return true if the conversion was successful. + * + * @note overflow is not detected: the return status is true even if + * the conversion would return a value outside of the type's range, in + * which case the result will wrap around the type's range. + * + * @note If the string has a minus character, the return status + * will be false. + * + * @see atou_first() if the string is not trimmed to the value to read. */ +template<class T> +bool atou(csubstr str, T * C4_RESTRICT v) noexcept +{ + C4_STATIC_ASSERT(std::is_integral<T>::value); + + if(C4_UNLIKELY(str.len == 0 || str.front() == '-')) + return false; + + bool parsed_ok = true; + if(str.str[0] != '0') + { + parsed_ok = read_dec(str, v); + } + else + { + if(str.len > 1) + { + const char pfx = str.str[1]; + if(pfx == 'x' || pfx == 'X') + parsed_ok = str.len > 2 && read_hex(str.sub(2), v); + else if(pfx == 'b' || pfx == 'B') + parsed_ok = str.len > 2 && read_bin(str.sub(2), v); + else if(pfx == 'o' || pfx == 'O') + parsed_ok = str.len > 2 && read_oct(str.sub(2), v); + else + parsed_ok = read_dec(str, v); + } + else + { + *v = 0; // we know the first character is 0 + } + } + return parsed_ok; +} + + +/** Select the next range of characters in the string that can be parsed + * as an unsigned integral value, and convert it using atou(). Leading + * whitespace (space, newline, tabs) is skipped. + * @return the number of characters read for conversion, or csubstr::npos if the conversion failed + * @see atou() if the string is already trimmed to the value to read.
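+ * @note the returned count is measured from the start of the input, so it includes any leading whitespace that was skipped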
+ * @see csubstr::first_uint_span() */ +template +C4_ALWAYS_INLINE size_t atou_first(csubstr str, T *v) +{ + csubstr trimmed = str.first_uint_span(); + if(trimmed.len == 0) + return csubstr::npos; + if(atou(trimmed, v)) + return static_cast(trimmed.end() - str.begin()); + return csubstr::npos; +} + + +#ifdef _MSC_VER +# pragma warning(pop) +#elif defined(__clang__) +# pragma clang diagnostic pop +#elif defined(__GNUC__) +# pragma GCC diagnostic pop +#endif + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +namespace detail { +inline bool check_overflow(csubstr str, csubstr limit) noexcept +{ + if(str.len == limit.len) + { + for(size_t i = 0; i < limit.len; ++i) + { + if(str[i] < limit[i]) + return false; + else if(str[i] > limit[i]) + return true; + } + return false; + } + else + return str.len > limit.len; +} +} // namespace detail + + +/** Test if the following string would overflow when converted to associated + * types. + * @return true if number will overflow, false if it fits (or doesn't parse) + */ +template +auto overflows(csubstr str) noexcept + -> typename std::enable_if::value, bool>::type +{ + C4_STATIC_ASSERT(std::is_integral::value); + + if(C4_UNLIKELY(str.len == 0)) + { + return false; + } + else if(str.str[0] == '0') + { + if (str.len == 1) + return false; + switch (str.str[1]) + { + case 'x': + case 'X': + { + size_t fno = str.first_not_of('0', 2); + if (fno == csubstr::npos) + return false; + return !(str.len <= fno + (sizeof(T) * 2)); + } + case 'b': + case 'B': + { + size_t fno = str.first_not_of('0', 2); + if (fno == csubstr::npos) + return false; + return !(str.len <= fno +(sizeof(T) * 8)); + } + case 'o': + case 'O': + { + size_t fno = str.first_not_of('0', 2); + if(fno == csubstr::npos) + return false; + return detail::charconv_digits::is_oct_overflow(str.sub(fno)); + } + default: + { + size_t fno = str.first_not_of('0', 1); + if(fno == csubstr::npos) + return false; + return detail::check_overflow(str.sub(fno), detail::charconv_digits::max_value_dec()); + } + } + } + else if(C4_UNLIKELY(str[0] == '-')) + { + return true; + } + else + { + return detail::check_overflow(str, detail::charconv_digits::max_value_dec()); + } +} + + +/** Test if the following string would overflow when converted to associated + * types. 
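+ * (illustrative, by the logic below: overflows<int8_t>("128") would be true, while overflows<int8_t>("-128") would be false, since -128 still fits in int8_t)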
+ * @return true if number will overflow, false if it fits (or doesn't parse) + */ +template +auto overflows(csubstr str) + -> typename std::enable_if::value, bool>::type +{ + C4_STATIC_ASSERT(std::is_integral::value); + if(C4_UNLIKELY(str.len == 0)) + return false; + if(str.str[0] == '-') + { + if(str.str[1] == '0') + { + if(str.len == 2) + return false; + switch(str.str[2]) + { + case 'x': + case 'X': + { + size_t fno = str.first_not_of('0', 3); + if (fno == csubstr::npos) + return false; + return detail::check_overflow(str.sub(fno), detail::charconv_digits::min_value_hex()); + } + case 'b': + case 'B': + { + size_t fno = str.first_not_of('0', 3); + if (fno == csubstr::npos) + return false; + return detail::check_overflow(str.sub(fno), detail::charconv_digits::min_value_bin()); + } + case 'o': + case 'O': + { + size_t fno = str.first_not_of('0', 3); + if(fno == csubstr::npos) + return false; + return detail::check_overflow(str.sub(fno), detail::charconv_digits::min_value_oct()); + } + default: + { + size_t fno = str.first_not_of('0', 2); + if(fno == csubstr::npos) + return false; + return detail::check_overflow(str.sub(fno), detail::charconv_digits::min_value_dec()); + } + } + } + else + return detail::check_overflow(str.sub(1), detail::charconv_digits::min_value_dec()); + } + else if(str.str[0] == '0') + { + if (str.len == 1) + return false; + switch(str.str[1]) + { + case 'x': + case 'X': + { + size_t fno = str.first_not_of('0', 2); + if (fno == csubstr::npos) + return false; + const size_t len = str.len - fno; + return !((len < sizeof (T) * 2) || (len == sizeof(T) * 2 && str[fno] <= '7')); + } + case 'b': + case 'B': + { + size_t fno = str.first_not_of('0', 2); + if (fno == csubstr::npos) + return false; + return !(str.len <= fno + (sizeof(T) * 8 - 1)); + } + case 'o': + case 'O': + { + size_t fno = str.first_not_of('0', 2); + if(fno == csubstr::npos) + return false; + return detail::charconv_digits::is_oct_overflow(str.sub(fno)); + } + default: + { + size_t fno = str.first_not_of('0', 1); + if(fno == csubstr::npos) + return false; + return detail::check_overflow(str.sub(fno), detail::charconv_digits::max_value_dec()); + } + } + } + else + return detail::check_overflow(str, detail::charconv_digits::max_value_dec()); +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +namespace detail { + + +#if (!C4CORE_HAVE_STD_FROMCHARS) +/** @see http://www.exploringbinary.com/ for many good examples on float-str conversion */ +template +void get_real_format_str(char (& C4_RESTRICT fmt)[N], int precision, RealFormat_e formatting, const char* length_modifier="") +{ + int iret; + if(precision == -1) + iret = snprintf(fmt, sizeof(fmt), "%%%s%c", length_modifier, formatting); + else if(precision == 0) + iret = snprintf(fmt, sizeof(fmt), "%%.%s%c", length_modifier, formatting); + else + iret = snprintf(fmt, sizeof(fmt), "%%.%d%s%c", precision, length_modifier, formatting); + C4_ASSERT(iret >= 2 && size_t(iret) < sizeof(fmt)); + C4_UNUSED(iret); +} + + +/** @todo we're depending on snprintf()/sscanf() for converting to/from + * floating point numbers. Apparently, this increases the binary size + * by a considerable amount. 
There are some lightweight printf + * implementations: + * + * @see http://www.sparetimelabs.com/tinyprintf/tinyprintf.php (BSD) + * @see https://github.com/weiss/c99-snprintf + * @see https://github.com/nothings/stb/blob/master/stb_sprintf.h + * @see http://www.exploringbinary.com/ + * @see https://blog.benoitblanchon.fr/lightweight-float-to-string/ + * @see http://www.ryanjuckett.com/programming/printing-floating-point-numbers/ + */ +template +size_t print_one(substr str, const char* full_fmt, T v) +{ +#ifdef _MSC_VER + /** use _snprintf() to prevent early termination of the output + * for writing the null character at the last position + * @see https://msdn.microsoft.com/en-us/library/2ts7cx93.aspx */ + int iret = _snprintf(str.str, str.len, full_fmt, v); + if(iret < 0) + { + /* when buf.len is not enough, VS returns a negative value. + * so call it again with a negative value for getting an + * actual length of the string */ + iret = snprintf(nullptr, 0, full_fmt, v); + C4_ASSERT(iret > 0); + } + size_t ret = (size_t) iret; + return ret; +#else + int iret = snprintf(str.str, str.len, full_fmt, v); + C4_ASSERT(iret >= 0); + size_t ret = (size_t) iret; + if(ret >= str.len) + ++ret; /* snprintf() reserves the last character to write \0 */ + return ret; +#endif +} +#endif // (!C4CORE_HAVE_STD_FROMCHARS) + + +#if (!C4CORE_HAVE_STD_FROMCHARS) && (!C4CORE_HAVE_FAST_FLOAT) +/** scans a string using the given type format, while at the same time + * allowing non-null-terminated strings AND guaranteeing that the given + * string length is strictly respected, so that no buffer overflows + * might occur. */ +template +inline size_t scan_one(csubstr str, const char *type_fmt, T *v) +{ + /* snscanf() is absolutely needed here as we must be sure that + * str.len is strictly respected, because substr is + * generally not null-terminated. + * + * Alas, there is no snscanf(). + * + * So we fake it by using a dynamic format with an explicit + * field size set to the length of the given span. + * This trick is taken from: + * https://stackoverflow.com/a/18368910/5875572 */ + + /* this is the actual format we'll use for scanning */ + char fmt[16]; + + /* write the length into it. Eg "%12f". + * Also, get the number of characters read from the string. + * So the final format ends up as "%12f%n"*/ + int iret = std::snprintf(fmt, sizeof(fmt), "%%" "%zu" "%s" "%%n", str.len, type_fmt); + /* no nasty surprises, please! */ + C4_ASSERT(iret >= 0 && size_t(iret) < C4_COUNTOF(fmt)); + + /* now we scan with confidence that the span length is respected */ + int num_chars; + iret = std::sscanf(str.str, fmt, v, &num_chars); + /* scanf returns the number of successful conversions */ + if(iret != 1) return csubstr::npos; + C4_ASSERT(num_chars >= 0); + return (size_t)(num_chars); +} +#endif // (!C4CORE_HAVE_STD_FROMCHARS) && (!C4CORE_HAVE_FAST_FLOAT) + + +#if C4CORE_HAVE_STD_TOCHARS +template +C4_ALWAYS_INLINE size_t rtoa(substr buf, T v, int precision=-1, RealFormat_e formatting=FTOA_FLEX) noexcept +{ + std::to_chars_result result; + size_t pos = 0; + if(formatting == FTOA_HEXA) + { + if(buf.len > size_t(2)) + { + buf.str[0] = '0'; + buf.str[1] = 'x'; + } + pos += size_t(2); + } + if(precision == -1) + result = std::to_chars(buf.str + pos, buf.str + buf.len, v, (std::chars_format)formatting); + else + result = std::to_chars(buf.str + pos, buf.str + buf.len, v, (std::chars_format)formatting, precision); + if(result.ec == std::errc()) + { + // all good, no errors. 
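+ // the number of characters written is the distance from the buffer start to the pointer returned by to_chars()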
+ C4_ASSERT(result.ptr >= buf.str); + ptrdiff_t delta = result.ptr - buf.str; + return static_cast(delta); + } + C4_ASSERT(result.ec == std::errc::value_too_large); + // This is unfortunate. + // + // When the result can't fit in the given buffer, + // std::to_chars() returns the end pointer it was originally + // given, which is useless because here we would like to know + // _exactly_ how many characters the buffer must have to fit + // the result. + // + // So we take the pessimistic view, and assume as many digits + // as could ever be required: + size_t ret = static_cast(std::numeric_limits::max_digits10); + return ret > buf.len ? ret : buf.len + 1; +} +#endif // C4CORE_HAVE_STD_TOCHARS + + +#if C4CORE_HAVE_FAST_FLOAT +template +C4_ALWAYS_INLINE bool scan_rhex(csubstr s, T *C4_RESTRICT val) noexcept +{ + C4_ASSERT(s.len > 0); + C4_ASSERT(s.str[0] != '-'); + C4_ASSERT(s.str[0] != '+'); + C4_ASSERT(!s.begins_with("0x")); + C4_ASSERT(!s.begins_with("0X")); + size_t pos = 0; + // integer part + for( ; pos < s.len; ++pos) + { + const char c = s.str[pos]; + if(c >= '0' && c <= '9') + *val = *val * T(16) + T(c - '0'); + else if(c >= 'a' && c <= 'f') + *val = *val * T(16) + T(c - 'a'); + else if(c >= 'A' && c <= 'F') + *val = *val * T(16) + T(c - 'A'); + else if(c == '.') + { + ++pos; + break; // follow on to mantissa + } + else if(c == 'p' || c == 'P') + { + ++pos; + goto power; // no mantissa given, jump to power + } + else + { + return false; + } + } + // mantissa + { + // 0.0625 == 1/16 == value of first digit after the comma + for(T digit = T(0.0625); pos < s.len; ++pos, digit /= T(16)) + { + const char c = s.str[pos]; + if(c >= '0' && c <= '9') + *val += digit * T(c - '0'); + else if(c >= 'a' && c <= 'f') + *val += digit * T(c - 'a'); + else if(c >= 'A' && c <= 'F') + *val += digit * T(c - 'A'); + else if(c == 'p' || c == 'P') + { + ++pos; + goto power; // mantissa finished, jump to power + } + else + { + return false; + } + } + } + return true; +power: + if(C4_LIKELY(pos < s.len)) + { + if(s.str[pos] == '+') // atoi() cannot handle a leading '+' + ++pos; + if(C4_LIKELY(pos < s.len)) + { + int16_t powval = {}; + if(C4_LIKELY(atoi(s.sub(pos), &powval))) + { + *val *= ipow(powval); + return true; + } + } + } + return false; +} +#endif + +} // namespace detail + + +#undef _c4appendhex +#undef _c4append + + +/** Convert a single-precision real number to string. The string will + * in general be NOT null-terminated. For FTOA_FLEX, \p precision is + * the number of significand digits. Otherwise \p precision is the + * number of decimals. It is safe to call this function with an empty + * or too-small buffer. + * + * @return the size of the buffer needed to write the number + */ +C4_ALWAYS_INLINE size_t ftoa(substr str, float v, int precision=-1, RealFormat_e formatting=FTOA_FLEX) noexcept +{ +#if C4CORE_HAVE_STD_TOCHARS + return detail::rtoa(str, v, precision, formatting); +#else + char fmt[16]; + detail::get_real_format_str(fmt, precision, formatting, /*length_modifier*/""); + return detail::print_one(str, fmt, v); +#endif +} + + +/** Convert a double-precision real number to string. The string will + * in general be NOT null-terminated. For FTOA_FLEX, \p precision is + * the number of significand digits. Otherwise \p precision is the + * number of decimals. It is safe to call this function with an empty + * or too-small buffer. 
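+ * (illustrative: with these semantics, dtoa(buf, 3.14159, 2, FTOA_FLOAT) would be expected to yield "3.14", whereas FTOA_FLEX with the same precision would yield "3.1", since there precision counts significand digits)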
+ * + * @return the size of the buffer needed to write the number + */ +C4_ALWAYS_INLINE size_t dtoa(substr str, double v, int precision=-1, RealFormat_e formatting=FTOA_FLEX) noexcept +{ +#if C4CORE_HAVE_STD_TOCHARS + return detail::rtoa(str, v, precision, formatting); +#else + char fmt[16]; + detail::get_real_format_str(fmt, precision, formatting, /*length_modifier*/"l"); + return detail::print_one(str, fmt, v); +#endif +} + + +/** Convert a string to a single precision real number. + * The input string must be trimmed to the value, ie + * no leading or trailing whitespace can be present. + * @return true iff the conversion succeeded + * @see atof_first() if the string is not trimmed + */ +C4_ALWAYS_INLINE bool atof(csubstr str, float * C4_RESTRICT v) noexcept +{ + C4_ASSERT(str.len > 0); + C4_ASSERT(str.triml(" \r\t\n").len == str.len); +#if C4CORE_HAVE_FAST_FLOAT + // fastfloat cannot parse hexadecimal floats + bool isneg = (str.str[0] == '-'); + csubstr rem = str.sub(isneg || str.str[0] == '+'); + if(!(rem.len >= 2 && (rem.str[0] == '0' && (rem.str[1] == 'x' || rem.str[1] == 'X')))) + { + fast_float::from_chars_result result; + result = fast_float::from_chars(str.str, str.str + str.len, *v); + return result.ec == std::errc(); + } + else if(detail::scan_rhex(rem.sub(2), v)) + { + *v *= isneg ? -1.f : 1.f; + return true; + } + return false; +#elif C4CORE_HAVE_STD_FROMCHARS + std::from_chars_result result; + result = std::from_chars(str.str, str.str + str.len, *v); + return result.ec == std::errc(); +#else + csubstr rem = str.sub(str.str[0] == '-' || str.str[0] == '+'); + if(!(rem.len >= 2 && (rem.str[0] == '0' && (rem.str[1] == 'x' || rem.str[1] == 'X')))) + return detail::scan_one(str, "f", v) != csubstr::npos; + else + return detail::scan_one(str, "a", v) != csubstr::npos; +#endif +} + + +/** Convert a string to a double precision real number. + * The input string must be trimmed to the value, ie + * no leading or trailing whitespace can be present. + * @return true iff the conversion succeeded + * @see atod_first() if the string is not trimmed + */ +C4_ALWAYS_INLINE bool atod(csubstr str, double * C4_RESTRICT v) noexcept +{ + C4_ASSERT(str.triml(" \r\t\n").len == str.len); +#if C4CORE_HAVE_FAST_FLOAT + // fastfloat cannot parse hexadecimal floats + bool isneg = (str.str[0] == '-'); + csubstr rem = str.sub(isneg || str.str[0] == '+'); + if(!(rem.len >= 2 && (rem.str[0] == '0' && (rem.str[1] == 'x' || rem.str[1] == 'X')))) + { + fast_float::from_chars_result result; + result = fast_float::from_chars(str.str, str.str + str.len, *v); + return result.ec == std::errc(); + } + else if(detail::scan_rhex(rem.sub(2), v)) + { + *v *= isneg ? -1. : 1.; + return true; + } + return false; +#elif C4CORE_HAVE_STD_FROMCHARS + std::from_chars_result result; + result = std::from_chars(str.str, str.str + str.len, *v); + return result.ec == std::errc(); +#else + csubstr rem = str.sub(str.str[0] == '-' || str.str[0] == '+'); + if(!(rem.len >= 2 && (rem.str[0] == '0' && (rem.str[1] == 'x' || rem.str[1] == 'X')))) + return detail::scan_one(str, "lf", v) != csubstr::npos; + else + return detail::scan_one(str, "la", v) != csubstr::npos; +#endif +} + + +/** Convert a string to a single precision real number. + * Leading whitespace is skipped until valid characters are found. 
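+ * (illustrative: for the input " 2.5x", 2.5f would be stored and 4 returned, i.e. the offset just past the parsed number)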
+ * @return the number of characters read from the string, or npos if + * conversion was not successful or if the string was empty */ +inline size_t atof_first(csubstr str, float * C4_RESTRICT v) noexcept +{ + csubstr trimmed = str.first_real_span(); + if(trimmed.len == 0) + return csubstr::npos; + if(atof(trimmed, v)) + return static_cast(trimmed.end() - str.begin()); + return csubstr::npos; +} + + +/** Convert a string to a double precision real number. + * Leading whitespace is skipped until valid characters are found. + * @return the number of characters read from the string, or npos if + * conversion was not successful or if the string was empty */ +inline size_t atod_first(csubstr str, double * C4_RESTRICT v) noexcept +{ + csubstr trimmed = str.first_real_span(); + if(trimmed.len == 0) + return csubstr::npos; + if(atod(trimmed, v)) + return static_cast(trimmed.end() - str.begin()); + return csubstr::npos; +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// generic versions + +C4_ALWAYS_INLINE size_t xtoa(substr s, uint8_t v) noexcept { return write_dec(s, v); } +C4_ALWAYS_INLINE size_t xtoa(substr s, uint16_t v) noexcept { return write_dec(s, v); } +C4_ALWAYS_INLINE size_t xtoa(substr s, uint32_t v) noexcept { return write_dec(s, v); } +C4_ALWAYS_INLINE size_t xtoa(substr s, uint64_t v) noexcept { return write_dec(s, v); } +C4_ALWAYS_INLINE size_t xtoa(substr s, int8_t v) noexcept { return itoa(s, v); } +C4_ALWAYS_INLINE size_t xtoa(substr s, int16_t v) noexcept { return itoa(s, v); } +C4_ALWAYS_INLINE size_t xtoa(substr s, int32_t v) noexcept { return itoa(s, v); } +C4_ALWAYS_INLINE size_t xtoa(substr s, int64_t v) noexcept { return itoa(s, v); } +C4_ALWAYS_INLINE size_t xtoa(substr s, float v) noexcept { return ftoa(s, v); } +C4_ALWAYS_INLINE size_t xtoa(substr s, double v) noexcept { return dtoa(s, v); } + +C4_ALWAYS_INLINE size_t xtoa(substr s, uint8_t v, uint8_t radix) noexcept { return utoa(s, v, radix); } +C4_ALWAYS_INLINE size_t xtoa(substr s, uint16_t v, uint16_t radix) noexcept { return utoa(s, v, radix); } +C4_ALWAYS_INLINE size_t xtoa(substr s, uint32_t v, uint32_t radix) noexcept { return utoa(s, v, radix); } +C4_ALWAYS_INLINE size_t xtoa(substr s, uint64_t v, uint64_t radix) noexcept { return utoa(s, v, radix); } +C4_ALWAYS_INLINE size_t xtoa(substr s, int8_t v, int8_t radix) noexcept { return itoa(s, v, radix); } +C4_ALWAYS_INLINE size_t xtoa(substr s, int16_t v, int16_t radix) noexcept { return itoa(s, v, radix); } +C4_ALWAYS_INLINE size_t xtoa(substr s, int32_t v, int32_t radix) noexcept { return itoa(s, v, radix); } +C4_ALWAYS_INLINE size_t xtoa(substr s, int64_t v, int64_t radix) noexcept { return itoa(s, v, radix); } + +C4_ALWAYS_INLINE size_t xtoa(substr s, uint8_t v, uint8_t radix, size_t num_digits) noexcept { return utoa(s, v, radix, num_digits); } +C4_ALWAYS_INLINE size_t xtoa(substr s, uint16_t v, uint16_t radix, size_t num_digits) noexcept { return utoa(s, v, radix, num_digits); } +C4_ALWAYS_INLINE size_t xtoa(substr s, uint32_t v, uint32_t radix, size_t num_digits) noexcept { return utoa(s, v, radix, num_digits); } +C4_ALWAYS_INLINE size_t xtoa(substr s, uint64_t v, uint64_t radix, size_t num_digits) noexcept { return utoa(s, v, radix, num_digits); } +C4_ALWAYS_INLINE size_t xtoa(substr s, int8_t v, int8_t radix, size_t num_digits) noexcept { return itoa(s, 
v, radix, num_digits); } +C4_ALWAYS_INLINE size_t xtoa(substr s, int16_t v, int16_t radix, size_t num_digits) noexcept { return itoa(s, v, radix, num_digits); } +C4_ALWAYS_INLINE size_t xtoa(substr s, int32_t v, int32_t radix, size_t num_digits) noexcept { return itoa(s, v, radix, num_digits); } +C4_ALWAYS_INLINE size_t xtoa(substr s, int64_t v, int64_t radix, size_t num_digits) noexcept { return itoa(s, v, radix, num_digits); } + +C4_ALWAYS_INLINE size_t xtoa(substr s, float v, int precision, RealFormat_e formatting=FTOA_FLEX) noexcept { return ftoa(s, v, precision, formatting); } +C4_ALWAYS_INLINE size_t xtoa(substr s, double v, int precision, RealFormat_e formatting=FTOA_FLEX) noexcept { return dtoa(s, v, precision, formatting); } + +C4_ALWAYS_INLINE bool atox(csubstr s, uint8_t *C4_RESTRICT v) noexcept { return atou(s, v); } +C4_ALWAYS_INLINE bool atox(csubstr s, uint16_t *C4_RESTRICT v) noexcept { return atou(s, v); } +C4_ALWAYS_INLINE bool atox(csubstr s, uint32_t *C4_RESTRICT v) noexcept { return atou(s, v); } +C4_ALWAYS_INLINE bool atox(csubstr s, uint64_t *C4_RESTRICT v) noexcept { return atou(s, v); } +C4_ALWAYS_INLINE bool atox(csubstr s, int8_t *C4_RESTRICT v) noexcept { return atoi(s, v); } +C4_ALWAYS_INLINE bool atox(csubstr s, int16_t *C4_RESTRICT v) noexcept { return atoi(s, v); } +C4_ALWAYS_INLINE bool atox(csubstr s, int32_t *C4_RESTRICT v) noexcept { return atoi(s, v); } +C4_ALWAYS_INLINE bool atox(csubstr s, int64_t *C4_RESTRICT v) noexcept { return atoi(s, v); } +C4_ALWAYS_INLINE bool atox(csubstr s, float *C4_RESTRICT v) noexcept { return atof(s, v); } +C4_ALWAYS_INLINE bool atox(csubstr s, double *C4_RESTRICT v) noexcept { return atod(s, v); } + +C4_ALWAYS_INLINE size_t to_chars(substr buf, uint8_t v) noexcept { return write_dec(buf, v); } +C4_ALWAYS_INLINE size_t to_chars(substr buf, uint16_t v) noexcept { return write_dec(buf, v); } +C4_ALWAYS_INLINE size_t to_chars(substr buf, uint32_t v) noexcept { return write_dec(buf, v); } +C4_ALWAYS_INLINE size_t to_chars(substr buf, uint64_t v) noexcept { return write_dec(buf, v); } +C4_ALWAYS_INLINE size_t to_chars(substr buf, int8_t v) noexcept { return itoa(buf, v); } +C4_ALWAYS_INLINE size_t to_chars(substr buf, int16_t v) noexcept { return itoa(buf, v); } +C4_ALWAYS_INLINE size_t to_chars(substr buf, int32_t v) noexcept { return itoa(buf, v); } +C4_ALWAYS_INLINE size_t to_chars(substr buf, int64_t v) noexcept { return itoa(buf, v); } +C4_ALWAYS_INLINE size_t to_chars(substr buf, float v) noexcept { return ftoa(buf, v); } +C4_ALWAYS_INLINE size_t to_chars(substr buf, double v) noexcept { return dtoa(buf, v); } + +C4_ALWAYS_INLINE bool from_chars(csubstr buf, uint8_t *C4_RESTRICT v) noexcept { return atou(buf, v); } +C4_ALWAYS_INLINE bool from_chars(csubstr buf, uint16_t *C4_RESTRICT v) noexcept { return atou(buf, v); } +C4_ALWAYS_INLINE bool from_chars(csubstr buf, uint32_t *C4_RESTRICT v) noexcept { return atou(buf, v); } +C4_ALWAYS_INLINE bool from_chars(csubstr buf, uint64_t *C4_RESTRICT v) noexcept { return atou(buf, v); } +C4_ALWAYS_INLINE bool from_chars(csubstr buf, int8_t *C4_RESTRICT v) noexcept { return atoi(buf, v); } +C4_ALWAYS_INLINE bool from_chars(csubstr buf, int16_t *C4_RESTRICT v) noexcept { return atoi(buf, v); } +C4_ALWAYS_INLINE bool from_chars(csubstr buf, int32_t *C4_RESTRICT v) noexcept { return atoi(buf, v); } +C4_ALWAYS_INLINE bool from_chars(csubstr buf, int64_t *C4_RESTRICT v) noexcept { return atoi(buf, v); } +C4_ALWAYS_INLINE bool from_chars(csubstr buf, float *C4_RESTRICT v) noexcept { 
return atof(buf, v); } +C4_ALWAYS_INLINE bool from_chars(csubstr buf, double *C4_RESTRICT v) noexcept { return atod(buf, v); } + +C4_ALWAYS_INLINE size_t from_chars_first(csubstr buf, uint8_t *C4_RESTRICT v) noexcept { return atou_first(buf, v); } +C4_ALWAYS_INLINE size_t from_chars_first(csubstr buf, uint16_t *C4_RESTRICT v) noexcept { return atou_first(buf, v); } +C4_ALWAYS_INLINE size_t from_chars_first(csubstr buf, uint32_t *C4_RESTRICT v) noexcept { return atou_first(buf, v); } +C4_ALWAYS_INLINE size_t from_chars_first(csubstr buf, uint64_t *C4_RESTRICT v) noexcept { return atou_first(buf, v); } +C4_ALWAYS_INLINE size_t from_chars_first(csubstr buf, int8_t *C4_RESTRICT v) noexcept { return atoi_first(buf, v); } +C4_ALWAYS_INLINE size_t from_chars_first(csubstr buf, int16_t *C4_RESTRICT v) noexcept { return atoi_first(buf, v); } +C4_ALWAYS_INLINE size_t from_chars_first(csubstr buf, int32_t *C4_RESTRICT v) noexcept { return atoi_first(buf, v); } +C4_ALWAYS_INLINE size_t from_chars_first(csubstr buf, int64_t *C4_RESTRICT v) noexcept { return atoi_first(buf, v); } +C4_ALWAYS_INLINE size_t from_chars_first(csubstr buf, float *C4_RESTRICT v) noexcept { return atof_first(buf, v); } +C4_ALWAYS_INLINE size_t from_chars_first(csubstr buf, double *C4_RESTRICT v) noexcept { return atod_first(buf, v); } + + +//----------------------------------------------------------------------------- +// on some platforms, (unsigned) int and (unsigned) long +// are not any of the fixed length types above + +#define _C4_IF_NOT_FIXED_LENGTH_I(T, ty) C4_ALWAYS_INLINE typename std::enable_if::value && !is_fixed_length::value_i, ty> +#define _C4_IF_NOT_FIXED_LENGTH_U(T, ty) C4_ALWAYS_INLINE typename std::enable_if::value && !is_fixed_length::value_u, ty> + +template _C4_IF_NOT_FIXED_LENGTH_I(T, size_t)::type xtoa(substr buf, T v) noexcept { return itoa(buf, v); } +template _C4_IF_NOT_FIXED_LENGTH_U(T, size_t)::type xtoa(substr buf, T v) noexcept { return write_dec(buf, v); } + +template _C4_IF_NOT_FIXED_LENGTH_I(T, bool )::type atox(csubstr buf, T *C4_RESTRICT v) noexcept { return atoi(buf, v); } +template _C4_IF_NOT_FIXED_LENGTH_U(T, bool )::type atox(csubstr buf, T *C4_RESTRICT v) noexcept { return atou(buf, v); } + +template _C4_IF_NOT_FIXED_LENGTH_I(T, size_t)::type to_chars(substr buf, T v) noexcept { return itoa(buf, v); } +template _C4_IF_NOT_FIXED_LENGTH_U(T, size_t)::type to_chars(substr buf, T v) noexcept { return write_dec(buf, v); } + +template _C4_IF_NOT_FIXED_LENGTH_I(T, bool )::type from_chars(csubstr buf, T *C4_RESTRICT v) noexcept { return atoi(buf, v); } +template _C4_IF_NOT_FIXED_LENGTH_U(T, bool )::type from_chars(csubstr buf, T *C4_RESTRICT v) noexcept { return atou(buf, v); } + +template _C4_IF_NOT_FIXED_LENGTH_I(T, size_t)::type from_chars_first(csubstr buf, T *C4_RESTRICT v) noexcept { return atoi_first(buf, v); } +template _C4_IF_NOT_FIXED_LENGTH_U(T, size_t)::type from_chars_first(csubstr buf, T *C4_RESTRICT v) noexcept { return atou_first(buf, v); } + +#undef _C4_IF_NOT_FIXED_LENGTH_I +#undef _C4_IF_NOT_FIXED_LENGTH_U + + +//----------------------------------------------------------------------------- +// for pointers + +template C4_ALWAYS_INLINE size_t xtoa(substr s, T *v) noexcept { return itoa(s, (intptr_t)v, (intptr_t)16); } +template C4_ALWAYS_INLINE bool atox(csubstr s, T **v) noexcept { intptr_t tmp; bool ret = atox(s, &tmp); if(ret) { *v = (T*)tmp; } return ret; } +template C4_ALWAYS_INLINE size_t to_chars(substr s, T *v) noexcept { return itoa(s, (intptr_t)v, (intptr_t)16); } 
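+// (pointer values are serialized through their integer representation, written and read back as hexadecimal)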
+template C4_ALWAYS_INLINE bool from_chars(csubstr buf, T **v) noexcept { intptr_t tmp; bool ret = from_chars(buf, &tmp); if(ret) { *v = (T*)tmp; } return ret; } +template C4_ALWAYS_INLINE size_t from_chars_first(csubstr buf, T **v) noexcept { intptr_t tmp; bool ret = from_chars_first(buf, &tmp); if(ret) { *v = (T*)tmp; } return ret; } + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +/** call to_chars() and return a substr consisting of the + * written portion of the input buffer. Ie, same as to_chars(), + * but return a substr instead of a size_t. + * + * @see to_chars() */ +template +C4_ALWAYS_INLINE substr to_chars_sub(substr buf, T const& C4_RESTRICT v) noexcept +{ + size_t sz = to_chars(buf, v); + return buf.left_of(sz <= buf.len ? sz : buf.len); +} + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// bool implementation + +C4_ALWAYS_INLINE size_t to_chars(substr buf, bool v) noexcept +{ + int val = v; + return to_chars(buf, val); +} + +inline bool from_chars(csubstr buf, bool * C4_RESTRICT v) noexcept +{ + if(buf == '0') + { + *v = false; return true; + } + else if(buf == '1') + { + *v = true; return true; + } + else if(buf == "false") + { + *v = false; return true; + } + else if(buf == "true") + { + *v = true; return true; + } + else if(buf == "False") + { + *v = false; return true; + } + else if(buf == "True") + { + *v = true; return true; + } + else if(buf == "FALSE") + { + *v = false; return true; + } + else if(buf == "TRUE") + { + *v = true; return true; + } + // fallback to c-style int bools + int val = 0; + bool ret = from_chars(buf, &val); + if(C4_LIKELY(ret)) + { + *v = (val != 0); + } + return ret; +} + +inline size_t from_chars_first(csubstr buf, bool * C4_RESTRICT v) noexcept +{ + csubstr trimmed = buf.first_non_empty_span(); + if(trimmed.len == 0 || !from_chars(buf, v)) + return csubstr::npos; + return trimmed.len; +} + + +//----------------------------------------------------------------------------- +// single-char implementation + +inline size_t to_chars(substr buf, char v) noexcept +{ + if(buf.len > 0) + buf[0] = v; + return 1; +} + +/** extract a single character from a substring + * @note to extract a string instead and not just a single character, use the csubstr overload */ +inline bool from_chars(csubstr buf, char * C4_RESTRICT v) noexcept +{ + if(buf.len != 1) + return false; + *v = buf[0]; + return true; +} + +inline size_t from_chars_first(csubstr buf, char * C4_RESTRICT v) noexcept +{ + if(buf.len < 1) + return csubstr::npos; + *v = buf[0]; + return 1; +} + + +//----------------------------------------------------------------------------- +// csubstr implementation + +inline size_t to_chars(substr buf, csubstr v) noexcept +{ + C4_ASSERT(!buf.overlaps(v)); + size_t len = buf.len < v.len ? buf.len : v.len; + // calling memcpy with null strings is undefined behavior + // and will wreak havoc in calling code's branches. 
+ // see https://github.com/biojppm/rapidyaml/pull/264#issuecomment-1262133637 + if(len) + { + C4_ASSERT(buf.str != nullptr); + C4_ASSERT(v.str != nullptr); + memcpy(buf.str, v.str, len); + } + return v.len; +} + +inline bool from_chars(csubstr buf, csubstr *C4_RESTRICT v) noexcept +{ + *v = buf; + return true; +} + +inline size_t from_chars_first(substr buf, csubstr * C4_RESTRICT v) noexcept +{ + csubstr trimmed = buf.first_non_empty_span(); + if(trimmed.len == 0) + return csubstr::npos; + *v = trimmed; + return static_cast(trimmed.end() - buf.begin()); +} + + +//----------------------------------------------------------------------------- +// substr + +inline size_t to_chars(substr buf, substr v) noexcept +{ + C4_ASSERT(!buf.overlaps(v)); + size_t len = buf.len < v.len ? buf.len : v.len; + // calling memcpy with null strings is undefined behavior + // and will wreak havoc in calling code's branches. + // see https://github.com/biojppm/rapidyaml/pull/264#issuecomment-1262133637 + if(len) + { + C4_ASSERT(buf.str != nullptr); + C4_ASSERT(v.str != nullptr); + memcpy(buf.str, v.str, len); + } + return v.len; +} + +inline bool from_chars(csubstr buf, substr * C4_RESTRICT v) noexcept +{ + C4_ASSERT(!buf.overlaps(*v)); + // is the destination buffer wide enough? + if(v->len >= buf.len) + { + // calling memcpy with null strings is undefined behavior + // and will wreak havoc in calling code's branches. + // see https://github.com/biojppm/rapidyaml/pull/264#issuecomment-1262133637 + if(buf.len) + { + C4_ASSERT(buf.str != nullptr); + C4_ASSERT(v->str != nullptr); + memcpy(v->str, buf.str, buf.len); + } + v->len = buf.len; + return true; + } + return false; +} + +inline size_t from_chars_first(csubstr buf, substr * C4_RESTRICT v) noexcept +{ + csubstr trimmed = buf.first_non_empty_span(); + C4_ASSERT(!trimmed.overlaps(*v)); + if(C4_UNLIKELY(trimmed.len == 0)) + return csubstr::npos; + size_t len = trimmed.len > v->len ? v->len : trimmed.len; + // calling memcpy with null strings is undefined behavior + // and will wreak havoc in calling code's branches. 
+ // see https://github.com/biojppm/rapidyaml/pull/264#issuecomment-1262133637 + if(len) + { + C4_ASSERT(buf.str != nullptr); + C4_ASSERT(v->str != nullptr); + memcpy(v->str, trimmed.str, len); + } + if(C4_UNLIKELY(trimmed.len > v->len)) + return csubstr::npos; + return static_cast(trimmed.end() - buf.begin()); +} + + +//----------------------------------------------------------------------------- + +template +inline size_t to_chars(substr buf, const char (& C4_RESTRICT v)[N]) noexcept +{ + csubstr sp(v); + return to_chars(buf, sp); +} + +inline size_t to_chars(substr buf, const char * C4_RESTRICT v) noexcept +{ + return to_chars(buf, to_csubstr(v)); +} + +} // namespace c4 + +#ifdef _MSC_VER +# pragma warning(pop) +#elif defined(__clang__) +# pragma clang diagnostic pop +#elif defined(__GNUC__) +# pragma GCC diagnostic pop +#endif + +#endif /* _C4_CHARCONV_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/charconv.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/utf.hpp +// https://github.com/biojppm/c4core/src/c4/utf.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef C4_UTF_HPP_ +#define C4_UTF_HPP_ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/language.hpp +//#include "c4/language.hpp" +#if !defined(C4_LANGUAGE_HPP_) && !defined(_C4_LANGUAGE_HPP_) +#error "amalgamate: file c4/language.hpp must have been included at this point" +#endif /* C4_LANGUAGE_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/substr_fwd.hpp +//#include "c4/substr_fwd.hpp" +#if !defined(C4_SUBSTR_FWD_HPP_) && !defined(_C4_SUBSTR_FWD_HPP_) +#error "amalgamate: file c4/substr_fwd.hpp must have been included at this point" +#endif /* C4_SUBSTR_FWD_HPP_ */ + +//included above: +//#include +//included above: +//#include + +namespace c4 { + +substr decode_code_point(substr out, csubstr code_point); +size_t decode_code_point(uint8_t *C4_RESTRICT buf, size_t buflen, const uint32_t code); + +} // namespace c4 + +#endif // C4_UTF_HPP_ + + +// (end https://github.com/biojppm/c4core/src/c4/utf.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/format.hpp +// https://github.com/biojppm/c4core/src/c4/format.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_FORMAT_HPP_ +#define _C4_FORMAT_HPP_ + +/** @file format.hpp provides type-safe facilities for formatting arguments + * to string buffers */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/charconv.hpp +//#include "c4/charconv.hpp" +#if !defined(C4_CHARCONV_HPP_) && !defined(_C4_CHARCONV_HPP_) +#error "amalgamate: file c4/charconv.hpp must have been included at this point" +#endif /* C4_CHARCONV_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/blob.hpp +//#include "c4/blob.hpp" +#if !defined(C4_BLOB_HPP_) && !defined(_C4_BLOB_HPP_) +#error "amalgamate: file c4/blob.hpp must have been included at this point" +#endif /* C4_BLOB_HPP_ */ + + + +#ifdef _MSC_VER +# pragma 
warning(push) +# if C4_MSVC_VERSION != C4_MSVC_VERSION_2017 +# pragma warning(disable: 4800) // forcing value to bool 'true' or 'false' (performance warning) +# endif +# pragma warning(disable: 4996) // snprintf/scanf: this function or variable may be unsafe +#elif defined(__clang__) +# pragma clang diagnostic push +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wuseless-cast" +#endif + +namespace c4 { + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// formatting truthy types as booleans + +namespace fmt { + +/** write a variable as an alphabetic boolean, ie as either true or false + * @param strict_read */ +template +struct boolalpha_ +{ + boolalpha_(T val_, bool strict_read_=false) : val(val_ ? true : false), strict_read(strict_read_) {} + bool val; + bool strict_read; +}; + +template +boolalpha_ boolalpha(T const& val, bool strict_read=false) +{ + return boolalpha_(val, strict_read); +} + +} // namespace fmt + +/** write a variable as an alphabetic boolean, ie as either true or false */ +template +inline size_t to_chars(substr buf, fmt::boolalpha_ fmt) +{ + return to_chars(buf, fmt.val ? "true" : "false"); +} + + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// formatting integral types + +namespace fmt { + +/** format an integral type with a custom radix */ +template +struct integral_ +{ + T val; + T radix; + C4_ALWAYS_INLINE integral_(T val_, T radix_) : val(val_), radix(radix_) {} +}; + +/** format an integral type with a custom radix, and pad with zeroes on the left */ +template +struct integral_padded_ +{ + T val; + T radix; + size_t num_digits; + C4_ALWAYS_INLINE integral_padded_(T val_, T radix_, size_t nd) : val(val_), radix(radix_), num_digits(nd) {} +}; + +/** format an integral type with a custom radix */ +template +C4_ALWAYS_INLINE integral_ integral(T val, T radix=10) +{ + return integral_(val, radix); +} +/** format an integral type with a custom radix */ +template +C4_ALWAYS_INLINE integral_ integral(T const* val, T radix=10) +{ + return integral_(reinterpret_cast(val), static_cast(radix)); +} +/** format an integral type with a custom radix */ +template +C4_ALWAYS_INLINE integral_ integral(std::nullptr_t, T radix=10) +{ + return integral_(intptr_t(0), static_cast(radix)); +} +/** pad the argument with zeroes on the left, with decimal radix */ +template +C4_ALWAYS_INLINE integral_padded_ zpad(T val, size_t num_digits) +{ + return integral_padded_(val, T(10), num_digits); +} +/** pad the argument with zeroes on the left */ +template +C4_ALWAYS_INLINE integral_padded_ zpad(integral_ val, size_t num_digits) +{ + return integral_padded_(val.val, val.radix, num_digits); +} +/** pad the argument with zeroes on the left */ +C4_ALWAYS_INLINE integral_padded_ zpad(std::nullptr_t, size_t num_digits) +{ + return integral_padded_(0, 16, num_digits); +} +/** pad the argument with zeroes on the left */ +template +C4_ALWAYS_INLINE integral_padded_ zpad(T const* val, size_t num_digits) +{ + return integral_padded_(reinterpret_cast(val), 16, num_digits); +} +template +C4_ALWAYS_INLINE integral_padded_ zpad(T * val, size_t num_digits) +{ + return 
integral_padded_(reinterpret_cast(val), 16, num_digits); +} + + +/** format the pointer as an hexadecimal value */ +template +inline integral_ hex(T * v) +{ + return integral_(reinterpret_cast(v), intptr_t(16)); +} +/** format the pointer as an hexadecimal value */ +template +inline integral_ hex(T const* v) +{ + return integral_(reinterpret_cast(v), intptr_t(16)); +} +/** format null as an hexadecimal value + * @overload hex */ +inline integral_ hex(std::nullptr_t) +{ + return integral_(0, intptr_t(16)); +} +/** format the integral_ argument as an hexadecimal value + * @overload hex */ +template +inline integral_ hex(T v) +{ + return integral_(v, T(16)); +} + +/** format the pointer as an octal value */ +template +inline integral_ oct(T const* v) +{ + return integral_(reinterpret_cast(v), intptr_t(8)); +} +/** format the pointer as an octal value */ +template +inline integral_ oct(T * v) +{ + return integral_(reinterpret_cast(v), intptr_t(8)); +} +/** format null as an octal value */ +inline integral_ oct(std::nullptr_t) +{ + return integral_(intptr_t(0), intptr_t(8)); +} +/** format the integral_ argument as an octal value */ +template +inline integral_ oct(T v) +{ + return integral_(v, T(8)); +} + +/** format the pointer as a binary 0-1 value + * @see c4::raw() if you want to use a binary memcpy instead of 0-1 formatting */ +template +inline integral_ bin(T const* v) +{ + return integral_(reinterpret_cast(v), intptr_t(2)); +} +/** format the pointer as a binary 0-1 value + * @see c4::raw() if you want to use a binary memcpy instead of 0-1 formatting */ +template +inline integral_ bin(T * v) +{ + return integral_(reinterpret_cast(v), intptr_t(2)); +} +/** format null as a binary 0-1 value + * @see c4::raw() if you want to use a binary memcpy instead of 0-1 formatting */ +inline integral_ bin(std::nullptr_t) +{ + return integral_(intptr_t(0), intptr_t(2)); +} +/** format the integral_ argument as a binary 0-1 value + * @see c4::raw() if you want to use a raw memcpy-based binary dump instead of 0-1 formatting */ +template +inline integral_ bin(T v) +{ + return integral_(v, T(2)); +} + + +template +struct overflow_checked_ +{ + static_assert(std::is_integral::value, "range checking only for integral types"); + C4_ALWAYS_INLINE overflow_checked_(T &val_) : val(&val_) {} + T *val; +}; +template +C4_ALWAYS_INLINE overflow_checked_ overflow_checked(T &val) +{ + return overflow_checked_(val); +} + +} // namespace fmt + +/** format an integral_ signed type */ +template +C4_ALWAYS_INLINE +typename std::enable_if::value, size_t>::type +to_chars(substr buf, fmt::integral_ fmt) +{ + return itoa(buf, fmt.val, fmt.radix); +} +/** format an integral_ signed type, pad with zeroes */ +template +C4_ALWAYS_INLINE +typename std::enable_if::value, size_t>::type +to_chars(substr buf, fmt::integral_padded_ fmt) +{ + return itoa(buf, fmt.val, fmt.radix, fmt.num_digits); +} + +/** format an integral_ unsigned type */ +template +C4_ALWAYS_INLINE +typename std::enable_if::value, size_t>::type +to_chars(substr buf, fmt::integral_ fmt) +{ + return utoa(buf, fmt.val, fmt.radix); +} +/** format an integral_ unsigned type, pad with zeroes */ +template +C4_ALWAYS_INLINE +typename std::enable_if::value, size_t>::type +to_chars(substr buf, fmt::integral_padded_ fmt) +{ + return utoa(buf, fmt.val, fmt.radix, fmt.num_digits); +} + +template +C4_ALWAYS_INLINE bool from_chars(csubstr s, fmt::overflow_checked_ wrapper) +{ + if(C4_LIKELY(!overflows(s))) + return atox(s, wrapper.val); + return false; +} + + 
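+// A minimal usage sketch for the integral formatters above (assuming the c4
+// API as declared in this header; the expected results follow from the
+// documented logic, so treat them as illustrative rather than normative):
+//
+//     char raw[32];
+//     c4::substr buf = raw;
+//     c4::to_chars(buf, c4::fmt::hex(255));    // would write "0xff"
+//     c4::to_chars(buf, c4::fmt::zpad(42, 5)); // would write "00042"
+//     uint8_t small = 0;
+//     bool ok = c4::from_chars(c4::csubstr("257"), c4::fmt::overflow_checked(small));
+//     // ok would be false: 257 does not fit in uint8_t, so no conversion is attempted
+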
+//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// formatting real types + +namespace fmt { + +template +struct real_ +{ + T val; + int precision; + RealFormat_e fmt; + real_(T v, int prec=-1, RealFormat_e f=FTOA_FLOAT) : val(v), precision(prec), fmt(f) {} +}; + +template +real_ real(T val, int precision, RealFormat_e fmt=FTOA_FLOAT) +{ + return real_(val, precision, fmt); +} + +} // namespace fmt + +inline size_t to_chars(substr buf, fmt::real_< float> fmt) { return ftoa(buf, fmt.val, fmt.precision, fmt.fmt); } +inline size_t to_chars(substr buf, fmt::real_ fmt) { return dtoa(buf, fmt.val, fmt.precision, fmt.fmt); } + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// writing raw binary data + +namespace fmt { + +/** @see blob_ */ +template +struct raw_wrapper_ : public blob_ +{ + size_t alignment; + + C4_ALWAYS_INLINE raw_wrapper_(blob_ data, size_t alignment_) noexcept + : + blob_(data), + alignment(alignment_) + { + C4_ASSERT_MSG(alignment > 0 && (alignment & (alignment - 1)) == 0, "alignment must be a power of two"); + } +}; + +using const_raw_wrapper = raw_wrapper_; +using raw_wrapper = raw_wrapper_; + +/** mark a variable to be written in raw binary format, using memcpy + * @see blob_ */ +inline const_raw_wrapper craw(cblob data, size_t alignment=alignof(max_align_t)) +{ + return const_raw_wrapper(data, alignment); +} +/** mark a variable to be written in raw binary format, using memcpy + * @see blob_ */ +inline const_raw_wrapper raw(cblob data, size_t alignment=alignof(max_align_t)) +{ + return const_raw_wrapper(data, alignment); +} +/** mark a variable to be written in raw binary format, using memcpy + * @see blob_ */ +template +inline const_raw_wrapper craw(T const& C4_RESTRICT data, size_t alignment=alignof(T)) +{ + return const_raw_wrapper(cblob(data), alignment); +} +/** mark a variable to be written in raw binary format, using memcpy + * @see blob_ */ +template +inline const_raw_wrapper raw(T const& C4_RESTRICT data, size_t alignment=alignof(T)) +{ + return const_raw_wrapper(cblob(data), alignment); +} + +/** mark a variable to be read in raw binary format, using memcpy */ +inline raw_wrapper raw(blob data, size_t alignment=alignof(max_align_t)) +{ + return raw_wrapper(data, alignment); +} +/** mark a variable to be read in raw binary format, using memcpy */ +template +inline raw_wrapper raw(T & C4_RESTRICT data, size_t alignment=alignof(T)) +{ + return raw_wrapper(blob(data), alignment); +} + +} // namespace fmt + + +/** write a variable in raw binary format, using memcpy */ +C4CORE_EXPORT size_t to_chars(substr buf, fmt::const_raw_wrapper r); + +/** read a variable in raw binary format, using memcpy */ +C4CORE_EXPORT bool from_chars(csubstr buf, fmt::raw_wrapper *r); +/** read a variable in raw binary format, using memcpy */ +inline bool from_chars(csubstr buf, fmt::raw_wrapper r) +{ + return from_chars(buf, &r); +} + +/** read a variable in raw binary format, using memcpy */ +inline size_t from_chars_first(csubstr buf, fmt::raw_wrapper *r) +{ + return from_chars(buf, r); +} +/** read a variable in raw binary format, using memcpy */ +inline size_t from_chars_first(csubstr buf, 
fmt::raw_wrapper r) +{ + return from_chars(buf, &r); +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// formatting aligned to left/right + +namespace fmt { + +template +struct left_ +{ + T val; + size_t width; + char pad; + left_(T v, size_t w, char p) : val(v), width(w), pad(p) {} +}; + +template +struct right_ +{ + T val; + size_t width; + char pad; + right_(T v, size_t w, char p) : val(v), width(w), pad(p) {} +}; + +/** mark an argument to be aligned left */ +template +left_ left(T val, size_t width, char padchar=' ') +{ + return left_(val, width, padchar); +} + +/** mark an argument to be aligned right */ +template +right_ right(T val, size_t width, char padchar=' ') +{ + return right_(val, width, padchar); +} + +} // namespace fmt + + +template +size_t to_chars(substr buf, fmt::left_ const& C4_RESTRICT align) +{ + size_t ret = to_chars(buf, align.val); + if(ret >= buf.len || ret >= align.width) + return ret > align.width ? ret : align.width; + buf.first(align.width).sub(ret).fill(align.pad); + to_chars(buf, align.val); + return align.width; +} + +template +size_t to_chars(substr buf, fmt::right_ const& C4_RESTRICT align) +{ + size_t ret = to_chars(buf, align.val); + if(ret >= buf.len || ret >= align.width) + return ret > align.width ? ret : align.width; + size_t rem = static_cast(align.width - ret); + buf.first(rem).fill(align.pad); + to_chars(buf.sub(rem), align.val); + return align.width; +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/// @cond dev +// terminates the variadic recursion +inline size_t cat(substr /*buf*/) +{ + return 0; +} +/// @endcond + + +/** serialize the arguments, concatenating them to the given fixed-size buffer. + * The buffer size is strictly respected: no writes will occur beyond its end. + * @return the number of characters needed to write all the arguments into the buffer. + * @see c4::catrs() if instead of a fixed-size buffer, a resizeable container is desired + * @see c4::uncat() for the inverse function + * @see c4::catsep() if a separator between each argument is to be used + * @see c4::format() if a format string is desired */ +template +size_t cat(substr buf, Arg const& C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + size_t num = to_chars(buf, a); + buf = buf.len >= num ? buf.sub(num) : substr{}; + num += cat(buf, more...); + return num; +} + +/** like c4::cat() but return a substr instead of a size */ +template +substr cat_sub(substr buf, Args && ...args) +{ + size_t sz = cat(buf, std::forward(args)...); + C4_CHECK(sz <= buf.len); + return {buf.str, sz <= buf.len ? sz : buf.len}; +} + + +//----------------------------------------------------------------------------- + +/// @cond dev +// terminates the variadic recursion +inline size_t uncat(csubstr /*buf*/) +{ + return 0; +} +/// @endcond + + +/** deserialize the arguments from the given buffer. + * + * @return the number of characters read from the buffer, or csubstr::npos + * if a conversion was not successful. + * @see c4::cat(). c4::uncat() is the inverse of c4::cat(). 
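+ * (illustrative: for a buffer holding "1 2", calling uncat(buf, a, sep, b) with int a; char sep; int b; would set a to 1, sep to ' ', b to 2 and return 3)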
*/ +template +size_t uncat(csubstr buf, Arg & C4_RESTRICT a, Args & C4_RESTRICT ...more) +{ + size_t out = from_chars_first(buf, &a); + if(C4_UNLIKELY(out == csubstr::npos)) + return csubstr::npos; + buf = buf.len >= out ? buf.sub(out) : substr{}; + size_t num = uncat(buf, more...); + if(C4_UNLIKELY(num == csubstr::npos)) + return csubstr::npos; + return out + num; +} + + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +namespace detail { + +template +inline size_t catsep_more(substr /*buf*/, Sep const& C4_RESTRICT /*sep*/) +{ + return 0; +} + +template +size_t catsep_more(substr buf, Sep const& C4_RESTRICT sep, Arg const& C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + size_t ret = to_chars(buf, sep), num = ret; + buf = buf.len >= ret ? buf.sub(ret) : substr{}; + ret = to_chars(buf, a); + num += ret; + buf = buf.len >= ret ? buf.sub(ret) : substr{}; + ret = catsep_more(buf, sep, more...); + num += ret; + return num; +} + +template +inline size_t uncatsep_more(csubstr /*buf*/, Sep & /*sep*/) +{ + return 0; +} + +template +size_t uncatsep_more(csubstr buf, Sep & C4_RESTRICT sep, Arg & C4_RESTRICT a, Args & C4_RESTRICT ...more) +{ + size_t ret = from_chars_first(buf, &sep), num = ret; + if(C4_UNLIKELY(ret == csubstr::npos)) + return csubstr::npos; + buf = buf.len >= ret ? buf.sub(ret) : substr{}; + ret = from_chars_first(buf, &a); + if(C4_UNLIKELY(ret == csubstr::npos)) + return csubstr::npos; + num += ret; + buf = buf.len >= ret ? buf.sub(ret) : substr{}; + ret = uncatsep_more(buf, sep, more...); + if(C4_UNLIKELY(ret == csubstr::npos)) + return csubstr::npos; + num += ret; + return num; +} + +} // namespace detail + + +/** serialize the arguments, concatenating them to the given fixed-size + * buffer, using a separator between each argument. + * The buffer size is strictly respected: no writes will occur beyond its end. + * @return the number of characters needed to write all the arguments into the buffer. + * @see c4::catseprs() if instead of a fixed-size buffer, a resizeable container is desired + * @see c4::uncatsep() for the inverse function (ie, reading instead of writing) + * @see c4::cat() if no separator is needed + * @see c4::format() if a format string is desired */ +template +size_t catsep(substr buf, Sep const& C4_RESTRICT sep, Arg const& C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + size_t num = to_chars(buf, a); + buf = buf.len >= num ? buf.sub(num) : substr{}; + num += detail::catsep_more(buf, sep, more...); + return num; +} + +/** like c4::catsep() but return a substr instead of a size + * @see c4::catsep(). c4::uncatsep() is the inverse of c4::catsep(). */ +template +substr catsep_sub(substr buf, Args && ...args) +{ + size_t sz = catsep(buf, std::forward(args)...); + C4_CHECK(sz <= buf.len); + return {buf.str, sz <= buf.len ? sz : buf.len}; +} + +/** deserialize the arguments from the given buffer, using a separator. + * + * @return the number of characters read from the buffer, or csubstr::npos + * if a conversion was not successful + * @see c4::catsep(). c4::uncatsep() is the inverse of c4::catsep(). */ +template +size_t uncatsep(csubstr buf, Sep & C4_RESTRICT sep, Arg & C4_RESTRICT a, Args & C4_RESTRICT ...more) +{ + size_t ret = from_chars_first(buf, &a), num = ret; + if(C4_UNLIKELY(ret == csubstr::npos)) + return csubstr::npos; + buf = buf.len >= ret ? 
buf.sub(ret) : substr{}; + ret = detail::uncatsep_more(buf, sep, more...); + if(C4_UNLIKELY(ret == csubstr::npos)) + return csubstr::npos; + num += ret; + return num; +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/// @cond dev +// terminates the variadic recursion +inline size_t format(substr buf, csubstr fmt) +{ + return to_chars(buf, fmt); +} +/// @endcond + + +/** using a format string, serialize the arguments into the given + * fixed-size buffer. + * The buffer size is strictly respected: no writes will occur beyond its end. + * In the format string, each argument is marked with a compact + * curly-bracket pair: {}. Arguments beyond the last curly bracket pair + * are silently ignored. For example: + * @code{.cpp} + * c4::format(buf, "the {} drank {} {}", "partier", 5, "beers"); // the partier drank 5 beers + * c4::format(buf, "the {} drank {} {}", "programmer", 6, "coffees"); // the programmer drank 6 coffees + * @endcode + * @return the number of characters needed to write into the buffer. + * @see c4::formatrs() if instead of a fixed-size buffer, a resizeable container is desired + * @see c4::unformat() for the inverse function + * @see c4::cat() if no format or separator is needed + * @see c4::catsep() if no format is needed, but a separator must be used */ +template +size_t format(substr buf, csubstr fmt, Arg const& C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + size_t pos = fmt.find("{}"); // @todo use _find_fmt() + if(C4_UNLIKELY(pos == csubstr::npos)) + return to_chars(buf, fmt); + size_t num = to_chars(buf, fmt.sub(0, pos)); + size_t out = num; + buf = buf.len >= num ? buf.sub(num) : substr{}; + num = to_chars(buf, a); + out += num; + buf = buf.len >= num ? buf.sub(num) : substr{}; + num = format(buf, fmt.sub(pos + 2), more...); + out += num; + return out; +} + +/** like c4::format() but return a substr instead of a size + * @see c4::format() + * @see c4::catsep(). uncatsep() is the inverse of catsep(). */ +template +substr format_sub(substr buf, csubstr fmt, Args const& C4_RESTRICT ...args) +{ + size_t sz = c4::format(buf, fmt, args...); + C4_CHECK(sz <= buf.len); + return {buf.str, sz <= buf.len ? sz : buf.len}; +} + + +//----------------------------------------------------------------------------- + +/// @cond dev +// terminates the variadic recursion +inline size_t unformat(csubstr /*buf*/, csubstr fmt) +{ + return fmt.len; +} +/// @endcond + + +/** using a format string, deserialize the arguments from the given + * buffer. + * @return the number of characters read from the buffer, or npos if a conversion failed. + * @see c4::format(). c4::unformat() is the inverse function to format(). */ +template +size_t unformat(csubstr buf, csubstr fmt, Arg & C4_RESTRICT a, Args & C4_RESTRICT ...more) +{ + const size_t pos = fmt.find("{}"); + if(C4_UNLIKELY(pos == csubstr::npos)) + return unformat(buf, fmt); + size_t num = pos; + size_t out = num; + buf = buf.len >= num ? buf.sub(num) : substr{}; + num = from_chars_first(buf, &a); + if(C4_UNLIKELY(num == csubstr::npos)) + return csubstr::npos; + out += num; + buf = buf.len >= num ? 
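+    // skip the characters consumed by this argument before recursing
+    // on the remainder of the format string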
buf.sub(num) : substr{}; + num = unformat(buf, fmt.sub(pos + 2), more...); + if(C4_UNLIKELY(num == csubstr::npos)) + return csubstr::npos; + out += num; + return out; +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** a tag type for marking append to container + * @see c4::catrs() */ +struct append_t {}; + +/** a tag variable + * @see c4::catrs() */ +constexpr const append_t append = {}; + + +//----------------------------------------------------------------------------- + +/** like c4::cat(), but receives a container, and resizes it as needed to contain + * the result. The container is overwritten. To append to it, use the append + * overload. + * @see c4::cat() */ +template +inline void catrs(CharOwningContainer * C4_RESTRICT cont, Args const& C4_RESTRICT ...args) +{ +retry: + substr buf = to_substr(*cont); + size_t ret = cat(buf, args...); + cont->resize(ret); + if(ret > buf.len) + goto retry; +} + +/** like c4::cat(), but creates and returns a new container sized as needed to contain + * the result. + * @see c4::cat() */ +template +inline CharOwningContainer catrs(Args const& C4_RESTRICT ...args) +{ + CharOwningContainer cont; + catrs(&cont, args...); + return cont; +} + +/** like c4::cat(), but receives a container, and appends to it instead of + * overwriting it. The container is resized as needed to contain the result. + * @return the region newly appended to the original container + * @see c4::cat() + * @see c4::catrs() */ +template +inline csubstr catrs(append_t, CharOwningContainer * C4_RESTRICT cont, Args const& C4_RESTRICT ...args) +{ + const size_t pos = cont->size(); +retry: + substr buf = to_substr(*cont).sub(pos); + size_t ret = cat(buf, args...); + cont->resize(pos + ret); + if(ret > buf.len) + goto retry; + return to_csubstr(*cont).range(pos, cont->size()); +} + + +//----------------------------------------------------------------------------- + +/// @cond dev +// terminates the recursion +template +inline void catseprs(CharOwningContainer * C4_RESTRICT, Sep const& C4_RESTRICT) +{ + return; +} +/// @end cond + + +/** like c4::catsep(), but receives a container, and resizes it as needed to contain the result. + * The container is overwritten. To append to the container use the append overload. + * @see c4::catsep() */ +template +inline void catseprs(CharOwningContainer * C4_RESTRICT cont, Sep const& C4_RESTRICT sep, Args const& C4_RESTRICT ...args) +{ +retry: + substr buf = to_substr(*cont); + size_t ret = catsep(buf, sep, args...); + cont->resize(ret); + if(ret > buf.len) + goto retry; +} + +/** like c4::catsep(), but create a new container with the result. + * @return the requested container */ +template +inline CharOwningContainer catseprs(Sep const& C4_RESTRICT sep, Args const& C4_RESTRICT ...args) +{ + CharOwningContainer cont; + catseprs(&cont, sep, args...); + return cont; +} + + +/// @cond dev +// terminates the recursion +template +inline csubstr catseprs(append_t, CharOwningContainer * C4_RESTRICT, Sep const& C4_RESTRICT) +{ + csubstr s; + return s; +} +/// @endcond + +/** like catsep(), but receives a container, and appends the arguments, resizing the + * container as needed to contain the result. The buffer is appended to. 
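+ * A short sketch (illustrative, not from the upstream docs), assuming
+ * std::string is the CharOwningContainer and that the c4 std::string interop
+ * (c4/std/string.hpp) is available so that to_substr() works on it:
+ * @code{.cpp}
+ * std::string s = "head";
+ * c4::catseprs(c4::append, &s, ',', 1, 2, 3); // s == "head1,2,3"
+ * @endcode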
+ * @return a csubstr of the appended part + * @ingroup formatting_functions */ +template +inline csubstr catseprs(append_t, CharOwningContainer * C4_RESTRICT cont, Sep const& C4_RESTRICT sep, Args const& C4_RESTRICT ...args) +{ + const size_t pos = cont->size(); +retry: + substr buf = to_substr(*cont).sub(pos); + size_t ret = catsep(buf, sep, args...); + cont->resize(pos + ret); + if(ret > buf.len) + goto retry; + return to_csubstr(*cont).range(pos, cont->size()); +} + + +//----------------------------------------------------------------------------- + +/** like c4::format(), but receives a container, and resizes it as needed + * to contain the result. The container is overwritten. To append to + * the container use the append overload. + * @see c4::format() */ +template +inline void formatrs(CharOwningContainer * C4_RESTRICT cont, csubstr fmt, Args const& C4_RESTRICT ...args) +{ +retry: + substr buf = to_substr(*cont); + size_t ret = format(buf, fmt, args...); + cont->resize(ret); + if(ret > buf.len) + goto retry; +} + +/** like c4::format(), but create a new container with the result. + * @return the requested container */ +template +inline CharOwningContainer formatrs(csubstr fmt, Args const& C4_RESTRICT ...args) +{ + CharOwningContainer cont; + formatrs(&cont, fmt, args...); + return cont; +} + +/** like format(), but receives a container, and appends the + * arguments, resizing the container as needed to contain the + * result. The buffer is appended to. + * @return the region newly appended to the original container + * @ingroup formatting_functions */ +template +inline csubstr formatrs(append_t, CharOwningContainer * C4_RESTRICT cont, csubstr fmt, Args const& C4_RESTRICT ...args) +{ + const size_t pos = cont->size(); +retry: + substr buf = to_substr(*cont).sub(pos); + size_t ret = format(buf, fmt, args...); + cont->resize(pos + ret); + if(ret > buf.len) + goto retry; + return to_csubstr(*cont).range(pos, cont->size()); +} + +} // namespace c4 + +#ifdef _MSC_VER +# pragma warning(pop) +#elif defined(__clang__) +# pragma clang diagnostic pop +#elif defined(__GNUC__) +# pragma GCC diagnostic pop +#endif + +#endif /* _C4_FORMAT_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/format.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/dump.hpp +// https://github.com/biojppm/c4core/src/c4/dump.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef C4_DUMP_HPP_ +#define C4_DUMP_HPP_ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/substr.hpp +//#include +#if !defined(C4_SUBSTR_HPP_) && !defined(_C4_SUBSTR_HPP_) +#error "amalgamate: file c4/substr.hpp must have been included at this point" +#endif /* C4_SUBSTR_HPP_ */ + + +namespace c4 { + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +/** type of the function to dump characters */ +using DumperPfn = void (*)(csubstr buf); + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- 
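+
+/* The dump() family below serializes each non-string argument into a caller
+ * provided scratch buffer and forwards the resulting characters to a dumper
+ * callback (string arguments are forwarded directly), so long output can be
+ * emitted through a small fixed buffer. An illustrative sketch, not part of
+ * the upstream sources, assuming <cstdio> is available:
+ * @code{.cpp}
+ * void to_stdout(c4::csubstr s) { std::fwrite(s.str, 1, s.len, stdout); }
+ * // ...
+ * char scratch[16];
+ * c4::format_dump(to_stdout, c4::substr(scratch, sizeof(scratch)),
+ *                 "{} + {} = {}", 1, 2, 3); // prints "1 + 2 = 3"
+ * @endcode */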
+//----------------------------------------------------------------------------- + +template +inline size_t dump(substr buf, Arg const& a) +{ + size_t sz = to_chars(buf, a); // need to serialize to the buffer + if(C4_LIKELY(sz <= buf.len)) + dumpfn(buf.first(sz)); + return sz; +} + +template +inline size_t dump(DumperFn &&dumpfn, substr buf, Arg const& a) +{ + size_t sz = to_chars(buf, a); // need to serialize to the buffer + if(C4_LIKELY(sz <= buf.len)) + dumpfn(buf.first(sz)); + return sz; +} + +template +inline size_t dump(substr buf, csubstr a) +{ + if(buf.len) + dumpfn(a); // dump directly, no need to serialize to the buffer + return 0; // no space was used in the buffer +} + +template +inline size_t dump(DumperFn &&dumpfn, substr buf, csubstr a) +{ + if(buf.len) + dumpfn(a); // dump directly, no need to serialize to the buffer + return 0; // no space was used in the buffer +} + +template +inline size_t dump(substr buf, const char (&a)[N]) +{ + if(buf.len) + dumpfn(csubstr(a)); // dump directly, no need to serialize to the buffer + return 0; // no space was used in the buffer +} + +template +inline size_t dump(DumperFn &&dumpfn, substr buf, const char (&a)[N]) +{ + if(buf.len) + dumpfn(csubstr(a)); // dump directly, no need to serialize to the buffer + return 0; // no space was used in the buffer +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** */ +struct DumpResults +{ + enum : size_t { noarg = (size_t)-1 }; + size_t bufsize = 0; + size_t lastok = noarg; + bool success_until(size_t expected) const { return lastok == noarg ? false : lastok >= expected; } + bool write_arg(size_t arg) const { return lastok == noarg || arg > lastok; } + size_t argfail() const { return lastok + 1; } +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/// @cond dev +// terminates the variadic recursion +template +size_t cat_dump(DumperFn &&, substr) +{ + return 0; +} + +// terminates the variadic recursion +template +size_t cat_dump(substr) +{ + return 0; +} +/// @endcond + +/** take the function pointer as a function argument */ +template +size_t cat_dump(DumperFn &&dumpfn, substr buf, Arg const& C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + size_t size_for_a = dump(dumpfn, buf, a); + if(C4_UNLIKELY(size_for_a > buf.len)) + buf = buf.first(0); // ensure no more calls + size_t size_for_more = cat_dump(dumpfn, buf, more...); + return size_for_more > size_for_a ? size_for_more : size_for_a; +} + +/** take the function pointer as a template argument */ +template +size_t cat_dump(substr buf, Arg const& C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + size_t size_for_a = dump(buf, a); + if(C4_LIKELY(size_for_a > buf.len)) + buf = buf.first(0); // ensure no more calls + size_t size_for_more = cat_dump(buf, more...); + return size_for_more > size_for_a ? 
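+    // report the largest buffer size required by any single argument, so the
+    // caller can grow the buffer and retry if needed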
size_for_more : size_for_a; +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/// @cond dev +namespace detail { + +// terminates the variadic recursion +template +DumpResults cat_dump_resume(size_t currarg, DumpResults results, substr buf, Arg const& C4_RESTRICT a) +{ + if(C4_LIKELY(results.write_arg(currarg))) + { + size_t sz = dump(buf, a); // yield to the specialized function + if(currarg == results.lastok + 1 && sz <= buf.len) + results.lastok = currarg; + results.bufsize = sz > results.bufsize ? sz : results.bufsize; + } + return results; +} + +// terminates the variadic recursion +template +DumpResults cat_dump_resume(size_t currarg, DumperFn &&dumpfn, DumpResults results, substr buf, Arg const& C4_RESTRICT a) +{ + if(C4_LIKELY(results.write_arg(currarg))) + { + size_t sz = dump(dumpfn, buf, a); // yield to the specialized function + if(currarg == results.lastok + 1 && sz <= buf.len) + results.lastok = currarg; + results.bufsize = sz > results.bufsize ? sz : results.bufsize; + } + return results; +} + +template +DumpResults cat_dump_resume(size_t currarg, DumpResults results, substr buf, Arg const& C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + results = detail::cat_dump_resume(currarg, results, buf, a); + return detail::cat_dump_resume(currarg + 1u, results, buf, more...); +} + +template +DumpResults cat_dump_resume(size_t currarg, DumperFn &&dumpfn, DumpResults results, substr buf, Arg const& C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + results = detail::cat_dump_resume(currarg, dumpfn, results, buf, a); + return detail::cat_dump_resume(currarg + 1u, dumpfn, results, buf, more...); +} +} // namespace detail +/// @endcond + + +template +C4_ALWAYS_INLINE DumpResults cat_dump_resume(DumpResults results, substr buf, Arg const& C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + if(results.bufsize > buf.len) + return results; + return detail::cat_dump_resume(0u, results, buf, a, more...); +} + +template +C4_ALWAYS_INLINE DumpResults cat_dump_resume(DumperFn &&dumpfn, DumpResults results, substr buf, Arg const& C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + if(results.bufsize > buf.len) + return results; + return detail::cat_dump_resume(0u, dumpfn, results, buf, a, more...); +} + +template +C4_ALWAYS_INLINE DumpResults cat_dump_resume(substr buf, Arg const& C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + return detail::cat_dump_resume(0u, DumpResults{}, buf, a, more...); +} + +template +C4_ALWAYS_INLINE DumpResults cat_dump_resume(DumperFn &&dumpfn, substr buf, Arg const& C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + return detail::cat_dump_resume(0u, dumpfn, DumpResults{}, buf, a, more...); +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/// @cond dev +// terminate the recursion +template +size_t catsep_dump(DumperFn &&, substr, Sep const& C4_RESTRICT) +{ + return 0; +} + +// terminate the recursion +template +size_t catsep_dump(substr, Sep const& C4_RESTRICT) +{ + return 0; +} +/// @endcond + +/** take the function pointer as a function argument */ +template +size_t catsep_dump(DumperFn &&dumpfn, substr buf, Sep const& C4_RESTRICT sep, Arg const& 
C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + size_t sz = dump(dumpfn, buf, a); + if(C4_UNLIKELY(sz > buf.len)) + buf = buf.first(0); // ensure no more calls + if C4_IF_CONSTEXPR (sizeof...(more) > 0) + { + size_t szsep = dump(dumpfn, buf, sep); + if(C4_UNLIKELY(szsep > buf.len)) + buf = buf.first(0); // ensure no more calls + sz = sz > szsep ? sz : szsep; + } + size_t size_for_more = catsep_dump(dumpfn, buf, sep, more...); + return size_for_more > sz ? size_for_more : sz; +} + +/** take the function pointer as a template argument */ +template +size_t catsep_dump(substr buf, Sep const& C4_RESTRICT sep, Arg const& C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + size_t sz = dump(buf, a); + if(C4_UNLIKELY(sz > buf.len)) + buf = buf.first(0); // ensure no more calls + if C4_IF_CONSTEXPR (sizeof...(more) > 0) + { + size_t szsep = dump(buf, sep); + if(C4_UNLIKELY(szsep > buf.len)) + buf = buf.first(0); // ensure no more calls + sz = sz > szsep ? sz : szsep; + } + size_t size_for_more = catsep_dump(buf, sep, more...); + return size_for_more > sz ? size_for_more : sz; +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/// @cond dev +namespace detail { +template +void catsep_dump_resume_(size_t currarg, DumpResults *C4_RESTRICT results, substr *C4_RESTRICT buf, Arg const& C4_RESTRICT a) +{ + if(C4_LIKELY(results->write_arg(currarg))) + { + size_t sz = dump(*buf, a); + results->bufsize = sz > results->bufsize ? sz : results->bufsize; + if(C4_LIKELY(sz <= buf->len)) + results->lastok = currarg; + else + buf->len = 0; + } +} + +template +void catsep_dump_resume_(size_t currarg, DumperFn &&dumpfn, DumpResults *C4_RESTRICT results, substr *C4_RESTRICT buf, Arg const& C4_RESTRICT a) +{ + if(C4_LIKELY(results->write_arg(currarg))) + { + size_t sz = dump(dumpfn, *buf, a); + results->bufsize = sz > results->bufsize ? 
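+        // keep the running maximum of the buffer size required so far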
sz : results->bufsize; + if(C4_LIKELY(sz <= buf->len)) + results->lastok = currarg; + else + buf->len = 0; + } +} + +template +C4_ALWAYS_INLINE void catsep_dump_resume(size_t currarg, DumpResults *C4_RESTRICT results, substr *C4_RESTRICT buf, Sep const& C4_RESTRICT, Arg const& C4_RESTRICT a) +{ + detail::catsep_dump_resume_(currarg, results, buf, a); +} + +template +C4_ALWAYS_INLINE void catsep_dump_resume(size_t currarg, DumperFn &&dumpfn, DumpResults *C4_RESTRICT results, substr *C4_RESTRICT buf, Sep const& C4_RESTRICT, Arg const& C4_RESTRICT a) +{ + detail::catsep_dump_resume_(currarg, dumpfn, results, buf, a); +} + +template +C4_ALWAYS_INLINE void catsep_dump_resume(size_t currarg, DumpResults *C4_RESTRICT results, substr *C4_RESTRICT buf, Sep const& C4_RESTRICT sep, Arg const& C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + detail::catsep_dump_resume_(currarg , results, buf, a); + detail::catsep_dump_resume_(currarg + 1u, results, buf, sep); + detail::catsep_dump_resume (currarg + 2u, results, buf, sep, more...); +} + +template +C4_ALWAYS_INLINE void catsep_dump_resume(size_t currarg, DumperFn &&dumpfn, DumpResults *C4_RESTRICT results, substr *C4_RESTRICT buf, Sep const& C4_RESTRICT sep, Arg const& C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + detail::catsep_dump_resume_(currarg , dumpfn, results, buf, a); + detail::catsep_dump_resume_(currarg + 1u, dumpfn, results, buf, sep); + detail::catsep_dump_resume (currarg + 2u, dumpfn, results, buf, sep, more...); +} +} // namespace detail +/// @endcond + + +template +C4_ALWAYS_INLINE DumpResults catsep_dump_resume(DumpResults results, substr buf, Sep const& C4_RESTRICT sep, Args const& C4_RESTRICT ...more) +{ + detail::catsep_dump_resume(0u, &results, &buf, sep, more...); + return results; +} + +template +C4_ALWAYS_INLINE DumpResults catsep_dump_resume(DumperFn &&dumpfn, DumpResults results, substr buf, Sep const& C4_RESTRICT sep, Args const& C4_RESTRICT ...more) +{ + detail::catsep_dump_resume(0u, dumpfn, &results, &buf, sep, more...); + return results; +} + +template +C4_ALWAYS_INLINE DumpResults catsep_dump_resume(substr buf, Sep const& C4_RESTRICT sep, Args const& C4_RESTRICT ...more) +{ + DumpResults results; + detail::catsep_dump_resume(0u, &results, &buf, sep, more...); + return results; +} + +template +C4_ALWAYS_INLINE DumpResults catsep_dump_resume(DumperFn &&dumpfn, substr buf, Sep const& C4_RESTRICT sep, Args const& C4_RESTRICT ...more) +{ + DumpResults results; + detail::catsep_dump_resume(0u, dumpfn, &results, &buf, sep, more...); + return results; +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** take the function pointer as a function argument */ +template +C4_ALWAYS_INLINE size_t format_dump(DumperFn &&dumpfn, substr buf, csubstr fmt) +{ + // we can dump without using buf + // but we'll only dump if the buffer is ok + if(C4_LIKELY(buf.len > 0 && fmt.len)) + dumpfn(fmt); + return 0u; +} + +/** take the function pointer as a function argument */ +template +C4_ALWAYS_INLINE size_t format_dump(substr buf, csubstr fmt) +{ + // we can dump without using buf + // but we'll only dump if the buffer is ok + if(C4_LIKELY(buf.len > 0 && fmt.len > 0)) + dumpfn(fmt); + return 0u; +} + +/** take the function pointer as a function argument */ +template +size_t format_dump(DumperFn &&dumpfn, substr buf, csubstr fmt, Arg const& 
C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + // we can dump without using buf + // but we'll only dump if the buffer is ok + size_t pos = fmt.find("{}"); // @todo use _find_fmt() + if(C4_UNLIKELY(pos == csubstr::npos)) + { + if(C4_LIKELY(buf.len > 0 && fmt.len > 0)) + dumpfn(fmt); + return 0u; + } + if(C4_LIKELY(buf.len > 0 && pos > 0)) + dumpfn(fmt.first(pos)); // we can dump without using buf + fmt = fmt.sub(pos + 2); // skip {} do this before assigning to pos again + pos = dump(dumpfn, buf, a); + if(C4_UNLIKELY(pos > buf.len)) + buf.len = 0; // ensure no more calls to dump + size_t size_for_more = format_dump(dumpfn, buf, fmt, more...); + return size_for_more > pos ? size_for_more : pos; +} + +/** take the function pointer as a template argument */ +template +size_t format_dump(substr buf, csubstr fmt, Arg const& C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + // we can dump without using buf + // but we'll only dump if the buffer is ok + size_t pos = fmt.find("{}"); // @todo use _find_fmt() + if(C4_UNLIKELY(pos == csubstr::npos)) + { + if(C4_LIKELY(buf.len > 0 && fmt.len > 0)) + dumpfn(fmt); + return 0u; + } + if(C4_LIKELY(buf.len > 0 && pos > 0)) + dumpfn(fmt.first(pos)); // we can dump without using buf + fmt = fmt.sub(pos + 2); // skip {} do this before assigning to pos again + pos = dump(buf, a); + if(C4_UNLIKELY(pos > buf.len)) + buf.len = 0; // ensure no more calls to dump + size_t size_for_more = format_dump(buf, fmt, more...); + return size_for_more > pos ? size_for_more : pos; +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/// @cond dev +namespace detail { + +template +DumpResults format_dump_resume(size_t currarg, DumpResults results, substr buf, csubstr fmt) +{ + // we can dump without using buf + // but we'll only dump if the buffer is ok + if(C4_LIKELY(buf.len > 0)) + { + dumpfn(fmt); + results.lastok = currarg; + } + return results; +} + +template +DumpResults format_dump_resume(size_t currarg, DumperFn &&dumpfn, DumpResults results, substr buf, csubstr fmt) +{ + // we can dump without using buf + // but we'll only dump if the buffer is ok + if(C4_LIKELY(buf.len > 0)) + { + dumpfn(fmt); + results.lastok = currarg; + } + return results; +} + +template +DumpResults format_dump_resume(size_t currarg, DumpResults results, substr buf, csubstr fmt, Arg const& C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + // we need to process the format even if we're not + // going to print the first arguments because we're resuming + size_t pos = fmt.find("{}"); // @todo use _find_fmt() + // we can dump without using buf + // but we'll only dump if the buffer is ok + if(C4_LIKELY(results.write_arg(currarg))) + { + if(C4_UNLIKELY(pos == csubstr::npos)) + { + if(C4_LIKELY(buf.len > 0)) + { + results.lastok = currarg; + dumpfn(fmt); + } + return results; + } + if(C4_LIKELY(buf.len > 0)) + { + results.lastok = currarg; + dumpfn(fmt.first(pos)); + } + } + fmt = fmt.sub(pos + 2); + if(C4_LIKELY(results.write_arg(currarg + 1))) + { + pos = dump(buf, a); + results.bufsize = pos > results.bufsize ? 
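+        // remember the largest size requested so far; lastok below records
+        // whether this argument was dumped successfully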
pos : results.bufsize; + if(C4_LIKELY(pos <= buf.len)) + results.lastok = currarg + 1; + else + buf.len = 0; + } + return detail::format_dump_resume(currarg + 2u, results, buf, fmt, more...); +} +/// @endcond + + +template +DumpResults format_dump_resume(size_t currarg, DumperFn &&dumpfn, DumpResults results, substr buf, csubstr fmt, Arg const& C4_RESTRICT a, Args const& C4_RESTRICT ...more) +{ + // we need to process the format even if we're not + // going to print the first arguments because we're resuming + size_t pos = fmt.find("{}"); // @todo use _find_fmt() + // we can dump without using buf + // but we'll only dump if the buffer is ok + if(C4_LIKELY(results.write_arg(currarg))) + { + if(C4_UNLIKELY(pos == csubstr::npos)) + { + if(C4_LIKELY(buf.len > 0)) + { + results.lastok = currarg; + dumpfn(fmt); + } + return results; + } + if(C4_LIKELY(buf.len > 0)) + { + results.lastok = currarg; + dumpfn(fmt.first(pos)); + } + } + fmt = fmt.sub(pos + 2); + if(C4_LIKELY(results.write_arg(currarg + 1))) + { + pos = dump(dumpfn, buf, a); + results.bufsize = pos > results.bufsize ? pos : results.bufsize; + if(C4_LIKELY(pos <= buf.len)) + results.lastok = currarg + 1; + else + buf.len = 0; + } + return detail::format_dump_resume(currarg + 2u, dumpfn, results, buf, fmt, more...); +} +} // namespace detail + + +template +C4_ALWAYS_INLINE DumpResults format_dump_resume(DumpResults results, substr buf, csubstr fmt, Args const& C4_RESTRICT ...more) +{ + return detail::format_dump_resume(0u, results, buf, fmt, more...); +} + +template +C4_ALWAYS_INLINE DumpResults format_dump_resume(DumperFn &&dumpfn, DumpResults results, substr buf, csubstr fmt, Args const& C4_RESTRICT ...more) +{ + return detail::format_dump_resume(0u, dumpfn, results, buf, fmt, more...); +} + + +template +C4_ALWAYS_INLINE DumpResults format_dump_resume(substr buf, csubstr fmt, Args const& C4_RESTRICT ...more) +{ + return detail::format_dump_resume(0u, DumpResults{}, buf, fmt, more...); +} + +template +C4_ALWAYS_INLINE DumpResults format_dump_resume(DumperFn &&dumpfn, substr buf, csubstr fmt, Args const& C4_RESTRICT ...more) +{ + return detail::format_dump_resume(0u, dumpfn, DumpResults{}, buf, fmt, more...); +} + + +} // namespace c4 + + +#endif /* C4_DUMP_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/dump.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/enum.hpp +// https://github.com/biojppm/c4core/src/c4/enum.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_ENUM_HPP_ +#define _C4_ENUM_HPP_ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/error.hpp +//#include "c4/error.hpp" +#if !defined(C4_ERROR_HPP_) && !defined(_C4_ERROR_HPP_) +#error "amalgamate: file c4/error.hpp must have been included at this point" +#endif /* C4_ERROR_HPP_ */ + +//included above: +//#include + +/** @file enum.hpp utilities for enums: convert to/from string + */ + + +namespace c4 { + +//! 
taken from http://stackoverflow.com/questions/15586163/c11-type-trait-to-differentiate-between-enum-class-and-regular-enum +template +using is_scoped_enum = std::integral_constant::value && !std::is_convertible::value>; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +typedef enum { + EOFFS_NONE = 0, ///< no offset + EOFFS_CLS = 1, ///< get the enum offset for the class name. @see eoffs_cls() + EOFFS_PFX = 2, ///< get the enum offset for the enum prefix. @see eoffs_pfx() + _EOFFS_LAST ///< reserved +} EnumOffsetType; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +/** A simple (proxy) container for the value-name pairs of an enum type. + * Uses linear search for finds; this could be improved for time-critical + * code. */ +template +class EnumSymbols +{ +public: + + struct Sym + { + Enum value; + const char *name; + + bool cmp(const char *s) const; + bool cmp(const char *s, size_t len) const; + + const char *name_offs(EnumOffsetType t) const; + }; + + using const_iterator = Sym const*; + +public: + + template + EnumSymbols(Sym const (&p)[N]) : m_symbols(p), m_num(N) {} + + size_t size() const { return m_num; } + bool empty() const { return m_num == 0; } + + Sym const* get(Enum v) const { auto p = find(v); C4_CHECK_MSG(p != nullptr, "could not find symbol=%zd", (std::ptrdiff_t)v); return p; } + Sym const* get(const char *s) const { auto p = find(s); C4_CHECK_MSG(p != nullptr, "could not find symbol \"%s\"", s); return p; } + Sym const* get(const char *s, size_t len) const { auto p = find(s, len); C4_CHECK_MSG(p != nullptr, "could not find symbol \"%.*s\"", len, s); return p; } + + Sym const* find(Enum v) const; + Sym const* find(const char *s) const; + Sym const* find(const char *s, size_t len) const; + + Sym const& operator[] (size_t i) const { C4_CHECK(i < m_num); return m_symbols[i]; } + + Sym const* begin() const { return m_symbols; } + Sym const* end () const { return m_symbols + m_num; } + +private: + + Sym const* m_symbols; + size_t const m_num; + +}; + +//----------------------------------------------------------------------------- +/** return an EnumSymbols object for the enum type T + * + * @warning SPECIALIZE! This needs to be specialized for each enum + * type. Failure to provide a specialization will cause a linker + * error. */ +template +EnumSymbols const esyms(); + + +/** return the offset for an enum symbol class. For example, + * eoffs_cls() would be 13=strlen("MyEnumClass::"). + * + * With this function you can announce that the full prefix (including + * an eventual enclosing class or C++11 enum class) is of a certain + * length. + * + * @warning Needs to be specialized for each enum class type that + * wants to use this. When no specialization is given, will return + * 0. */ +template +size_t eoffs_cls() +{ + return 0; +} + + +/** return the offset for an enum symbol prefix. This includes + * eoffs_cls(). With this function you can announce that the full + * prefix (including an eventual enclosing class or C++11 enum class + * plus the string prefix) is of a certain length. + * + * @warning Needs to be specialized for each enum class type that + * wants to use this. 
When no specialization is given, will return + * 0. */ +template +size_t eoffs_pfx() +{ + return 0; +} + + +template +size_t eoffs(EnumOffsetType which) +{ + switch(which) + { + case EOFFS_NONE: + return 0; + case EOFFS_CLS: + return eoffs_cls(); + case EOFFS_PFX: + { + size_t pfx = eoffs_pfx(); + return pfx > 0 ? pfx : eoffs_cls(); + } + default: + C4_ERROR("unknown offset type %d", (int)which); + return 0; + } +} + + +//----------------------------------------------------------------------------- +/** get the enum value corresponding to a c-string */ + +#ifdef __clang__ +# pragma clang diagnostic push +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# if __GNUC__ >= 6 +# pragma GCC diagnostic ignored "-Wnull-dereference" +# endif +#endif + +template +Enum str2e(const char* str) +{ + auto pairs = esyms(); + auto *p = pairs.get(str); + C4_CHECK_MSG(p != nullptr, "no valid enum pair name for '%s'", str); + return p->value; +} + +/** get the c-string corresponding to an enum value */ +template +const char* e2str(Enum e) +{ + auto es = esyms(); + auto *p = es.get(e); + C4_CHECK_MSG(p != nullptr, "no valid enum pair name"); + return p->name; +} + +/** like e2str(), but add an offset. */ +template +const char* e2stroffs(Enum e, EnumOffsetType ot=EOFFS_PFX) +{ + const char *s = e2str(e) + eoffs(ot); + return s; +} + +#ifdef __clang__ +# pragma clang diagnostic pop +#elif defined(__GNUC__) +# pragma GCC diagnostic pop +#endif + +//----------------------------------------------------------------------------- +/** Find a symbol by value. Returns nullptr when none is found */ +template +typename EnumSymbols::Sym const* EnumSymbols::find(Enum v) const +{ + for(Sym const* p = this->m_symbols, *e = p+this->m_num; p < e; ++p) + if(p->value == v) + return p; + return nullptr; +} + +/** Find a symbol by name. Returns nullptr when none is found */ +template +typename EnumSymbols::Sym const* EnumSymbols::find(const char *s) const +{ + for(Sym const* p = this->m_symbols, *e = p+this->m_num; p < e; ++p) + if(p->cmp(s)) + return p; + return nullptr; +} + +/** Find a symbol by name. Returns nullptr when none is found */ +template +typename EnumSymbols::Sym const* EnumSymbols::find(const char *s, size_t len) const +{ + for(Sym const* p = this->m_symbols, *e = p+this->m_num; p < e; ++p) + if(p->cmp(s, len)) + return p; + return nullptr; +} + +//----------------------------------------------------------------------------- +template +bool EnumSymbols::Sym::cmp(const char *s) const +{ + if(strcmp(name, s) == 0) + return true; + + for(int i = 1; i < _EOFFS_LAST; ++i) + { + auto o = eoffs((EnumOffsetType)i); + if(o > 0) + if(strcmp(name + o, s) == 0) + return true; + } + + return false; +} + +template +bool EnumSymbols::Sym::cmp(const char *s, size_t len) const +{ + if(strncmp(name, s, len) == 0) + return true; + + size_t nlen = 0; + for(int i = 1; i <_EOFFS_LAST; ++i) + { + auto o = eoffs((EnumOffsetType)i); + if(o > 0) + { + if(!nlen) + { + nlen = strlen(name); + } + C4_ASSERT(o < nlen); + size_t rem = nlen - o; + auto m = len > rem ? 
len : rem; + if(len >= m && strncmp(name + o, s, m) == 0) + return true; + } + } + + return false; +} + +//----------------------------------------------------------------------------- +template +const char* EnumSymbols::Sym::name_offs(EnumOffsetType t) const +{ + C4_ASSERT(eoffs(t) < strlen(name)); + return name + eoffs(t); +} + +} // namespace c4 + +#endif // _C4_ENUM_HPP_ + + +// (end https://github.com/biojppm/c4core/src/c4/enum.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/bitmask.hpp +// https://github.com/biojppm/c4core/src/c4/bitmask.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_BITMASK_HPP_ +#define _C4_BITMASK_HPP_ + +/** @file bitmask.hpp bitmask utilities */ + +//included above: +//#include +//included above: +//#include + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/enum.hpp +//#include "c4/enum.hpp" +#if !defined(C4_ENUM_HPP_) && !defined(_C4_ENUM_HPP_) +#error "amalgamate: file c4/enum.hpp must have been included at this point" +#endif /* C4_ENUM_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/format.hpp +//#include "c4/format.hpp" +#if !defined(C4_FORMAT_HPP_) && !defined(_C4_FORMAT_HPP_) +#error "amalgamate: file c4/format.hpp must have been included at this point" +#endif /* C4_FORMAT_HPP_ */ + + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable : 4996) // 'strncpy', fopen, etc: This function or variable may be unsafe +#elif defined(__clang__) +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# if __GNUC__ >= 8 +# pragma GCC diagnostic ignored "-Wstringop-truncation" +# pragma GCC diagnostic ignored "-Wstringop-overflow" +# endif +#endif + +namespace c4 { + +//----------------------------------------------------------------------------- +/** write a bitmask to a stream, formatted as a string */ + +template +Stream& bm2stream(Stream &s, typename std::underlying_type::type bits, EnumOffsetType offst=EOFFS_PFX) +{ + using I = typename std::underlying_type::type; + bool written = false; + + auto const& pairs = esyms(); + + // write non null value + if(bits) + { + // do reverse iteration to give preference to composite enum symbols, + // which are likely to appear at the end of the enum sequence + for(size_t i = pairs.size() - 1; i != size_t(-1); --i) + { + auto p = pairs[i]; + I b(static_cast(p.value)); + if(b && (bits & b) == b) + { + if(written) s << '|'; // append bit-or character + written = true; + s << p.name_offs(offst); // append bit string + bits &= ~b; + } + } + return s; + } + else + { + // write a null value + for(size_t i = pairs.size() - 1; i != size_t(-1); --i) + { + auto p = pairs[i]; + I b(static_cast(p.value)); + if(b == 0) + { + s << p.name_offs(offst); + written = true; + break; + } + } + } + if(!written) + { + s << '0'; + } + return s; +} + +template +typename std::enable_if::value, Stream&>::type +bm2stream(Stream &s, Enum value, EnumOffsetType offst=EOFFS_PFX) +{ + using I = typename std::underlying_type::type; + return bm2stream(s, static_cast(value), offst); +} + + +//----------------------------------------------------------------------------- + +// some utility macros, undefed below + +/// @cond dev + +/* Execute `code` if the `num` of characters is available in the str + 
* buffer. This macro simplifies the code for bm2str(). + * @todo improve performance by writing from the end and moving only once. */ +#define _c4prependchars(code, num) \ + if(str && (pos + num <= sz)) \ + { \ + /* move the current string to the right */ \ + memmove(str + num, str, pos); \ + /* now write in the beginning of the string */ \ + code; \ + } \ + else if(str && sz) \ + { \ + C4_ERROR("cannot write to string pos=%d num=%d sz=%d", \ + (int)pos, (int)num, (int)sz); \ + } \ + pos += num + +/* Execute `code` if the `num` of characters is available in the str + * buffer. This macro simplifies the code for bm2str(). */ +#define _c4appendchars(code, num) \ + if(str && (pos + num <= sz)) \ + { \ + code; \ + } \ + else if(str && sz) \ + { \ + C4_ERROR("cannot write to string pos=%d num=%d sz=%d", \ + (int)pos, (int)num, (int)sz); \ + } \ + pos += num + +/// @endcond + + +/** convert a bitmask to string. + * return the number of characters written. To find the needed size, + * call first with str=nullptr and sz=0 */ +template +size_t bm2str +( + typename std::underlying_type::type bits, + char *str=nullptr, + size_t sz=0, + EnumOffsetType offst=EOFFS_PFX +) +{ + using I = typename std::underlying_type::type; + C4_ASSERT((str == nullptr) == (sz == 0)); + + auto syms = esyms(); + size_t pos = 0; + typename EnumSymbols::Sym const* C4_RESTRICT zero = nullptr; + + // do reverse iteration to give preference to composite enum symbols, + // which are likely to appear later in the enum sequence + for(size_t i = syms.size()-1; i != size_t(-1); --i) + { + auto const &C4_RESTRICT p = syms[i]; // do not copy, we are assigning to `zero` + I b = static_cast(p.value); + if(b == 0) + { + zero = &p; // save this symbol for later + } + else if((bits & b) == b) + { + bits &= ~b; + // append bit-or character + if(pos > 0) + { + _c4prependchars(*str = '|', 1); + } + // append bit string + const char *pname = p.name_offs(offst); + size_t len = strlen(pname); + _c4prependchars(strncpy(str, pname, len), len); + } + } + + C4_CHECK_MSG(bits == 0, "could not find all bits"); + if(pos == 0) // make sure at least something is written + { + if(zero) // if we have a zero symbol, use that + { + const char *pname = zero->name_offs(offst); + size_t len = strlen(pname); + _c4prependchars(strncpy(str, pname, len), len); + } + else // otherwise just write an integer zero + { + _c4prependchars(*str = '0', 1); + } + } + _c4appendchars(str[pos] = '\0', 1); + + return pos; +} + + +// cleanup! 
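+
+/* An illustrative sketch (not from the upstream sources) of how bm2str() and
+ * str2bm() are driven by an esyms() specialization for the enum in question:
+ * @code{.cpp}
+ * enum MyFlags { Read = 1, Write = 2, ReadWrite = Read|Write };
+ * namespace c4 {
+ * template<> EnumSymbols<MyFlags> const esyms<MyFlags>()
+ * {
+ *     static const EnumSymbols<MyFlags>::Sym syms[] = {
+ *         {Read, "Read"}, {Write, "Write"}, {ReadWrite, "ReadWrite"},
+ *     };
+ *     return EnumSymbols<MyFlags>(syms);
+ * }
+ * } // namespace c4
+ * char buf[32];
+ * c4::bm2str<MyFlags>(Read|Write, buf, sizeof(buf)); // buf == "ReadWrite"
+ * auto bits = c4::str2bm<MyFlags>("Read|Write");     // bits == 3
+ * @endcode */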
+#undef _c4appendchars +#undef _c4prependchars + + +/** scoped enums do not convert automatically to their underlying type, + * so this SFINAE overload will accept scoped enum symbols and cast them + * to the underlying type */ +template +typename std::enable_if::value, size_t>::type +bm2str +( + Enum bits, + char *str=nullptr, + size_t sz=0, + EnumOffsetType offst=EOFFS_PFX +) +{ + using I = typename std::underlying_type::type; + return bm2str(static_cast(bits), str, sz, offst); +} + + +//----------------------------------------------------------------------------- + +namespace detail { + +#ifdef __clang__ +# pragma clang diagnostic push +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# if __GNUC__ >= 6 +# pragma GCC diagnostic ignored "-Wnull-dereference" +# endif +#endif + +template +typename std::underlying_type::type str2bm_read_one(const char *str, size_t sz, bool alnum) +{ + using I = typename std::underlying_type::type; + auto pairs = esyms(); + if(alnum) + { + auto *p = pairs.find(str, sz); + C4_CHECK_MSG(p != nullptr, "no valid enum pair name for '%.*s'", (int)sz, str); + return static_cast(p->value); + } + I tmp; + size_t len = uncat(csubstr(str, sz), tmp); + C4_CHECK_MSG(len != csubstr::npos, "could not read string as an integral type: '%.*s'", (int)sz, str); + return tmp; +} + +#ifdef __clang__ +# pragma clang diagnostic pop +#elif defined(__GNUC__) +# pragma GCC diagnostic pop +#endif +} // namespace detail + +/** convert a string to a bitmask */ +template +typename std::underlying_type::type str2bm(const char *str, size_t sz) +{ + using I = typename std::underlying_type::type; + + I val = 0; + bool started = false; + bool alnum = false, num = false; + const char *f = nullptr, *pc = str; + for( ; pc < str+sz; ++pc) + { + const char c = *pc; + if((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') + { + C4_CHECK(( ! num) || ((pc - f) == 1 && (c == 'x' || c == 'X'))); // accept hexadecimal numbers + if( ! started) + { + f = pc; + alnum = started = true; + } + } + else if(c >= '0' && c <= '9') + { + C4_CHECK( ! 
alnum); + if(!started) + { + f = pc; + num = started = true; + } + } + else if(c == ':' || c == ' ') + { + // skip this char + } + else if(c == '|' || c == '\0') + { + C4_ASSERT(num != alnum); + C4_ASSERT(pc >= f); + val |= detail::str2bm_read_one(f, static_cast(pc-f), alnum); + started = num = alnum = false; + if(c == '\0') + { + return val; + } + } + else + { + C4_ERROR("bad character '%c' in bitmask string", c); + } + } + + if(f) + { + C4_ASSERT(num != alnum); + C4_ASSERT(pc >= f); + val |= detail::str2bm_read_one(f, static_cast(pc-f), alnum); + } + + return val; +} + +/** convert a string to a bitmask */ +template +typename std::underlying_type::type str2bm(const char *str) +{ + return str2bm(str, strlen(str)); +} + +} // namespace c4 + +#ifdef _MSC_VER +# pragma warning(pop) +#elif defined(__clang__) +#elif defined(__GNUC__) +# pragma GCC diagnostic pop +#endif + +#endif // _C4_BITMASK_HPP_ + + +// (end https://github.com/biojppm/c4core/src/c4/bitmask.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/span.hpp +// https://github.com/biojppm/c4core/src/c4/span.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_SPAN_HPP_ +#define _C4_SPAN_HPP_ + +/** @file span.hpp Provides span classes. */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/config.hpp +//#include "c4/config.hpp" +#if !defined(C4_CONFIG_HPP_) && !defined(_C4_CONFIG_HPP_) +#error "amalgamate: file c4/config.hpp must have been included at this point" +#endif /* C4_CONFIG_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/error.hpp +//#include "c4/error.hpp" +#if !defined(C4_ERROR_HPP_) && !defined(_C4_ERROR_HPP_) +#error "amalgamate: file c4/error.hpp must have been included at this point" +#endif /* C4_ERROR_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/szconv.hpp +//#include "c4/szconv.hpp" +#if !defined(C4_SZCONV_HPP_) && !defined(_C4_SZCONV_HPP_) +#error "amalgamate: file c4/szconv.hpp must have been included at this point" +#endif /* C4_SZCONV_HPP_ */ + + +//included above: +//#include + +namespace c4 { + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +/** a crtp base for implementing span classes + * + * A span is a non-owning range of elements contiguously stored in memory. + * Unlike STL's array_view, the span allows write-access to its members. 
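+ *
+ * A brief usage sketch (illustrative, not from the upstream docs):
+ * @code{.cpp}
+ * int arr[5] = {0, 1, 2, 3, 4};
+ * c4::span<int> s(arr);               // views all 5 elements, owns nothing
+ * s[0] = 10;                          // writes through to arr[0]
+ * c4::span<int> mid = s.range(1, 4);  // views {1, 2, 3}
+ * @endcode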
+ * + * To obtain subspans from a span, the following const member functions + * are available: + * - subspan(first, num) + * - range(first, last) + * - first(num) + * - last(num) + * + * A span can also be resized via the following non-const member functions: + * - resize(sz) + * - ltrim(num) + * - rtrim(num) + * + * @see span + * @see cspan + * @see spanrs + * @see cspanrs + * @see spanrsl + * @see cspanrsl + */ +template +class span_crtp +{ +// some utility defines, undefined at the end of this class +#define _c4this ((SpanImpl *)this) +#define _c4cthis ((SpanImpl const*)this) +#define _c4ptr ((SpanImpl *)this)->m_ptr +#define _c4cptr ((SpanImpl const*)this)->m_ptr +#define _c4sz ((SpanImpl *)this)->m_size +#define _c4csz ((SpanImpl const*)this)->m_size + +public: + + _c4_DEFINE_ARRAY_TYPES(T, I); + +public: + + C4_ALWAYS_INLINE constexpr I value_size() const noexcept { return sizeof(T); } + C4_ALWAYS_INLINE constexpr I elm_size () const noexcept { return sizeof(T); } + C4_ALWAYS_INLINE constexpr I type_size () const noexcept { return sizeof(T); } + C4_ALWAYS_INLINE I byte_size () const noexcept { return _c4csz*sizeof(T); } + + C4_ALWAYS_INLINE bool empty() const noexcept { return _c4csz == 0; } + C4_ALWAYS_INLINE I size() const noexcept { return _c4csz; } + //C4_ALWAYS_INLINE I capacity() const noexcept { return _c4sz; } // this must be defined by impl classes + + C4_ALWAYS_INLINE void clear() noexcept { _c4sz = 0; } + + C4_ALWAYS_INLINE T * data() noexcept { return _c4ptr; } + C4_ALWAYS_INLINE T const* data() const noexcept { return _c4cptr; } + + C4_ALWAYS_INLINE iterator begin() noexcept { return _c4ptr; } + C4_ALWAYS_INLINE const_iterator begin() const noexcept { return _c4cptr; } + C4_ALWAYS_INLINE const_iterator cbegin() const noexcept { return _c4cptr; } + + C4_ALWAYS_INLINE iterator end() noexcept { return _c4ptr + _c4sz; } + C4_ALWAYS_INLINE const_iterator end() const noexcept { return _c4cptr + _c4csz; } + C4_ALWAYS_INLINE const_iterator cend() const noexcept { return _c4cptr + _c4csz; } + + C4_ALWAYS_INLINE reverse_iterator rbegin() noexcept { return reverse_iterator(_c4ptr + _c4sz); } + C4_ALWAYS_INLINE const_reverse_iterator rbegin() const noexcept { return reverse_iterator(_c4cptr + _c4sz); } + C4_ALWAYS_INLINE const_reverse_iterator crbegin() const noexcept { return reverse_iterator(_c4cptr + _c4sz); } + + C4_ALWAYS_INLINE reverse_iterator rend() noexcept { return const_reverse_iterator(_c4ptr); } + C4_ALWAYS_INLINE const_reverse_iterator rend() const noexcept { return const_reverse_iterator(_c4cptr); } + C4_ALWAYS_INLINE const_reverse_iterator crend() const noexcept { return const_reverse_iterator(_c4cptr); } + + C4_ALWAYS_INLINE T & front() C4_NOEXCEPT_X { C4_XASSERT(!empty()); return _c4ptr [0]; } + C4_ALWAYS_INLINE T const& front() const C4_NOEXCEPT_X { C4_XASSERT(!empty()); return _c4cptr[0]; } + + C4_ALWAYS_INLINE T & back() C4_NOEXCEPT_X { C4_XASSERT(!empty()); return _c4ptr [_c4sz - 1]; } + C4_ALWAYS_INLINE T const& back() const C4_NOEXCEPT_X { C4_XASSERT(!empty()); return _c4cptr[_c4csz - 1]; } + + C4_ALWAYS_INLINE T & operator[] (I i) C4_NOEXCEPT_X { C4_XASSERT(i >= 0 && i < _c4sz ); return _c4ptr [i]; } + C4_ALWAYS_INLINE T const& operator[] (I i) const C4_NOEXCEPT_X { C4_XASSERT(i >= 0 && i < _c4csz); return _c4cptr[i]; } + + C4_ALWAYS_INLINE SpanImpl subspan(I first, I num) const C4_NOEXCEPT_X + { + C4_XASSERT((first >= 0 && first < _c4csz) || (first == _c4csz && num == 0)); + C4_XASSERT((first + num >= 0) && (first + num <= _c4csz)); + return 
_c4cthis->_select(_c4cptr + first, num); + } + C4_ALWAYS_INLINE SpanImpl subspan(I first) const C4_NOEXCEPT_X ///< goes up until the end of the span + { + C4_XASSERT(first >= 0 && first <= _c4csz); + return _c4cthis->_select(_c4cptr + first, _c4csz - first); + } + + C4_ALWAYS_INLINE SpanImpl range(I first, I last) const C4_NOEXCEPT_X ///< last element is NOT included + { + C4_XASSERT(((first >= 0) && (first < _c4csz)) || (first == _c4csz && first == last)); + C4_XASSERT((last >= 0) && (last <= _c4csz)); + C4_XASSERT(last >= first); + return _c4cthis->_select(_c4cptr + first, last - first); + } + C4_ALWAYS_INLINE SpanImpl range(I first) const C4_NOEXCEPT_X ///< goes up until the end of the span + { + C4_XASSERT(((first >= 0) && (first <= _c4csz))); + return _c4cthis->_select(_c4cptr + first, _c4csz - first); + } + + C4_ALWAYS_INLINE SpanImpl first(I num) const C4_NOEXCEPT_X ///< get the first num elements, starting at 0 + { + C4_XASSERT((num >= 0) && (num <= _c4csz)); + return _c4cthis->_select(_c4cptr, num); + } + C4_ALWAYS_INLINE SpanImpl last(I num) const C4_NOEXCEPT_X ///< get the last num elements, starting at size()-num + { + C4_XASSERT((num >= 0) && (num <= _c4csz)); + return _c4cthis->_select(_c4cptr + _c4csz - num, num); + } + + bool is_subspan(span_crtp const& ss) const noexcept + { + if(_c4cptr == nullptr) return false; + auto *b = begin(), *e = end(); + auto *ssb = ss.begin(), *sse = ss.end(); + if(ssb >= b && sse <= e) + { + return true; + } + else + { + return false; + } + } + + /** COMPLement Left: return the complement to the left of the beginning of the given subspan. + * If ss does not begin inside this, returns an empty substring. */ + SpanImpl compll(span_crtp const& ss) const C4_NOEXCEPT_X + { + auto ssb = ss.begin(); + auto b = begin(); + auto e = end(); + if(ssb >= b && ssb <= e) + { + return subspan(0, static_cast(ssb - b)); + } + else + { + return subspan(0, 0); + } + } + + /** COMPLement Right: return the complement to the right of the end of the given subspan. + * If ss does not end inside this, returns an empty substring. */ + SpanImpl complr(span_crtp const& ss) const C4_NOEXCEPT_X + { + auto sse = ss.end(); + auto b = begin(); + auto e = end(); + if(sse >= b && sse <= e) + { + return subspan(static_cast(sse - b), static_cast(e - sse)); + } + else + { + return subspan(0, 0); + } + } + + C4_ALWAYS_INLINE bool same_span(span_crtp const& that) const noexcept + { + return size() == that.size() && data() == that.data(); + } + template + C4_ALWAYS_INLINE bool same_span(span_crtp const& that) const C4_NOEXCEPT_X + { + I tsz = szconv(that.size()); // x-asserts that the size does not overflow + return size() == tsz && data() == that.data(); + } + +#undef _c4this +#undef _c4cthis +#undef _c4ptr +#undef _c4cptr +#undef _c4sz +#undef _c4csz +}; + +//----------------------------------------------------------------------------- +template +inline constexpr bool operator== +( + span_crtp const& l, + span_crtp const& r +) +{ +#if C4_CPP >= 14 + return std::equal(l.begin(), l.end(), r.begin(), r.end()); +#else + return l.same_span(r) || std::equal(l.begin(), l.end(), r.begin()); +#endif +} + +template +inline constexpr bool operator!= +( + span_crtp const& l, + span_crtp const& r +) +{ + return ! 
(l == r); +} + +//----------------------------------------------------------------------------- +template +inline constexpr bool operator< +( + span_crtp const& l, + span_crtp const& r +) +{ + return std::lexicographical_compare(l.begin(), l.end(), r.begin(), r.end()); +} + +template +inline constexpr bool operator<= +( + span_crtp const& l, + span_crtp const& r +) +{ + return ! (l > r); +} + +//----------------------------------------------------------------------------- +template +inline constexpr bool operator> +( + span_crtp const& l, + span_crtp const& r +) +{ + return r < l; +} + +//----------------------------------------------------------------------------- +template +inline constexpr bool operator>= +( + span_crtp const& l, + span_crtp const& r +) +{ + return ! (l < r); +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +/** A non-owning span of elements contiguously stored in memory. */ +template +class span : public span_crtp> +{ + friend class span_crtp>; + + T * C4_RESTRICT m_ptr; + I m_size; + + C4_ALWAYS_INLINE span _select(T *p, I sz) const { return span(p, sz); } + +public: + + _c4_DEFINE_ARRAY_TYPES(T, I); + using NCT = typename std::remove_const::type; //!< NCT=non const type + using CT = typename std::add_const::type; //!< CT=const type + using const_type = span; + + /// convert automatically to span of const T + operator span () const { span s(m_ptr, m_size); return s; } + +public: + + C4_ALWAYS_INLINE C4_CONSTEXPR14 span() noexcept : m_ptr{nullptr}, m_size{0} {} + + span(span const&) = default; + span(span &&) = default; + + span& operator= (span const&) = default; + span& operator= (span &&) = default; + +public: + + /** @name Construction and assignment from same type */ + /** @{ */ + + template C4_ALWAYS_INLINE C4_CONSTEXPR14 span (T (&arr)[N]) noexcept : m_ptr{arr}, m_size{N} {} + template C4_ALWAYS_INLINE C4_CONSTEXPR14 void assign(T (&arr)[N]) noexcept { m_ptr = arr; m_size = N; } + + C4_ALWAYS_INLINE C4_CONSTEXPR14 span(T *p, I sz) noexcept : m_ptr{p}, m_size{sz} {} + C4_ALWAYS_INLINE C4_CONSTEXPR14 void assign(T *p, I sz) noexcept { m_ptr = p; m_size = sz; } + + C4_ALWAYS_INLINE C4_CONSTEXPR14 span (c4::aggregate_t, std::initializer_list il) noexcept : m_ptr{&*il.begin()}, m_size{il.size()} {} + C4_ALWAYS_INLINE C4_CONSTEXPR14 void assign(c4::aggregate_t, std::initializer_list il) noexcept { m_ptr = &*il.begin(); m_size = il.size(); } + + /** @} */ + +public: + + C4_ALWAYS_INLINE I capacity() const noexcept { return m_size; } + + C4_ALWAYS_INLINE void resize(I sz) C4_NOEXCEPT_A { C4_ASSERT(sz <= m_size); m_size = sz; } + C4_ALWAYS_INLINE void rtrim (I n ) C4_NOEXCEPT_A { C4_ASSERT(n >= 0 && n < m_size); m_size -= n; } + C4_ALWAYS_INLINE void ltrim (I n ) C4_NOEXCEPT_A { C4_ASSERT(n >= 0 && n < m_size); m_size -= n; m_ptr += n; } + +}; +template using cspan = span; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +/** A non-owning span resizeable up to a capacity. Subselection or resizing + * will keep the original provided it starts at begin(). If subselection or + * resizing change the pointer, then the original capacity information will + * be lost. 
+ * + * Thus, resizing via resize() and ltrim() and subselecting via first() + * or any of subspan() or range() when starting from the beginning will keep + * the original capacity. OTOH, using last(), or any of subspan() or range() + * with an offset from the start will remove from capacity (shifting the + * pointer) by the corresponding offset. If this is undesired, then consider + * using spanrsl. + * + * @see spanrs for a span resizeable on the right + * @see spanrsl for a span resizeable on the right and left + */ + +template +class spanrs : public span_crtp> +{ + friend class span_crtp>; + + T * C4_RESTRICT m_ptr; + I m_size; + I m_capacity; + + C4_ALWAYS_INLINE spanrs _select(T *p, I sz) const noexcept + { + C4_ASSERT(p >= m_ptr); + size_t delta = static_cast(p - m_ptr); + C4_ASSERT(m_capacity >= delta); + return spanrs(p, sz, static_cast(m_capacity - delta)); + } + +public: + + _c4_DEFINE_ARRAY_TYPES(T, I); + using NCT = typename std::remove_const::type; //!< NCT=non const type + using CT = typename std::add_const::type; //!< CT=const type + using const_type = spanrs; + + /// convert automatically to span of T + C4_ALWAYS_INLINE operator span () const noexcept { return span(m_ptr, m_size); } + /// convert automatically to span of const T + //C4_ALWAYS_INLINE operator span () const noexcept { span s(m_ptr, m_size); return s; } + /// convert automatically to spanrs of const T + C4_ALWAYS_INLINE operator spanrs () const noexcept { spanrs s(m_ptr, m_size, m_capacity); return s; } + +public: + + C4_ALWAYS_INLINE spanrs() noexcept : m_ptr{nullptr}, m_size{0}, m_capacity{0} {} + + spanrs(spanrs const&) = default; + spanrs(spanrs &&) = default; + + spanrs& operator= (spanrs const&) = default; + spanrs& operator= (spanrs &&) = default; + +public: + + /** @name Construction and assignment from same type */ + /** @{ */ + + C4_ALWAYS_INLINE spanrs(T *p, I sz) noexcept : m_ptr{p}, m_size{sz}, m_capacity{sz} {} + /** @warning will reset the capacity to sz */ + C4_ALWAYS_INLINE void assign(T *p, I sz) noexcept { m_ptr = p; m_size = sz; m_capacity = sz; } + + C4_ALWAYS_INLINE spanrs(T *p, I sz, I cap) noexcept : m_ptr{p}, m_size{sz}, m_capacity{cap} {} + C4_ALWAYS_INLINE void assign(T *p, I sz, I cap) noexcept { m_ptr = p; m_size = sz; m_capacity = cap; } + + template C4_ALWAYS_INLINE spanrs(T (&arr)[N]) noexcept : m_ptr{arr}, m_size{N}, m_capacity{N} {} + template C4_ALWAYS_INLINE void assign(T (&arr)[N]) noexcept { m_ptr = arr; m_size = N; m_capacity = N; } + + C4_ALWAYS_INLINE spanrs(c4::aggregate_t, std::initializer_list il) noexcept : m_ptr{il.begin()}, m_size{il.size()}, m_capacity{il.size()} {} + C4_ALWAYS_INLINE void assign(c4::aggregate_t, std::initializer_list il) noexcept { m_ptr = il.begin(); m_size = il.size(); m_capacity = il.size(); } + + /** @} */ + +public: + + C4_ALWAYS_INLINE I capacity() const noexcept { return m_capacity; } + + C4_ALWAYS_INLINE void resize(I sz) C4_NOEXCEPT_A { C4_ASSERT(sz <= m_capacity); m_size = sz; } + C4_ALWAYS_INLINE void rtrim (I n ) C4_NOEXCEPT_A { C4_ASSERT(n >= 0 && n < m_size); m_size -= n; } + C4_ALWAYS_INLINE void ltrim (I n ) C4_NOEXCEPT_A { C4_ASSERT(n >= 0 && n < m_size); m_size -= n; m_ptr += n; m_capacity -= n; } + +}; +template using cspanrs = spanrs; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +/** A non-owning span which always retains the 
capacity of the original
+ * range it was taken from (though it may lose its original size).
+ * The resizing methods resize(), ltrim(), rtrim() as well
+ * as the subselection methods subspan(), range(), first() and last() can be
+ * used at will without losing the original capacity; the full capacity span
+ * can always be recovered by calling original().
+ */
+template
+class spanrsl : public span_crtp>
+{
+    friend class span_crtp>;
+
+    T *C4_RESTRICT m_ptr;  ///< the current ptr. the original ptr is (m_ptr - m_offset).
+    I m_size;      ///< the current size. the original size is unrecoverable.
+    I m_capacity;  ///< the current capacity. the original capacity is (m_capacity + m_offset).
+    I m_offset;    ///< the offset of the current m_ptr to the start of the original memory block.
+
+    C4_ALWAYS_INLINE spanrsl _select(T *p, I sz) const noexcept
+    {
+        C4_ASSERT(p >= m_ptr);
+        I delta = static_cast(p - m_ptr);
+        C4_ASSERT(m_capacity >= delta);
+        return spanrsl(p, sz, static_cast(m_capacity - delta), m_offset + delta);
+    }
+
+public:
+
+    _c4_DEFINE_ARRAY_TYPES(T, I);
+    using NCT = typename std::remove_const::type; //!< NCT=non const type
+    using CT = typename std::add_const::type; //!< CT=const type
+    using const_type = spanrsl;
+
+    C4_ALWAYS_INLINE operator span () const noexcept { return span(m_ptr, m_size); }
+    C4_ALWAYS_INLINE operator spanrs () const noexcept { return spanrs(m_ptr, m_size, m_capacity); }
+    C4_ALWAYS_INLINE operator spanrsl () const noexcept { return spanrsl(m_ptr, m_size, m_capacity, m_offset); }
+
+public:
+
+    C4_ALWAYS_INLINE spanrsl() noexcept : m_ptr{nullptr}, m_size{0}, m_capacity{0}, m_offset{0} {}
+
+    spanrsl(spanrsl const&) = default;
+    spanrsl(spanrsl &&) = default;
+
+    spanrsl& operator= (spanrsl const&) = default;
+    spanrsl& operator= (spanrsl &&) = default;
+
+public:
+
+    C4_ALWAYS_INLINE spanrsl(T *p, I sz) noexcept : m_ptr{p}, m_size{sz}, m_capacity{sz}, m_offset{0} {}
+    C4_ALWAYS_INLINE void assign(T *p, I sz) noexcept { m_ptr = p; m_size = sz; m_capacity = sz; m_offset = 0; }
+
+    C4_ALWAYS_INLINE spanrsl(T *p, I sz, I cap) noexcept : m_ptr{p}, m_size{sz}, m_capacity{cap}, m_offset{0} {}
+    C4_ALWAYS_INLINE void assign(T *p, I sz, I cap) noexcept { m_ptr = p; m_size = sz; m_capacity = cap; m_offset = 0; }
+
+    C4_ALWAYS_INLINE spanrsl(T *p, I sz, I cap, I offs) noexcept : m_ptr{p}, m_size{sz}, m_capacity{cap}, m_offset{offs} {}
+    C4_ALWAYS_INLINE void assign(T *p, I sz, I cap, I offs) noexcept { m_ptr = p; m_size = sz; m_capacity = cap; m_offset = offs; }
+
+    template C4_ALWAYS_INLINE spanrsl(T (&arr)[N]) noexcept : m_ptr{arr}, m_size{N}, m_capacity{N}, m_offset{0} {}
+    template C4_ALWAYS_INLINE void assign(T (&arr)[N]) noexcept { m_ptr = arr; m_size = N; m_capacity = N; m_offset = 0; }
+
+    C4_ALWAYS_INLINE spanrsl(c4::aggregate_t, std::initializer_list il) noexcept : m_ptr{il.begin()}, m_size{il.size()}, m_capacity{il.size()}, m_offset{0} {}
+    C4_ALWAYS_INLINE void assign (c4::aggregate_t, std::initializer_list il) noexcept { m_ptr = il.begin(); m_size = il.size(); m_capacity = il.size(); m_offset = 0; }
+
+public:
+
+    C4_ALWAYS_INLINE I offset() const noexcept { return m_offset; }
+    C4_ALWAYS_INLINE I capacity() const noexcept { return m_capacity; }
+
+    C4_ALWAYS_INLINE void resize(I sz) C4_NOEXCEPT_A { C4_ASSERT(sz <= m_capacity); m_size = sz; }
+    C4_ALWAYS_INLINE void rtrim (I n ) C4_NOEXCEPT_A { C4_ASSERT(n >= 0 && n < m_size); m_size -= n; }
+    C4_ALWAYS_INLINE void ltrim (I n ) C4_NOEXCEPT_A { C4_ASSERT(n >= 0 && n < m_size); m_size -= n; m_ptr += n;
m_offset += n; m_capacity -= n; } + + /** recover the original span as an spanrsl */ + C4_ALWAYS_INLINE spanrsl original() const + { + return spanrsl(m_ptr - m_offset, m_capacity + m_offset, m_capacity + m_offset, 0); + } + /** recover the original span as a different span type. Example: spanrs<...> orig = s.original(); */ + template class OtherSpanType> + C4_ALWAYS_INLINE OtherSpanType original() + { + return OtherSpanType(m_ptr - m_offset, m_capacity + m_offset); + } +}; +template using cspanrsl = spanrsl; + + +} // namespace c4 + + +#endif /* _C4_SPAN_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/span.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/type_name.hpp +// https://github.com/biojppm/c4core/src/c4/type_name.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_TYPENAME_HPP_ +#define _C4_TYPENAME_HPP_ + +/** @file type_name.hpp compile-time type name */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/span.hpp +//#include "c4/span.hpp" +#if !defined(C4_SPAN_HPP_) && !defined(_C4_SPAN_HPP_) +#error "amalgamate: file c4/span.hpp must have been included at this point" +#endif /* C4_SPAN_HPP_ */ + + +/// @cond dev +struct _c4t +{ + const char *str; + size_t sz; + template + constexpr _c4t(const char (&s)[N]) : str(s), sz(N-1) {} // take off the \0 +}; +// this is a more abbreviated way of getting the type name +// (if we used span in the return type, the name would involve +// templates and would create longer type name strings, +// as well as larger differences between compilers) +template +C4_CONSTEXPR14 C4_ALWAYS_INLINE +_c4t _c4tn() +{ + auto p = _c4t(C4_PRETTY_FUNC); + return p; +} +/// @endcond + + +namespace c4 { + +/** compile-time type name + * @see http://stackoverflow.com/a/20170989/5875572 */ +template +C4_CONSTEXPR14 cspan type_name() +{ + const _c4t p = _c4tn(); + +#if (0) // _C4_THIS_IS_A_DEBUG_SCAFFOLD + for(size_t index = 0; index < p.sz; ++index) + { + printf(" %2c", p.str[index]); + } + printf("\n"); + for(size_t index = 0; index < p.sz; ++index) + { + printf(" %2d", (int)index); + } + printf("\n"); +#endif + +#if defined(_MSC_VER) +# if defined(__clang__) // Visual Studio has the clang toolset + // example: + // ..........................xxx. + // _c4t __cdecl _c4tn() [T = int] + enum : size_t { tstart = 26, tend = 1}; + +# elif defined(C4_MSVC_2015) || defined(C4_MSVC_2017) || defined(C4_MSVC_2019) || defined(C4_MSVC_2022) + // Note: subtract 7 at the end because the function terminates with ">(void)" in VS2015+ + cspan::size_type tstart = 26, tend = 7; + + const char *s = p.str + tstart; // look at the start + + // we're not using strcmp() or memcmp() to spare the #include + + // does it start with 'class '? + if(p.sz > 6 && s[0] == 'c' && s[1] == 'l' && s[2] == 'a' && s[3] == 's' && s[4] == 's' && s[5] == ' ') + { + tstart += 6; + } + // does it start with 'struct '? + else if(p.sz > 7 && s[0] == 's' && s[1] == 't' && s[2] == 'r' && s[3] == 'u' && s[4] == 'c' && s[5] == 't' && s[6] == ' ') + { + tstart += 7; + } + +# else + C4_NOT_IMPLEMENTED(); +# endif + +#elif defined(__ICC) + // example: + // ........................xxx. 
+ // "_c4t _c4tn() [with T = int]" + enum : size_t { tstart = 23, tend = 1}; + +#elif defined(__clang__) + // example: + // ...................xxx. + // "_c4t _c4tn() [T = int]" + enum : size_t { tstart = 18, tend = 1}; + +#elif defined(__GNUC__) + #if __GNUC__ >= 7 && C4_CPP >= 14 + // example: + // ..................................xxx. + // "constexpr _c4t _c4tn() [with T = int]" + enum : size_t { tstart = 33, tend = 1 }; + #else + // example: + // ........................xxx. + // "_c4t _c4tn() [with T = int]" + enum : size_t { tstart = 23, tend = 1 }; + #endif +#else + C4_NOT_IMPLEMENTED(); +#endif + + cspan o(p.str + tstart, p.sz - tstart - tend); + + return o; +} + +/** compile-time type name + * @overload */ +template +C4_CONSTEXPR14 C4_ALWAYS_INLINE cspan type_name(T const&) +{ + return type_name(); +} + +} // namespace c4 + +#endif //_C4_TYPENAME_HPP_ + + +// (end https://github.com/biojppm/c4core/src/c4/type_name.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/base64.hpp +// https://github.com/biojppm/c4core/src/c4/base64.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_BASE64_HPP_ +#define _C4_BASE64_HPP_ + +/** @file base64.hpp encoding/decoding for base64. + * @see https://en.wikipedia.org/wiki/Base64 + * @see https://www.base64encode.org/ + * */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/charconv.hpp +//#include "c4/charconv.hpp" +#if !defined(C4_CHARCONV_HPP_) && !defined(_C4_CHARCONV_HPP_) +#error "amalgamate: file c4/charconv.hpp must have been included at this point" +#endif /* C4_CHARCONV_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/blob.hpp +//#include "c4/blob.hpp" +#if !defined(C4_BLOB_HPP_) && !defined(_C4_BLOB_HPP_) +#error "amalgamate: file c4/blob.hpp must have been included at this point" +#endif /* C4_BLOB_HPP_ */ + + +namespace c4 { + +/** check that the given buffer is a valid base64 encoding + * @see https://en.wikipedia.org/wiki/Base64 */ +bool base64_valid(csubstr encoded); + +/** base64-encode binary data. + * @param encoded [out] output buffer for encoded data + * @param data [in] the input buffer with the binary data + * @return the number of bytes needed to return the output. No writes occur beyond the end of the output buffer. + * @see https://en.wikipedia.org/wiki/Base64 */ +size_t base64_encode(substr encoded, cblob data); + +/** decode the base64 encoding in the given buffer + * @param encoded [in] the encoded base64 + * @param data [out] the output buffer + * @return the number of bytes needed to return the output.. No writes occur beyond the end of the output buffer. 
+ * @see https://en.wikipedia.org/wiki/Base64 */ +size_t base64_decode(csubstr encoded, blob data); + + +namespace fmt { + +template +struct base64_wrapper_ +{ + blob_ data; + base64_wrapper_() : data() {} + base64_wrapper_(blob_ blob) : data(blob) {} +}; +using const_base64_wrapper = base64_wrapper_; +using base64_wrapper = base64_wrapper_; + + +/** mark a variable to be written in base64 format */ +template +C4_ALWAYS_INLINE const_base64_wrapper cbase64(Args const& C4_RESTRICT ...args) +{ + return const_base64_wrapper(cblob(args...)); +} +/** mark a csubstr to be written in base64 format */ +C4_ALWAYS_INLINE const_base64_wrapper cbase64(csubstr s) +{ + return const_base64_wrapper(cblob(s.str, s.len)); +} +/** mark a variable to be written in base64 format */ +template +C4_ALWAYS_INLINE const_base64_wrapper base64(Args const& C4_RESTRICT ...args) +{ + return const_base64_wrapper(cblob(args...)); +} +/** mark a csubstr to be written in base64 format */ +C4_ALWAYS_INLINE const_base64_wrapper base64(csubstr s) +{ + return const_base64_wrapper(cblob(s.str, s.len)); +} + +/** mark a variable to be read in base64 format */ +template +C4_ALWAYS_INLINE base64_wrapper base64(Args &... args) +{ + return base64_wrapper(blob(args...)); +} +/** mark a variable to be read in base64 format */ +C4_ALWAYS_INLINE base64_wrapper base64(substr s) +{ + return base64_wrapper(blob(s.str, s.len)); +} + +} // namespace fmt + + +/** write a variable in base64 format */ +inline size_t to_chars(substr buf, fmt::const_base64_wrapper b) +{ + return base64_encode(buf, b.data); +} + +/** read a variable in base64 format */ +inline size_t from_chars(csubstr buf, fmt::base64_wrapper *b) +{ + return base64_decode(buf, b->data); +} + +} // namespace c4 + +#endif /* _C4_BASE64_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/base64.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/std/string.hpp +// https://github.com/biojppm/c4core/src/c4/std/string.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_STD_STRING_HPP_ +#define _C4_STD_STRING_HPP_ + +/** @file string.hpp */ + +#ifndef C4CORE_SINGLE_HEADER +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/substr.hpp +//#include "c4/substr.hpp" +#if !defined(C4_SUBSTR_HPP_) && !defined(_C4_SUBSTR_HPP_) +#error "amalgamate: file c4/substr.hpp must have been included at this point" +#endif /* C4_SUBSTR_HPP_ */ + +#endif + +//included above: +//#include + +namespace c4 { + +//----------------------------------------------------------------------------- + +/** get a writeable view to an existing std::string. + * When the string is empty, the returned view will be pointing + * at the character with value '\0', but the size will be zero. + * @see https://en.cppreference.com/w/cpp/string/basic_string/operator_at + */ +C4_ALWAYS_INLINE c4::substr to_substr(std::string &s) noexcept +{ + #if C4_CPP < 11 + #error this function will do undefined behavior + #endif + // since c++11 it is legal to call s[s.size()]. + return c4::substr(&s[0], s.size()); +} + +/** get a readonly view to an existing std::string. + * When the string is empty, the returned view will be pointing + * at the character with value '\0', but the size will be zero. 
+ * @see https://en.cppreference.com/w/cpp/string/basic_string/operator_at + */ +C4_ALWAYS_INLINE c4::csubstr to_csubstr(std::string const& s) noexcept +{ + #if C4_CPP < 11 + #error this function will do undefined behavior + #endif + // since c++11 it is legal to call s[s.size()]. + return c4::csubstr(&s[0], s.size()); +} + +//----------------------------------------------------------------------------- + +C4_ALWAYS_INLINE bool operator== (c4::csubstr ss, std::string const& s) { return ss.compare(to_csubstr(s)) == 0; } +C4_ALWAYS_INLINE bool operator!= (c4::csubstr ss, std::string const& s) { return ss.compare(to_csubstr(s)) != 0; } +C4_ALWAYS_INLINE bool operator>= (c4::csubstr ss, std::string const& s) { return ss.compare(to_csubstr(s)) >= 0; } +C4_ALWAYS_INLINE bool operator> (c4::csubstr ss, std::string const& s) { return ss.compare(to_csubstr(s)) > 0; } +C4_ALWAYS_INLINE bool operator<= (c4::csubstr ss, std::string const& s) { return ss.compare(to_csubstr(s)) <= 0; } +C4_ALWAYS_INLINE bool operator< (c4::csubstr ss, std::string const& s) { return ss.compare(to_csubstr(s)) < 0; } + +C4_ALWAYS_INLINE bool operator== (std::string const& s, c4::csubstr ss) { return ss.compare(to_csubstr(s)) == 0; } +C4_ALWAYS_INLINE bool operator!= (std::string const& s, c4::csubstr ss) { return ss.compare(to_csubstr(s)) != 0; } +C4_ALWAYS_INLINE bool operator>= (std::string const& s, c4::csubstr ss) { return ss.compare(to_csubstr(s)) <= 0; } +C4_ALWAYS_INLINE bool operator> (std::string const& s, c4::csubstr ss) { return ss.compare(to_csubstr(s)) < 0; } +C4_ALWAYS_INLINE bool operator<= (std::string const& s, c4::csubstr ss) { return ss.compare(to_csubstr(s)) >= 0; } +C4_ALWAYS_INLINE bool operator< (std::string const& s, c4::csubstr ss) { return ss.compare(to_csubstr(s)) > 0; } + +//----------------------------------------------------------------------------- + +/** copy an std::string to a writeable string view */ +inline size_t to_chars(c4::substr buf, std::string const& s) +{ + C4_ASSERT(!buf.overlaps(to_csubstr(s))); + size_t len = buf.len < s.size() ? buf.len : s.size(); + // calling memcpy with null strings is undefined behavior + // and will wreak havoc in calling code's branches. + // see https://github.com/biojppm/rapidyaml/pull/264#issuecomment-1262133637 + if(len) + { + C4_ASSERT(s.data() != nullptr); + C4_ASSERT(buf.str != nullptr); + memcpy(buf.str, s.data(), len); + } + return s.size(); // return the number of needed chars +} + +/** copy a string view to an existing std::string */ +inline bool from_chars(c4::csubstr buf, std::string * s) +{ + s->resize(buf.len); + C4_ASSERT(!buf.overlaps(to_csubstr(*s))); + // calling memcpy with null strings is undefined behavior + // and will wreak havoc in calling code's branches. 
+    // see https://github.com/biojppm/rapidyaml/pull/264#issuecomment-1262133637
+    if(buf.len)
+    {
+        C4_ASSERT(buf.str != nullptr);
+        memcpy(&(*s)[0], buf.str, buf.len);
+    }
+    return true;
+}
+
+} // namespace c4
+
+#endif // _C4_STD_STRING_HPP_
+
+
+// (end https://github.com/biojppm/c4core/src/c4/std/string.hpp)
+
+
+
+//********************************************************************************
+//--------------------------------------------------------------------------------
+// src/c4/std/vector.hpp
+// https://github.com/biojppm/c4core/src/c4/std/vector.hpp
+//--------------------------------------------------------------------------------
+//********************************************************************************
+
+#ifndef _C4_STD_VECTOR_HPP_
+#define _C4_STD_VECTOR_HPP_
+
+/** @file vector.hpp provides conversion and comparison facilities
+ * from/between std::vector to c4::substr and c4::csubstr.
+ * @todo add to_span() and friends
+ */
+
+#ifndef C4CORE_SINGLE_HEADER
+// amalgamate: removed include of
+// https://github.com/biojppm/c4core/src/c4/substr.hpp
+//#include "c4/substr.hpp"
+#if !defined(C4_SUBSTR_HPP_) && !defined(_C4_SUBSTR_HPP_)
+#error "amalgamate: file c4/substr.hpp must have been included at this point"
+#endif /* C4_SUBSTR_HPP_ */
+
+#endif
+
+#include <vector>
+
+namespace c4 {
+
+//-----------------------------------------------------------------------------
+
+/** get a substr (writeable string view) of an existing std::vector */
+template<class Alloc>
+c4::substr to_substr(std::vector<char, Alloc> &vec)
+{
+    char *data = vec.empty() ? nullptr : vec.data(); // data() may or may not return a null pointer.
+    return c4::substr(data, vec.size());
+}
+
+/** get a csubstr (read-only string) view of an existing std::vector */
+template<class Alloc>
+c4::csubstr to_csubstr(std::vector<char, Alloc> const& vec)
+{
+    const char *data = vec.empty() ? nullptr : vec.data(); // data() may or may not return a null pointer.
+    return c4::csubstr(data, vec.size());
+}
+
+//-----------------------------------------------------------------------------
+// comparisons between substrings and std::vector
+
+template<class Alloc> C4_ALWAYS_INLINE bool operator!= (c4::csubstr ss, std::vector<char, Alloc> const& s) { return ss != to_csubstr(s); }
+template<class Alloc> C4_ALWAYS_INLINE bool operator== (c4::csubstr ss, std::vector<char, Alloc> const& s) { return ss == to_csubstr(s); }
+template<class Alloc> C4_ALWAYS_INLINE bool operator>= (c4::csubstr ss, std::vector<char, Alloc> const& s) { return ss >= to_csubstr(s); }
+template<class Alloc> C4_ALWAYS_INLINE bool operator>  (c4::csubstr ss, std::vector<char, Alloc> const& s) { return ss >  to_csubstr(s); }
+template<class Alloc> C4_ALWAYS_INLINE bool operator<= (c4::csubstr ss, std::vector<char, Alloc> const& s) { return ss <= to_csubstr(s); }
+template<class Alloc> C4_ALWAYS_INLINE bool operator<  (c4::csubstr ss, std::vector<char, Alloc> const& s) { return ss <  to_csubstr(s); }
+
+template<class Alloc> C4_ALWAYS_INLINE bool operator!= (std::vector<char, Alloc> const& s, c4::csubstr ss) { return ss != to_csubstr(s); }
+template<class Alloc> C4_ALWAYS_INLINE bool operator== (std::vector<char, Alloc> const& s, c4::csubstr ss) { return ss == to_csubstr(s); }
+template<class Alloc> C4_ALWAYS_INLINE bool operator>= (std::vector<char, Alloc> const& s, c4::csubstr ss) { return ss <= to_csubstr(s); }
+template<class Alloc> C4_ALWAYS_INLINE bool operator>  (std::vector<char, Alloc> const& s, c4::csubstr ss) { return ss <  to_csubstr(s); }
+template<class Alloc> C4_ALWAYS_INLINE bool operator<= (std::vector<char, Alloc> const& s, c4::csubstr ss) { return ss >= to_csubstr(s); }
+template<class Alloc> C4_ALWAYS_INLINE bool operator<  (std::vector<char, Alloc> const& s, c4::csubstr ss) { return ss >  to_csubstr(s); }
+
+//-----------------------------------------------------------------------------
+
+/** copy a std::vector to a writeable string view */
+template<class Alloc>
+inline size_t to_chars(c4::substr buf, std::vector<char, Alloc> const& s)
+{
+    C4_ASSERT(!buf.overlaps(to_csubstr(s)));
+    size_t len = buf.len < s.size() ? buf.len : s.size();
+    // calling memcpy with null strings is undefined behavior
+    // and will wreak havoc in calling code's branches.
+    // see https://github.com/biojppm/rapidyaml/pull/264#issuecomment-1262133637
+    if(len > 0)
+    {
+        memcpy(buf.str, s.data(), len);
+    }
+    return s.size(); // return the number of needed chars
+}
+
+/** copy a string view to an existing std::vector */
+template<class Alloc>
+inline bool from_chars(c4::csubstr buf, std::vector<char, Alloc> * s)
+{
+    s->resize(buf.len);
+    C4_ASSERT(!buf.overlaps(to_csubstr(*s)));
+    // calling memcpy with null strings is undefined behavior
+    // and will wreak havoc in calling code's branches.
+ // see https://github.com/biojppm/rapidyaml/pull/264#issuecomment-1262133637 + if(buf.len > 0) + { + memcpy(&(*s)[0], buf.str, buf.len); + } + return true; +} + +} // namespace c4 + +#endif // _C4_STD_VECTOR_HPP_ + + +// (end https://github.com/biojppm/c4core/src/c4/std/vector.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/std/tuple.hpp +// https://github.com/biojppm/c4core/src/c4/std/tuple.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_STD_TUPLE_HPP_ +#define _C4_STD_TUPLE_HPP_ + +/** @file tuple.hpp */ + +#ifndef C4CORE_SINGLE_HEADER +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/format.hpp +//#include "c4/format.hpp" +#if !defined(C4_FORMAT_HPP_) && !defined(_C4_FORMAT_HPP_) +#error "amalgamate: file c4/format.hpp must have been included at this point" +#endif /* C4_FORMAT_HPP_ */ + +#endif + +#include + +/** this is a work in progress */ +#undef C4_TUPLE_TO_CHARS + +namespace c4 { + +#ifdef C4_TUPLE_TO_CHARS +namespace detail { + +template< size_t Curr, class... Types > +struct tuple_helper +{ + static size_t do_cat(substr buf, std::tuple< Types... > const& tp) + { + size_t num = to_chars(buf, std::get(tp)); + buf = buf.len >= num ? buf.sub(num) : substr{}; + num += tuple_helper< Curr+1, Types... >::do_cat(buf, tp); + return num; + } + + static size_t do_uncat(csubstr buf, std::tuple< Types... > & tp) + { + size_t num = from_str_trim(buf, &std::get(tp)); + if(num == csubstr::npos) return csubstr::npos; + buf = buf.len >= num ? buf.sub(num) : substr{}; + num += tuple_helper< Curr+1, Types... >::do_uncat(buf, tp); + return num; + } + + template< class Sep > + static size_t do_catsep_more(substr buf, Sep const& sep, std::tuple< Types... > const& tp) + { + size_t ret = to_chars(buf, sep), num = ret; + buf = buf.len >= ret ? buf.sub(ret) : substr{}; + ret = to_chars(buf, std::get(tp)); + num += ret; + buf = buf.len >= ret ? buf.sub(ret) : substr{}; + ret = tuple_helper< Curr+1, Types... >::do_catsep_more(buf, sep, tp); + num += ret; + return num; + } + + template< class Sep > + static size_t do_uncatsep_more(csubstr buf, Sep & sep, std::tuple< Types... > & tp) + { + size_t ret = from_str_trim(buf, &sep), num = ret; + if(ret == csubstr::npos) return csubstr::npos; + buf = buf.len >= ret ? buf.sub(ret) : substr{}; + ret = from_str_trim(buf, &std::get(tp)); + if(ret == csubstr::npos) return csubstr::npos; + num += ret; + buf = buf.len >= ret ? buf.sub(ret) : substr{}; + ret = tuple_helper< Curr+1, Types... >::do_uncatsep_more(buf, sep, tp); + if(ret == csubstr::npos) return csubstr::npos; + num += ret; + return num; + } + + static size_t do_format(substr buf, csubstr fmt, std::tuple< Types... > const& tp) + { + auto pos = fmt.find("{}"); + if(pos != csubstr::npos) + { + size_t num = to_chars(buf, fmt.sub(0, pos)); + size_t out = num; + buf = buf.len >= num ? buf.sub(num) : substr{}; + num = to_chars(buf, std::get(tp)); + out += num; + buf = buf.len >= num ? buf.sub(num) : substr{}; + num = tuple_helper< Curr+1, Types... >::do_format(buf, fmt.sub(pos + 2), tp); + out += num; + return out; + } + else + { + return format(buf, fmt); + } + } + + static size_t do_unformat(csubstr buf, csubstr fmt, std::tuple< Types... 
> & tp) + { + auto pos = fmt.find("{}"); + if(pos != csubstr::npos) + { + size_t num = pos; + size_t out = num; + buf = buf.len >= num ? buf.sub(num) : substr{}; + num = from_str_trim(buf, &std::get(tp)); + out += num; + buf = buf.len >= num ? buf.sub(num) : substr{}; + num = tuple_helper< Curr+1, Types... >::do_unformat(buf, fmt.sub(pos + 2), tp); + out += num; + return out; + } + else + { + return tuple_helper< sizeof...(Types), Types... >::do_unformat(buf, fmt, tp); + } + } + +}; + +/** @todo VS compilation fails for this class */ +template< class... Types > +struct tuple_helper< sizeof...(Types), Types... > +{ + static size_t do_cat(substr /*buf*/, std::tuple const& /*tp*/) { return 0; } + static size_t do_uncat(csubstr /*buf*/, std::tuple & /*tp*/) { return 0; } + + template< class Sep > static size_t do_catsep_more(substr /*buf*/, Sep const& /*sep*/, std::tuple const& /*tp*/) { return 0; } + template< class Sep > static size_t do_uncatsep_more(csubstr /*buf*/, Sep & /*sep*/, std::tuple & /*tp*/) { return 0; } + + static size_t do_format(substr buf, csubstr fmt, std::tuple const& /*tp*/) + { + return to_chars(buf, fmt); + } + + static size_t do_unformat(csubstr buf, csubstr fmt, std::tuple const& /*tp*/) + { + return 0; + } +}; + +} // namespace detail + +template< class... Types > +inline size_t cat(substr buf, std::tuple< Types... > const& tp) +{ + return detail::tuple_helper< 0, Types... >::do_cat(buf, tp); +} + +template< class... Types > +inline size_t uncat(csubstr buf, std::tuple< Types... > & tp) +{ + return detail::tuple_helper< 0, Types... >::do_uncat(buf, tp); +} + +template< class Sep, class... Types > +inline size_t catsep(substr buf, Sep const& sep, std::tuple< Types... > const& tp) +{ + size_t num = to_chars(buf, std::cref(std::get<0>(tp))); + buf = buf.len >= num ? buf.sub(num) : substr{}; + num += detail::tuple_helper< 1, Types... >::do_catsep_more(buf, sep, tp); + return num; +} + +template< class Sep, class... Types > +inline size_t uncatsep(csubstr buf, Sep & sep, std::tuple< Types... > & tp) +{ + size_t ret = from_str_trim(buf, &std::get<0>(tp)), num = ret; + if(ret == csubstr::npos) return csubstr::npos; + buf = buf.len >= ret ? buf.sub(ret) : substr{}; + ret = detail::tuple_helper< 1, Types... >::do_uncatsep_more(buf, sep, tp); + if(ret == csubstr::npos) return csubstr::npos; + num += ret; + return num; +} + +template< class... Types > +inline size_t format(substr buf, csubstr fmt, std::tuple< Types... > const& tp) +{ + return detail::tuple_helper< 0, Types... >::do_format(buf, fmt, tp); +} + +template< class... Types > +inline size_t unformat(csubstr buf, csubstr fmt, std::tuple< Types... > & tp) +{ + return detail::tuple_helper< 0, Types... >::do_unformat(buf, fmt, tp); +} +#endif // C4_TUPLE_TO_CHARS + +} // namespace c4 + +#endif /* _C4_STD_TUPLE_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/std/tuple.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/ext/rng/rng.hpp +// https://github.com/biojppm/c4core/src/c4/ext/rng/rng.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +/* Copyright (c) 2018 Arvid Gerstmann. + * + * https://arvid.io/2018/07/02/better-cxx-prng/ + * + * This code is licensed under MIT license. 
*/ +#ifndef AG_RANDOM_H +#define AG_RANDOM_H + +//included above: +//#include +#include + + +namespace c4 { +namespace rng { + + +class splitmix +{ +public: + using result_type = uint32_t; + static constexpr result_type (min)() { return 0; } + static constexpr result_type (max)() { return UINT32_MAX; } + friend bool operator==(splitmix const &, splitmix const &); + friend bool operator!=(splitmix const &, splitmix const &); + + splitmix() : m_seed(1) {} + explicit splitmix(uint64_t s) : m_seed(s) {} + explicit splitmix(std::random_device &rd) + { + seed(rd); + } + + void seed(uint64_t s) { m_seed = s; } + void seed(std::random_device &rd) + { + m_seed = uint64_t(rd()) << 31 | uint64_t(rd()); + } + + result_type operator()() + { + uint64_t z = (m_seed += UINT64_C(0x9E3779B97F4A7C15)); + z = (z ^ (z >> 30)) * UINT64_C(0xBF58476D1CE4E5B9); + z = (z ^ (z >> 27)) * UINT64_C(0x94D049BB133111EB); + return result_type((z ^ (z >> 31)) >> 31); + } + + void discard(unsigned long long n) + { + for (unsigned long long i = 0; i < n; ++i) + operator()(); + } + +private: + uint64_t m_seed; +}; + +inline bool operator==(splitmix const &lhs, splitmix const &rhs) +{ + return lhs.m_seed == rhs.m_seed; +} +inline bool operator!=(splitmix const &lhs, splitmix const &rhs) +{ + return lhs.m_seed != rhs.m_seed; +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +class xorshift +{ +public: + using result_type = uint32_t; + static constexpr result_type (min)() { return 0; } + static constexpr result_type (max)() { return UINT32_MAX; } + friend bool operator==(xorshift const &, xorshift const &); + friend bool operator!=(xorshift const &, xorshift const &); + + xorshift() : m_seed(0xc1f651c67c62c6e0ull) {} + explicit xorshift(std::random_device &rd) + { + seed(rd); + } + + void seed(uint64_t s) { m_seed = s; } + void seed(std::random_device &rd) + { + m_seed = uint64_t(rd()) << 31 | uint64_t(rd()); + } + + result_type operator()() + { + uint64_t result = m_seed * 0xd989bcacc137dcd5ull; + m_seed ^= m_seed >> 11; + m_seed ^= m_seed << 31; + m_seed ^= m_seed >> 18; + return uint32_t(result >> 32ull); + } + + void discard(unsigned long long n) + { + for (unsigned long long i = 0; i < n; ++i) + operator()(); + } + +private: + uint64_t m_seed; +}; + +inline bool operator==(xorshift const &lhs, xorshift const &rhs) +{ + return lhs.m_seed == rhs.m_seed; +} +inline bool operator!=(xorshift const &lhs, xorshift const &rhs) +{ + return lhs.m_seed != rhs.m_seed; +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +class pcg +{ +public: + using result_type = uint32_t; + static constexpr result_type (min)() { return 0; } + static constexpr result_type (max)() { return UINT32_MAX; } + friend bool operator==(pcg const &, pcg const &); + friend bool operator!=(pcg const &, pcg const &); + + pcg() + : m_state(0x853c49e6748fea9bULL) + , m_inc(0xda3e39cb94b95bdbULL) + {} + explicit pcg(uint64_t s) { m_state = s; m_inc = m_state << 1; } + explicit pcg(std::random_device &rd) + { + seed(rd); + } + + void seed(uint64_t s) { m_state = s; } + void seed(std::random_device &rd) + { + uint64_t s0 = uint64_t(rd()) << 31 | uint64_t(rd()); + uint64_t s1 
= uint64_t(rd()) << 31 | uint64_t(rd()); + + m_state = 0; + m_inc = (s1 << 1) | 1; + (void)operator()(); + m_state += s0; + (void)operator()(); + } + + result_type operator()() + { + uint64_t oldstate = m_state; + m_state = oldstate * 6364136223846793005ULL + m_inc; + uint32_t xorshifted = uint32_t(((oldstate >> 18u) ^ oldstate) >> 27u); + //int rot = oldstate >> 59u; // the original. error? + int64_t rot = (int64_t)oldstate >> 59u; // error? + return (xorshifted >> rot) | (xorshifted << ((uint64_t)(-rot) & 31)); + } + + void discard(unsigned long long n) + { + for (unsigned long long i = 0; i < n; ++i) + operator()(); + } + +private: + uint64_t m_state; + uint64_t m_inc; +}; + +inline bool operator==(pcg const &lhs, pcg const &rhs) +{ + return lhs.m_state == rhs.m_state + && lhs.m_inc == rhs.m_inc; +} +inline bool operator!=(pcg const &lhs, pcg const &rhs) +{ + return lhs.m_state != rhs.m_state + || lhs.m_inc != rhs.m_inc; +} + +} // namespace rng +} // namespace c4 + +#endif /* AG_RANDOM_H */ + + +// (end https://github.com/biojppm/c4core/src/c4/ext/rng/rng.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/ext/sg14/inplace_function.h +// https://github.com/biojppm/c4core/src/c4/ext/sg14/inplace_function.h +//-------------------------------------------------------------------------------- +//******************************************************************************** + +/* + * Boost Software License - Version 1.0 - August 17th, 2003 + * + * Permission is hereby granted, free of charge, to any person or organization + * obtaining a copy of the software and accompanying documentation covered by + * this license (the "Software") to use, reproduce, display, distribute, + * execute, and transmit the Software, and to prepare derivative works of the + * Software, and to permit third-parties to whom the Software is furnished to + * do so, all subject to the following: + * + * The copyright notices in the Software and this entire statement, including + * the above license grant, this restriction and the following disclaimer, + * must be included in all copies of the Software, in whole or in part, and + * all derivative works of the Software, unless such copies or derivative + * works are solely in the form of machine-executable object code generated by + * a source language processor. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT + * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE + * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ +#ifndef _C4_EXT_SG14_INPLACE_FUNCTION_H_ +#define _C4_EXT_SG14_INPLACE_FUNCTION_H_ + +//included above: +//#include +//included above: +//#include +#include + +namespace stdext { + +namespace inplace_function_detail { + +static constexpr size_t InplaceFunctionDefaultCapacity = 32; + +#if defined(__GLIBCXX__) // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61458 +template +union aligned_storage_helper { + struct double1 { double a; }; + struct double4 { double a[4]; }; + template using maybe = typename std::conditional<(Cap >= sizeof(T)), T, char>::type; + char real_data[Cap]; + maybe a; + maybe b; + maybe c; + maybe d; + maybe e; + maybe f; + maybe g; + maybe h; +}; + +template>::value> +struct aligned_storage { + using type = typename std::aligned_storage::type; +}; +#else +using std::aligned_storage; +#endif + +template struct wrapper +{ + using type = T; +}; + +template struct vtable +{ + using storage_ptr_t = void*; + + using invoke_ptr_t = R(*)(storage_ptr_t, Args&&...); + using process_ptr_t = void(*)(storage_ptr_t, storage_ptr_t); + using destructor_ptr_t = void(*)(storage_ptr_t); + + const invoke_ptr_t invoke_ptr; + const process_ptr_t copy_ptr; + const process_ptr_t move_ptr; + const destructor_ptr_t destructor_ptr; + + explicit constexpr vtable() noexcept : + invoke_ptr{ [](storage_ptr_t, Args&&...) -> R + { throw std::bad_function_call(); } + }, + copy_ptr{ [](storage_ptr_t, storage_ptr_t) noexcept -> void {} }, + move_ptr{ [](storage_ptr_t, storage_ptr_t) noexcept -> void {} }, + destructor_ptr{ [](storage_ptr_t) noexcept -> void {} } + {} + + template explicit constexpr vtable(wrapper) noexcept : + invoke_ptr{ [](storage_ptr_t storage_ptr, Args&&... args) + noexcept(noexcept(std::declval()(args...))) -> R + { return (*static_cast(storage_ptr))( + std::forward(args)... + ); } + }, + copy_ptr{ [](storage_ptr_t dst_ptr, storage_ptr_t src_ptr) + noexcept(std::is_nothrow_copy_constructible::value) -> void + { new (dst_ptr) C{ (*static_cast(src_ptr)) }; } + }, + move_ptr{ [](storage_ptr_t dst_ptr, storage_ptr_t src_ptr) + noexcept(std::is_nothrow_move_constructible::value) -> void + { new (dst_ptr) C{ std::move(*static_cast(src_ptr)) }; } + }, + destructor_ptr{ [](storage_ptr_t storage_ptr) + noexcept -> void + { static_cast(storage_ptr)->~C(); } + } + {} + + vtable(const vtable&) = delete; + vtable(vtable&&) = delete; + + vtable& operator= (const vtable&) = delete; + vtable& operator= (vtable&&) = delete; + + ~vtable() = default; +}; + +template +struct is_valid_inplace_dst : std::true_type +{ + static_assert(DstCap >= SrcCap, + "Can't squeeze larger inplace_function into a smaller one" + ); + + static_assert(DstAlign % SrcAlign == 0, + "Incompatible inplace_function alignments" + ); +}; + +} // namespace inplace_function_detail + +template< + typename Signature, + size_t Capacity = inplace_function_detail::InplaceFunctionDefaultCapacity, + size_t Alignment = std::alignment_of::type>::value +> +class inplace_function; // unspecified + +template< + typename R, + typename... 
Args, + size_t Capacity, + size_t Alignment +> +class inplace_function +{ + static const constexpr inplace_function_detail::vtable empty_vtable{}; +public: + using capacity = std::integral_constant; + using alignment = std::integral_constant; + + using storage_t = typename inplace_function_detail::aligned_storage::type; + using vtable_t = inplace_function_detail::vtable; + using vtable_ptr_t = const vtable_t*; + + template friend class inplace_function; + + inplace_function() noexcept : + vtable_ptr_{std::addressof(empty_vtable)} + {} + + template< + typename T, + typename C = typename std::decay::type, + typename = typename std::enable_if< + !(std::is_same::value + || std::is_convertible::value) + >::type + > + inplace_function(T&& closure) + { +#if __cplusplus >= 201703L + static_assert(std::is_invocable_r::value, + "inplace_function cannot be constructed from non-callable type" + ); +#endif + static_assert(std::is_copy_constructible::value, + "inplace_function cannot be constructed from non-copyable type" + ); + + static_assert(sizeof(C) <= Capacity, + "inplace_function cannot be constructed from object with this (large) size" + ); + + static_assert(Alignment % std::alignment_of::value == 0, + "inplace_function cannot be constructed from object with this (large) alignment" + ); + + static const vtable_t vt{inplace_function_detail::wrapper{}}; + vtable_ptr_ = std::addressof(vt); + + new (std::addressof(storage_)) C{std::forward(closure)}; + } + + inplace_function(std::nullptr_t) noexcept : + vtable_ptr_{std::addressof(empty_vtable)} + {} + + inplace_function(const inplace_function& other) : + vtable_ptr_{other.vtable_ptr_} + { + vtable_ptr_->copy_ptr( + std::addressof(storage_), + std::addressof(other.storage_) + ); + } + + inplace_function(inplace_function&& other) : + vtable_ptr_{other.vtable_ptr_} + { + vtable_ptr_->move_ptr( + std::addressof(storage_), + std::addressof(other.storage_) + ); + } + + inplace_function& operator= (std::nullptr_t) noexcept + { + vtable_ptr_->destructor_ptr(std::addressof(storage_)); + vtable_ptr_ = std::addressof(empty_vtable); + return *this; + } + + inplace_function& operator= (const inplace_function& other) + { + if(this != std::addressof(other)) + { + vtable_ptr_->destructor_ptr(std::addressof(storage_)); + + vtable_ptr_ = other.vtable_ptr_; + vtable_ptr_->copy_ptr( + std::addressof(storage_), + std::addressof(other.storage_) + ); + } + return *this; + } + + inplace_function& operator= (inplace_function&& other) + { + if(this != std::addressof(other)) + { + vtable_ptr_->destructor_ptr(std::addressof(storage_)); + + vtable_ptr_ = other.vtable_ptr_; + vtable_ptr_->move_ptr( + std::addressof(storage_), + std::addressof(other.storage_) + ); + } + return *this; + } + + ~inplace_function() + { + vtable_ptr_->destructor_ptr(std::addressof(storage_)); + } + + R operator() (Args... args) const + { + return vtable_ptr_->invoke_ptr( + std::addressof(storage_), + std::forward(args)... 
+ ); + } + + constexpr bool operator== (std::nullptr_t) const noexcept + { + return !operator bool(); + } + + constexpr bool operator!= (std::nullptr_t) const noexcept + { + return operator bool(); + } + + explicit constexpr operator bool() const noexcept + { + return vtable_ptr_ != std::addressof(empty_vtable); + } + + template + operator inplace_function() const& + { + static_assert(inplace_function_detail::is_valid_inplace_dst< + Cap, Align, Capacity, Alignment + >::value, "conversion not allowed"); + + return {vtable_ptr_, vtable_ptr_->copy_ptr, std::addressof(storage_)}; + } + + template + operator inplace_function() && + { + static_assert(inplace_function_detail::is_valid_inplace_dst< + Cap, Align, Capacity, Alignment + >::value, "conversion not allowed"); + + return {vtable_ptr_, vtable_ptr_->move_ptr, std::addressof(storage_)}; + } + + void swap(inplace_function& other) + { + if (this == std::addressof(other)) return; + + storage_t tmp; + vtable_ptr_->move_ptr( + std::addressof(tmp), + std::addressof(storage_) + ); + vtable_ptr_->destructor_ptr(std::addressof(storage_)); + + other.vtable_ptr_->move_ptr( + std::addressof(storage_), + std::addressof(other.storage_) + ); + other.vtable_ptr_->destructor_ptr(std::addressof(other.storage_)); + + vtable_ptr_->move_ptr( + std::addressof(other.storage_), + std::addressof(tmp) + ); + vtable_ptr_->destructor_ptr(std::addressof(tmp)); + + std::swap(vtable_ptr_, other.vtable_ptr_); + } + +private: + vtable_ptr_t vtable_ptr_; + mutable storage_t storage_; + + inplace_function( + vtable_ptr_t vtable_ptr, + typename vtable_t::process_ptr_t process_ptr, + typename vtable_t::storage_ptr_t storage_ptr + ) : vtable_ptr_{vtable_ptr} + { + process_ptr(std::addressof(storage_), storage_ptr); + } +}; + +} // namespace stdext + +#endif /* _C4_EXT_SG14_INPLACE_FUNCTION_H_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/ext/sg14/inplace_function.h) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/language.cpp +// https://github.com/biojppm/c4core/src/c4/language.cpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifdef C4CORE_SINGLE_HDR_DEFINE_NOW +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/language.hpp +//#include "c4/language.hpp" +#if !defined(C4_LANGUAGE_HPP_) && !defined(_C4_LANGUAGE_HPP_) +#error "amalgamate: file c4/language.hpp must have been included at this point" +#endif /* C4_LANGUAGE_HPP_ */ + + +namespace c4 { +namespace detail { + +#ifndef __GNUC__ +void use_char_pointer(char const volatile* v) +{ + C4_UNUSED(v); +} +#else +void foo() {} // to avoid empty file warning from the linker +#endif + +} // namespace detail +} // namespace c4 + +#endif /* C4CORE_SINGLE_HDR_DEFINE_NOW */ + + +// (end https://github.com/biojppm/c4core/src/c4/language.cpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/format.cpp +// https://github.com/biojppm/c4core/src/c4/format.cpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifdef C4CORE_SINGLE_HDR_DEFINE_NOW +// amalgamate: removed include of +// 
https://github.com/biojppm/c4core/src/c4/format.hpp +//#include "c4/format.hpp" +#if !defined(C4_FORMAT_HPP_) && !defined(_C4_FORMAT_HPP_) +#error "amalgamate: file c4/format.hpp must have been included at this point" +#endif /* C4_FORMAT_HPP_ */ + + +//included above: +//#include // for std::align + +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wformat-nonliteral" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wformat-nonliteral" +#endif + +namespace c4 { + + +size_t to_chars(substr buf, fmt::const_raw_wrapper r) +{ + void * vptr = buf.str; + size_t space = buf.len; + auto ptr = (decltype(buf.str)) std::align(r.alignment, r.len, vptr, space); + if(ptr == nullptr) + { + // if it was not possible to align, return a conservative estimate + // of the required space + return r.alignment + r.len; + } + C4_CHECK(ptr >= buf.begin() && ptr <= buf.end()); + size_t sz = static_cast(ptr - buf.str) + r.len; + if(sz <= buf.len) + { + memcpy(ptr, r.buf, r.len); + } + return sz; +} + + +bool from_chars(csubstr buf, fmt::raw_wrapper *r) +{ + void * vptr = (void*)buf.str; + size_t space = buf.len; + auto ptr = (decltype(buf.str)) std::align(r->alignment, r->len, vptr, space); + C4_CHECK(ptr != nullptr); + C4_CHECK(ptr >= buf.begin() && ptr <= buf.end()); + //size_t dim = (ptr - buf.str) + r->len; + memcpy(r->buf, ptr, r->len); + return true; +} + + +} // namespace c4 + +#ifdef __clang__ +# pragma clang diagnostic pop +#elif defined(__GNUC__) +# pragma GCC diagnostic pop +#endif + +#endif /* C4CORE_SINGLE_HDR_DEFINE_NOW */ + + +// (end https://github.com/biojppm/c4core/src/c4/format.cpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/memory_util.cpp +// https://github.com/biojppm/c4core/src/c4/memory_util.cpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifdef C4CORE_SINGLE_HDR_DEFINE_NOW +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/memory_util.hpp +//#include "c4/memory_util.hpp" +#if !defined(C4_MEMORY_UTIL_HPP_) && !defined(_C4_MEMORY_UTIL_HPP_) +#error "amalgamate: file c4/memory_util.hpp must have been included at this point" +#endif /* C4_MEMORY_UTIL_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/error.hpp +//#include "c4/error.hpp" +#if !defined(C4_ERROR_HPP_) && !defined(_C4_ERROR_HPP_) +#error "amalgamate: file c4/error.hpp must have been included at this point" +#endif /* C4_ERROR_HPP_ */ + + +namespace c4 { + +/** Fills 'dest' with the first 'pattern_size' bytes at 'pattern', 'num_times'. */ +void mem_repeat(void* dest, void const* pattern, size_t pattern_size, size_t num_times) +{ + if(C4_UNLIKELY(num_times == 0)) + return; + C4_ASSERT( ! 
mem_overlaps(dest, pattern, num_times*pattern_size, pattern_size)); + char *begin = (char*)dest; + char *end = begin + num_times * pattern_size; + // copy the pattern once + ::memcpy(begin, pattern, pattern_size); + // now copy from dest to itself, doubling up every time + size_t n = pattern_size; + while(begin + 2*n < end) + { + ::memcpy(begin + n, begin, n); + n <<= 1; // double n + } + // copy the missing part + if(begin + n < end) + { + ::memcpy(begin + n, begin, static_cast(end - (begin + n))); + } +} + +} // namespace c4 + +#endif /* C4CORE_SINGLE_HDR_DEFINE_NOW */ + + +// (end https://github.com/biojppm/c4core/src/c4/memory_util.cpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/char_traits.cpp +// https://github.com/biojppm/c4core/src/c4/char_traits.cpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifdef C4CORE_SINGLE_HDR_DEFINE_NOW +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/char_traits.hpp +//#include "c4/char_traits.hpp" +#if !defined(C4_CHAR_TRAITS_HPP_) && !defined(_C4_CHAR_TRAITS_HPP_) +#error "amalgamate: file c4/char_traits.hpp must have been included at this point" +#endif /* C4_CHAR_TRAITS_HPP_ */ + + +namespace c4 { + +constexpr const char char_traits< char >::whitespace_chars[]; +constexpr const size_t char_traits< char >::num_whitespace_chars; +constexpr const wchar_t char_traits< wchar_t >::whitespace_chars[]; +constexpr const size_t char_traits< wchar_t >::num_whitespace_chars; + +} // namespace c4 + +#endif /* C4CORE_SINGLE_HDR_DEFINE_NOW */ + + +// (end https://github.com/biojppm/c4core/src/c4/char_traits.cpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/memory_resource.cpp +// https://github.com/biojppm/c4core/src/c4/memory_resource.cpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifdef C4CORE_SINGLE_HDR_DEFINE_NOW +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/memory_resource.hpp +//#include "c4/memory_resource.hpp" +#if !defined(C4_MEMORY_RESOURCE_HPP_) && !defined(_C4_MEMORY_RESOURCE_HPP_) +#error "amalgamate: file c4/memory_resource.hpp must have been included at this point" +#endif /* C4_MEMORY_RESOURCE_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/memory_util.hpp +//#include "c4/memory_util.hpp" +#if !defined(C4_MEMORY_UTIL_HPP_) && !defined(_C4_MEMORY_UTIL_HPP_) +#error "amalgamate: file c4/memory_util.hpp must have been included at this point" +#endif /* C4_MEMORY_UTIL_HPP_ */ + + +//included above: +//#include +//included above: +//#include +#if defined(C4_POSIX) || defined(C4_IOS) || defined(C4_MACOS) || defined(C4_ARM) +# include +#endif +#if defined(C4_ARM) +# include +#endif + +//included above: +//#include + +namespace c4 { + +namespace detail { + + +#ifdef C4_NO_ALLOC_DEFAULTS +aalloc_pfn s_aalloc = nullptr; +free_pfn s_afree = nullptr; +arealloc_pfn s_arealloc = nullptr; +#else + + +void afree_impl(void *ptr) +{ +#if defined(C4_WIN) || defined(C4_XBOX) + ::_aligned_free(ptr); +#else + ::free(ptr); +#endif +} + + 
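+// Usage sketch (illustrative only, not part of c4core): the default
+// implementations in this #else branch are installed into the function
+// pointers s_aalloc/s_afree/s_arealloc below, and can be swapped at runtime
+// through set_aalloc(), set_afree() and set_arealloc() (defined further down
+// in this file). A hypothetical counting wrapper could look like this:
+//
+//     static c4::aalloc_pfn g_prev_aalloc = nullptr; // hypothetical globals
+//     static size_t g_num_allocs = 0;
+//     static void* counting_aalloc(size_t sz, size_t alignment)
+//     {
+//         ++g_num_allocs;
+//         return g_prev_aalloc(sz, alignment); // forward to the previous allocator
+//     }
+//     // at program startup:
+//     //     g_prev_aalloc = c4::get_aalloc();
+//     //     c4::set_aalloc(&counting_aalloc);
+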
+void* aalloc_impl(size_t size, size_t alignment)
+{
+    void *mem;
+#if defined(C4_WIN) || defined(C4_XBOX)
+    mem = ::_aligned_malloc(size, alignment);
+    C4_CHECK(mem != nullptr || size == 0);
+#elif defined(C4_ARM)
+    // https://stackoverflow.com/questions/53614538/undefined-reference-to-posix-memalign-in-arm-gcc
+    // https://electronics.stackexchange.com/questions/467382/e2-studio-undefined-reference-to-posix-memalign/467753
+    mem = memalign(alignment, size);
+    C4_CHECK(mem != nullptr || size == 0);
+#elif defined(C4_POSIX) || defined(C4_IOS) || defined(C4_MACOS)
+    // NOTE: the alignment must be a multiple of sizeof(void*)
+    size_t amult = alignment;
+    if(C4_UNLIKELY(alignment < sizeof(void*)))
+    {
+        amult = sizeof(void*);
+    }
+    int ret = ::posix_memalign(&mem, amult, size);
+    if(C4_UNLIKELY(ret))
+    {
+        if(ret == EINVAL)
+        {
+            C4_ERROR("The alignment argument %zu was not a power of two, "
+                     "or was not a multiple of sizeof(void*)", alignment);
+        }
+        else if(ret == ENOMEM)
+        {
+            C4_ERROR("There was insufficient memory to fulfill the "
+                     "allocation request of %zu bytes (alignment=%zu)", size, alignment);
+        }
+        return nullptr;
+    }
+#else
+    C4_NOT_IMPLEMENTED_MSG("need to implement an aligned allocation for this platform");
+#endif
+    C4_ASSERT_MSG((uintptr_t(mem) & (alignment-1)) == 0, "address %p is not aligned to %zu boundary", mem, alignment);
+    return mem;
+}
+
+
+void* arealloc_impl(void* ptr, size_t oldsz, size_t newsz, size_t alignment)
+{
+    /** @todo make this more efficient
+     * @see https://stackoverflow.com/questions/9078259/does-realloc-keep-the-memory-alignment-of-posix-memalign
+     * @see look for qReallocAligned() in http://code.qt.io/cgit/qt/qtbase.git/tree/src/corelib/global/qmalloc.cpp
+     */
+    void *tmp = aalloc(newsz, alignment);
+    size_t min = newsz < oldsz ?
newsz : oldsz; + if(mem_overlaps(ptr, tmp, oldsz, newsz)) + { + ::memmove(tmp, ptr, min); + } + else + { + ::memcpy(tmp, ptr, min); + } + afree(ptr); + return tmp; +} + +aalloc_pfn s_aalloc = aalloc_impl; +afree_pfn s_afree = afree_impl; +arealloc_pfn s_arealloc = arealloc_impl; + +#endif // C4_NO_ALLOC_DEFAULTS + +} // namespace detail + + +aalloc_pfn get_aalloc() +{ + return detail::s_aalloc; +} +void set_aalloc(aalloc_pfn fn) +{ + detail::s_aalloc = fn; +} + +afree_pfn get_afree() +{ + return detail::s_afree; +} +void set_afree(afree_pfn fn) +{ + detail::s_afree = fn; +} + +arealloc_pfn get_arealloc() +{ + return detail::s_arealloc; +} +void set_arealloc(arealloc_pfn fn) +{ + detail::s_arealloc = fn; +} + + +void* aalloc(size_t sz, size_t alignment) +{ + C4_ASSERT_MSG(c4::get_aalloc() != nullptr, "did you forget to call set_aalloc()?"); + auto fn = c4::get_aalloc(); + void* ptr = fn(sz, alignment); + return ptr; +} + +void afree(void* ptr) +{ + C4_ASSERT_MSG(c4::get_afree() != nullptr, "did you forget to call set_afree()?"); + auto fn = c4::get_afree(); + fn(ptr); +} + +void* arealloc(void *ptr, size_t oldsz, size_t newsz, size_t alignment) +{ + C4_ASSERT_MSG(c4::get_arealloc() != nullptr, "did you forget to call set_arealloc()?"); + auto fn = c4::get_arealloc(); + void* nptr = fn(ptr, oldsz, newsz, alignment); + return nptr; +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +void detail::_MemoryResourceSingleChunk::release() +{ + if(m_mem && m_owner) + { + impl_type::deallocate(m_mem, m_size); + } + m_mem = nullptr; + m_size = 0; + m_owner = false; + m_pos = 0; +} + +void detail::_MemoryResourceSingleChunk::acquire(size_t sz) +{ + clear(); + m_owner = true; + m_mem = (char*) impl_type::allocate(sz, alignof(max_align_t)); + m_size = sz; + m_pos = 0; +} + +void detail::_MemoryResourceSingleChunk::acquire(void *mem, size_t sz) +{ + clear(); + m_owner = false; + m_mem = (char*) mem; + m_size = sz; + m_pos = 0; +} + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +void* MemoryResourceLinear::do_allocate(size_t sz, size_t alignment, void *hint) +{ + C4_UNUSED(hint); + if(sz == 0) return nullptr; + // make sure there's enough room to allocate + if(m_pos + sz > m_size) + { + C4_ERROR("out of memory"); + return nullptr; + } + void *mem = m_mem + m_pos; + size_t space = m_size - m_pos; + if(std::align(alignment, sz, mem, space)) + { + C4_ASSERT(m_pos <= m_size); + C4_ASSERT(m_size - m_pos >= space); + m_pos += (m_size - m_pos) - space; + m_pos += sz; + C4_ASSERT(m_pos <= m_size); + } + else + { + C4_ERROR("could not align memory"); + mem = nullptr; + } + return mem; +} + +void MemoryResourceLinear::do_deallocate(void* ptr, size_t sz, size_t alignment) +{ + C4_UNUSED(ptr); + C4_UNUSED(sz); + C4_UNUSED(alignment); + // nothing to do!! +} + +void* MemoryResourceLinear::do_reallocate(void* ptr, size_t oldsz, size_t newsz, size_t alignment) +{ + if(newsz == oldsz) return ptr; + // is ptr the most recently allocated (MRA) block? 
+    char *cptr = (char*)ptr;
+    bool same_pos = (m_mem + m_pos == cptr + oldsz);
+    // no need to get more memory when shrinking
+    if(newsz < oldsz)
+    {
+        // if this is the MRA, we can safely shrink the position
+        if(same_pos)
+        {
+            m_pos -= oldsz - newsz;
+        }
+        return ptr;
+    }
+    // we're growing the block, and it fits in the remaining space
+    else if(same_pos && cptr + newsz <= m_mem + m_size)
+    {
+        // if this is the MRA, we can safely grow the position
+        m_pos += newsz - oldsz;
+        return ptr;
+    }
+    // we're growing a block which is not the MRA, or it doesn't fit -
+    // delegate any of these situations to do_allocate()
+    return do_allocate(newsz, alignment, ptr);
+}
+
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+/** @todo add a free list allocator. A good candidate because of its
+ * small size is TLSF.
+ *
+ * @see https://github.com/mattconte/tlsf
+ *
+ * Comparisons:
+ *
+ * @see https://www.researchgate.net/publication/262375150_A_Comparative_Study_on_Memory_Allocators_in_Multicore_and_Multithreaded_Applications_-_SBESC_2011_-_Presentation_Slides
+ * @see http://webkit.sed.hu/blog/20100324/war-allocators-tlsf-action
+ * @see https://github.com/emeryberger/Malloc-Implementations/tree/master/allocators
+ *
+ * */
+
+} // namespace c4
+
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+#ifdef C4_REDEFINE_CPPNEW
+#include <new> // for std::nothrow_t
+void* operator new(size_t size)
+{
+    auto *mr = ::c4::get_memory_resource();
+    return mr->allocate(size);
+}
+void operator delete(void *p) noexcept
+{
+    C4_NEVER_REACH();
+}
+void operator delete(void *p, size_t size)
+{
+    auto *mr = ::c4::get_memory_resource();
+    mr->deallocate(p, size);
+}
+void* operator new[](size_t size)
+{
+    return operator new(size);
+}
+void operator delete[](void *p) noexcept
+{
+    operator delete(p);
+}
+void operator delete[](void *p, size_t size)
+{
+    operator delete(p, size);
+}
+void* operator new(size_t size, std::nothrow_t)
+{
+    return operator new(size);
+}
+void operator delete(void *p, std::nothrow_t)
+{
+    operator delete(p);
+}
+void operator delete(void *p, size_t size, std::nothrow_t)
+{
+    operator delete(p, size);
+}
+void* operator new[](size_t size, std::nothrow_t)
+{
+    return operator new(size);
+}
+void operator delete[](void *p, std::nothrow_t)
+{
+    operator delete(p);
+}
+void operator delete[](void *p, size_t size, std::nothrow_t)
+{
+    operator delete(p, size);
+}
+#endif // C4_REDEFINE_CPPNEW
+
+#endif /* C4CORE_SINGLE_HDR_DEFINE_NOW */
+
+
+// (end https://github.com/biojppm/c4core/src/c4/memory_resource.cpp)
+
+
+
+//********************************************************************************
+//--------------------------------------------------------------------------------
+// src/c4/utf.cpp
+// https://github.com/biojppm/c4core/src/c4/utf.cpp
+//--------------------------------------------------------------------------------
+//********************************************************************************
+
+#ifdef C4CORE_SINGLE_HDR_DEFINE_NOW
+// amalgamate: removed include of
+// https://github.com/biojppm/c4core/src/c4/utf.hpp
+//#include "c4/utf.hpp"
+#if !defined(C4_UTF_HPP_) && !defined(_C4_UTF_HPP_)
+#error "amalgamate: file c4/utf.hpp must have been
included at this point" +#endif /* C4_UTF_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/charconv.hpp +//#include "c4/charconv.hpp" +#if !defined(C4_CHARCONV_HPP_) && !defined(_C4_CHARCONV_HPP_) +#error "amalgamate: file c4/charconv.hpp must have been included at this point" +#endif /* C4_CHARCONV_HPP_ */ + + +namespace c4 { + +size_t decode_code_point(uint8_t *C4_RESTRICT buf, size_t buflen, const uint32_t code) +{ + C4_UNUSED(buflen); + C4_ASSERT(buflen >= 4); + if (code <= UINT32_C(0x7f)) + { + buf[0] = (uint8_t)code; + return 1u; + } + else if(code <= UINT32_C(0x7ff)) + { + buf[0] = (uint8_t)(UINT32_C(0xc0) | (code >> 6)); /* 110xxxxx */ + buf[1] = (uint8_t)(UINT32_C(0x80) | (code & UINT32_C(0x3f))); /* 10xxxxxx */ + return 2u; + } + else if(code <= UINT32_C(0xffff)) + { + buf[0] = (uint8_t)(UINT32_C(0xe0) | ((code >> 12))); /* 1110xxxx */ + buf[1] = (uint8_t)(UINT32_C(0x80) | ((code >> 6) & UINT32_C(0x3f))); /* 10xxxxxx */ + buf[2] = (uint8_t)(UINT32_C(0x80) | ((code ) & UINT32_C(0x3f))); /* 10xxxxxx */ + return 3u; + } + else if(code <= UINT32_C(0x10ffff)) + { + buf[0] = (uint8_t)(UINT32_C(0xf0) | ((code >> 18))); /* 11110xxx */ + buf[1] = (uint8_t)(UINT32_C(0x80) | ((code >> 12) & UINT32_C(0x3f))); /* 10xxxxxx */ + buf[2] = (uint8_t)(UINT32_C(0x80) | ((code >> 6) & UINT32_C(0x3f))); /* 10xxxxxx */ + buf[3] = (uint8_t)(UINT32_C(0x80) | ((code ) & UINT32_C(0x3f))); /* 10xxxxxx */ + return 4u; + } + return 0; +} + +substr decode_code_point(substr out, csubstr code_point) +{ + C4_ASSERT(out.len >= 4); + C4_ASSERT(!code_point.begins_with("U+")); + C4_ASSERT(!code_point.begins_with("\\x")); + C4_ASSERT(!code_point.begins_with("\\u")); + C4_ASSERT(!code_point.begins_with("\\U")); + C4_ASSERT(!code_point.begins_with('0')); + C4_ASSERT(code_point.len <= 8); + C4_ASSERT(code_point.len > 0); + uint32_t code_point_val; + C4_CHECK(read_hex(code_point, &code_point_val)); + size_t ret = decode_code_point((uint8_t*)out.str, out.len, code_point_val); + C4_ASSERT(ret <= 4); + return out.first(ret); +} + +} // namespace c4 + +#endif /* C4CORE_SINGLE_HDR_DEFINE_NOW */ + + +// (end https://github.com/biojppm/c4core/src/c4/utf.cpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/base64.cpp +// https://github.com/biojppm/c4core/src/c4/base64.cpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifdef C4CORE_SINGLE_HDR_DEFINE_NOW +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/base64.hpp +//#include "c4/base64.hpp" +#if !defined(C4_BASE64_HPP_) && !defined(_C4_BASE64_HPP_) +#error "amalgamate: file c4/base64.hpp must have been included at this point" +#endif /* C4_BASE64_HPP_ */ + + +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wchar-subscripts" // array subscript is of type 'char' +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wchar-subscripts" +# pragma GCC diagnostic ignored "-Wtype-limits" +#endif + +namespace c4 { + +namespace detail { + +constexpr static const char base64_sextet_to_char_[64] = { + /* 0/ 65*/ 'A', /* 1/ 66*/ 'B', /* 2/ 67*/ 'C', /* 3/ 68*/ 'D', + /* 4/ 69*/ 'E', /* 5/ 70*/ 'F', /* 6/ 71*/ 'G', /* 7/ 72*/ 'H', + /* 8/ 73*/ 'I', /* 9/ 74*/ 'J', /*10/ 75*/ 'K', /*11/ 74*/ 'L', 
+    /*12/ 77*/ 'M', /*13/ 78*/ 'N', /*14/ 79*/ 'O', /*15/ 80*/ 'P',
+    /*16/ 81*/ 'Q', /*17/ 82*/ 'R', /*18/ 83*/ 'S', /*19/ 84*/ 'T',
+    /*20/ 85*/ 'U', /*21/ 86*/ 'V', /*22/ 87*/ 'W', /*23/ 88*/ 'X',
+    /*24/ 89*/ 'Y', /*25/ 90*/ 'Z', /*26/ 97*/ 'a', /*27/ 98*/ 'b',
+    /*28/ 99*/ 'c', /*29/100*/ 'd', /*30/101*/ 'e', /*31/102*/ 'f',
+    /*32/103*/ 'g', /*33/104*/ 'h', /*34/105*/ 'i', /*35/106*/ 'j',
+    /*36/107*/ 'k', /*37/108*/ 'l', /*38/109*/ 'm', /*39/110*/ 'n',
+    /*40/111*/ 'o', /*41/112*/ 'p', /*42/113*/ 'q', /*43/114*/ 'r',
+    /*44/115*/ 's', /*45/116*/ 't', /*46/117*/ 'u', /*47/118*/ 'v',
+    /*48/119*/ 'w', /*49/120*/ 'x', /*50/121*/ 'y', /*51/122*/ 'z',
+    /*52/ 48*/ '0', /*53/ 49*/ '1', /*54/ 50*/ '2', /*55/ 51*/ '3',
+    /*56/ 52*/ '4', /*57/ 53*/ '5', /*58/ 54*/ '6', /*59/ 55*/ '7',
+    /*60/ 56*/ '8', /*61/ 57*/ '9', /*62/ 43*/ '+', /*63/ 47*/ '/',
+};
+
+// https://www.cs.cmu.edu/~pattis/15-1XX/common/handouts/ascii.html
+constexpr static const char base64_char_to_sextet_[128] = {
+    #define __ char(-1) // undefined below
+    /*  0 NUL*/ __, /*  1 SOH*/ __, /*  2 STX*/ __, /*  3 ETX*/ __,
+    /*  4 EOT*/ __, /*  5 ENQ*/ __, /*  6 ACK*/ __, /*  7 BEL*/ __,
+    /*  8 BS */ __, /*  9 TAB*/ __, /* 10 LF */ __, /* 11 VT */ __,
+    /* 12 FF */ __, /* 13 CR */ __, /* 14 SO */ __, /* 15 SI */ __,
+    /* 16 DLE*/ __, /* 17 DC1*/ __, /* 18 DC2*/ __, /* 19 DC3*/ __,
+    /* 20 DC4*/ __, /* 21 NAK*/ __, /* 22 SYN*/ __, /* 23 ETB*/ __,
+    /* 24 CAN*/ __, /* 25 EM */ __, /* 26 SUB*/ __, /* 27 ESC*/ __,
+    /* 28 FS */ __, /* 29 GS */ __, /* 30 RS */ __, /* 31 US */ __,
+    /* 32 SPC*/ __, /* 33 ! */ __, /* 34 " */ __, /* 35 # */ __,
+    /* 36 $ */ __, /* 37 % */ __, /* 38 & */ __, /* 39 ' */ __,
+    /* 40 ( */ __, /* 41 ) */ __, /* 42 * */ __, /* 43 + */ 62,
+    /* 44 , */ __, /* 45 - */ __, /* 46 . */ __, /* 47 / */ 63,
+    /* 48 0 */ 52, /* 49 1 */ 53, /* 50 2 */ 54, /* 51 3 */ 55,
+    /* 52 4 */ 56, /* 53 5 */ 57, /* 54 6 */ 58, /* 55 7 */ 59,
+    /* 56 8 */ 60, /* 57 9 */ 61, /* 58 : */ __, /* 59 ; */ __,
+    /* 60 < */ __, /* 61 = */ __, /* 62 > */ __, /* 63 ?
*/ __, + /* 64 @ */ __, /* 65 A */ 0, /* 66 B */ 1, /* 67 C */ 2, + /* 68 D */ 3, /* 69 E */ 4, /* 70 F */ 5, /* 71 G */ 6, + /* 72 H */ 7, /* 73 I */ 8, /* 74 J */ 9, /* 75 K */ 10, + /* 76 L */ 11, /* 77 M */ 12, /* 78 N */ 13, /* 79 O */ 14, + /* 80 P */ 15, /* 81 Q */ 16, /* 82 R */ 17, /* 83 S */ 18, + /* 84 T */ 19, /* 85 U */ 20, /* 86 V */ 21, /* 87 W */ 22, + /* 88 X */ 23, /* 89 Y */ 24, /* 90 Z */ 25, /* 91 [ */ __, + /* 92 \ */ __, /* 93 ] */ __, /* 94 ^ */ __, /* 95 _ */ __, + /* 96 ` */ __, /* 97 a */ 26, /* 98 b */ 27, /* 99 c */ 28, + /*100 d */ 29, /*101 e */ 30, /*102 f */ 31, /*103 g */ 32, + /*104 h */ 33, /*105 i */ 34, /*106 j */ 35, /*107 k */ 36, + /*108 l */ 37, /*109 m */ 38, /*110 n */ 39, /*111 o */ 40, + /*112 p */ 41, /*113 q */ 42, /*114 r */ 43, /*115 s */ 44, + /*116 t */ 45, /*117 u */ 46, /*118 v */ 47, /*119 w */ 48, + /*120 x */ 49, /*121 y */ 50, /*122 z */ 51, /*123 { */ __, + /*124 | */ __, /*125 } */ __, /*126 ~ */ __, /*127 DEL*/ __, + #undef __ +}; + +#ifndef NDEBUG +void base64_test_tables() +{ + for(size_t i = 0; i < C4_COUNTOF(detail::base64_sextet_to_char_); ++i) + { + char s2c = base64_sextet_to_char_[i]; + char c2s = base64_char_to_sextet_[(int)s2c]; + C4_CHECK((size_t)c2s == i); + } + for(size_t i = 0; i < C4_COUNTOF(detail::base64_char_to_sextet_); ++i) + { + char c2s = base64_char_to_sextet_[i]; + if(c2s == char(-1)) + continue; + char s2c = base64_sextet_to_char_[(int)c2s]; + C4_CHECK((size_t)s2c == i); + } +} +#endif +} // namespace detail + + +bool base64_valid(csubstr encoded) +{ + if(encoded.len % 4) return false; + for(const char c : encoded) + { + if(c < 0/* || c >= 128*/) + return false; + if(c == '=') + continue; + if(detail::base64_char_to_sextet_[c] == char(-1)) + return false; + } + return true; +} + + +size_t base64_encode(substr buf, cblob data) +{ + #define c4append_(c) { if(pos < buf.len) { buf.str[pos] = (c); } ++pos; } + #define c4append_idx_(char_idx) \ + {\ + C4_XASSERT((char_idx) < sizeof(detail::base64_sextet_to_char_));\ + c4append_(detail::base64_sextet_to_char_[(char_idx)]);\ + } + + size_t rem, pos = 0; + constexpr const uint32_t sextet_mask = uint32_t(1 << 6) - 1; + const unsigned char *C4_RESTRICT d = (unsigned char *) data.buf; // cast to unsigned to avoid wrapping high-bits + for(rem = data.len; rem >= 3; rem -= 3, d += 3) + { + const uint32_t val = ((uint32_t(d[0]) << 16) | (uint32_t(d[1]) << 8) | (uint32_t(d[2]))); + c4append_idx_((val >> 18) & sextet_mask); + c4append_idx_((val >> 12) & sextet_mask); + c4append_idx_((val >> 6) & sextet_mask); + c4append_idx_((val ) & sextet_mask); + } + C4_ASSERT(rem < 3); + if(rem == 2) + { + const uint32_t val = ((uint32_t(d[0]) << 16) | (uint32_t(d[1]) << 8)); + c4append_idx_((val >> 18) & sextet_mask); + c4append_idx_((val >> 12) & sextet_mask); + c4append_idx_((val >> 6) & sextet_mask); + c4append_('='); + } + else if(rem == 1) + { + const uint32_t val = ((uint32_t(d[0]) << 16)); + c4append_idx_((val >> 18) & sextet_mask); + c4append_idx_((val >> 12) & sextet_mask); + c4append_('='); + c4append_('='); + } + return pos; + + #undef c4append_ + #undef c4append_idx_ +} + + +size_t base64_decode(csubstr encoded, blob data) +{ + #define c4append_(c) { if(wpos < data.len) { data.buf[wpos] = static_cast(c); } ++wpos; } + #define c4appendval_(c, shift)\ + {\ + C4_XASSERT(c >= 0);\ + C4_XASSERT(size_t(c) < sizeof(detail::base64_char_to_sextet_));\ + val |= static_cast(detail::base64_char_to_sextet_[(c)]) << ((shift) * 6);\ + } + + C4_ASSERT(base64_valid(encoded)); + 
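+    // each input quartet decodes to at most 3 output bytes. wpos keeps counting
+    // past data.len, so calling with an undersized (or empty) blob still
+    // returns the total number of bytes needed to hold the decoded data.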
C4_CHECK(encoded.len % 4 == 0); + size_t wpos = 0; // the write position + const char *C4_RESTRICT d = encoded.str; + constexpr const uint32_t full_byte = 0xff; + // process every quartet of input 6 bits --> triplet of output bytes + for(size_t rpos = 0; rpos < encoded.len; rpos += 4, d += 4) + { + if(d[2] == '=' || d[3] == '=') // skip the last quartet if it is padded + { + C4_ASSERT(d + 4 == encoded.str + encoded.len); + break; + } + uint32_t val = 0; + c4appendval_(d[3], 0); + c4appendval_(d[2], 1); + c4appendval_(d[1], 2); + c4appendval_(d[0], 3); + c4append_((val >> (2 * 8)) & full_byte); + c4append_((val >> (1 * 8)) & full_byte); + c4append_((val ) & full_byte); + } + // deal with the last quartet when it is padded + if(d == encoded.str + encoded.len) + return wpos; + if(d[2] == '=') // 2 padding chars + { + C4_ASSERT(d + 4 == encoded.str + encoded.len); + C4_ASSERT(d[3] == '='); + uint32_t val = 0; + c4appendval_(d[1], 2); + c4appendval_(d[0], 3); + c4append_((val >> (2 * 8)) & full_byte); + } + else if(d[3] == '=') // 1 padding char + { + C4_ASSERT(d + 4 == encoded.str + encoded.len); + uint32_t val = 0; + c4appendval_(d[2], 1); + c4appendval_(d[1], 2); + c4appendval_(d[0], 3); + c4append_((val >> (2 * 8)) & full_byte); + c4append_((val >> (1 * 8)) & full_byte); + } + return wpos; + #undef c4append_ + #undef c4appendval_ +} + +} // namespace c4 + +#ifdef __clang__ +# pragma clang diagnostic pop +#elif defined(__GNUC__) +# pragma GCC diagnostic pop +#endif + +#endif /* C4CORE_SINGLE_HDR_DEFINE_NOW */ + + +// (end https://github.com/biojppm/c4core/src/c4/base64.cpp) + +#define C4_WINDOWS_POP_HPP_ + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/windows_push.hpp +// https://github.com/biojppm/c4core/src/c4/windows_push.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_WINDOWS_PUSH_HPP_ +#define _C4_WINDOWS_PUSH_HPP_ + +/** @file windows_push.hpp sets up macros to include windows header files + * without pulling in all of + * + * @see #include windows_pop.hpp to undefine these macros + * + * @see https://aras-p.info/blog/2018/01/12/Minimizing-windows.h/ */ + + +#if defined(_WIN64) || defined(_WIN32) + +#if defined(_M_AMD64) +# ifndef _AMD64_ +# define _c4_AMD64_ +# define _AMD64_ +# endif +#elif defined(_M_IX86) +# ifndef _X86_ +# define _c4_X86_ +# define _X86_ +# endif +#elif defined(_M_ARM64) +# ifndef _ARM64_ +# define _c4_ARM64_ +# define _ARM64_ +# endif +#elif defined(_M_ARM) +# ifndef _ARM_ +# define _c4_ARM_ +# define _ARM_ +# endif +#endif + +#ifndef NOMINMAX +# define _c4_NOMINMAX +# define NOMINMAX +#endif + +#ifndef NOGDI +# define _c4_NOGDI +# define NOGDI +#endif + +#ifndef VC_EXTRALEAN +# define _c4_VC_EXTRALEAN +# define VC_EXTRALEAN +#endif + +#ifndef WIN32_LEAN_AND_MEAN +# define _c4_WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +#endif + +/* If defined, the following flags inhibit definition + * of the indicated items. 
+ * + * NOGDICAPMASKS - CC_*, LC_*, PC_*, CP_*, TC_*, RC_ + * NOVIRTUALKEYCODES - VK_* + * NOWINMESSAGES - WM_*, EM_*, LB_*, CB_* + * NOWINSTYLES - WS_*, CS_*, ES_*, LBS_*, SBS_*, CBS_* + * NOSYSMETRICS - SM_* + * NOMENUS - MF_* + * NOICONS - IDI_* + * NOKEYSTATES - MK_* + * NOSYSCOMMANDS - SC_* + * NORASTEROPS - Binary and Tertiary raster ops + * NOSHOWWINDOW - SW_* + * OEMRESOURCE - OEM Resource values + * NOATOM - Atom Manager routines + * NOCLIPBOARD - Clipboard routines + * NOCOLOR - Screen colors + * NOCTLMGR - Control and Dialog routines + * NODRAWTEXT - DrawText() and DT_* + * NOGDI - All GDI defines and routines + * NOKERNEL - All KERNEL defines and routines + * NOUSER - All USER defines and routines + * NONLS - All NLS defines and routines + * NOMB - MB_* and MessageBox() + * NOMEMMGR - GMEM_*, LMEM_*, GHND, LHND, associated routines + * NOMETAFILE - typedef METAFILEPICT + * NOMINMAX - Macros min(a,b) and max(a,b) + * NOMSG - typedef MSG and associated routines + * NOOPENFILE - OpenFile(), OemToAnsi, AnsiToOem, and OF_* + * NOSCROLL - SB_* and scrolling routines + * NOSERVICE - All Service Controller routines, SERVICE_ equates, etc. + * NOSOUND - Sound driver routines + * NOTEXTMETRIC - typedef TEXTMETRIC and associated routines + * NOWH - SetWindowsHook and WH_* + * NOWINOFFSETS - GWL_*, GCL_*, associated routines + * NOCOMM - COMM driver routines + * NOKANJI - Kanji support stuff. + * NOHELP - Help engine interface. + * NOPROFILER - Profiler interface. + * NODEFERWINDOWPOS - DeferWindowPos routines + * NOMCX - Modem Configuration Extensions + */ + +#endif /* defined(_WIN64) || defined(_WIN32) */ + +#endif /* _C4_WINDOWS_PUSH_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/windows_push.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/windows.hpp +// https://github.com/biojppm/c4core/src/c4/windows.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_WINDOWS_HPP_ +#define _C4_WINDOWS_HPP_ + +#if defined(_WIN64) || defined(_WIN32) +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/windows_push.hpp +//#include "c4/windows_push.hpp" +#if !defined(C4_WINDOWS_PUSH_HPP_) && !defined(_C4_WINDOWS_PUSH_HPP_) +#error "amalgamate: file c4/windows_push.hpp must have been included at this point" +#endif /* C4_WINDOWS_PUSH_HPP_ */ + +#include +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/windows_pop.hpp +//#include "c4/windows_pop.hpp" +#if !defined(C4_WINDOWS_POP_HPP_) && !defined(_C4_WINDOWS_POP_HPP_) +#error "amalgamate: file c4/windows_pop.hpp must have been included at this point" +#endif /* C4_WINDOWS_POP_HPP_ */ + +#endif + +#endif /* _C4_WINDOWS_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/windows.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/windows_pop.hpp +// https://github.com/biojppm/c4core/src/c4/windows_pop.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_WINDOWS_POP_HPP_ +#define _C4_WINDOWS_POP_HPP_ + +#if defined(_WIN64) || 
defined(_WIN32) + +#ifdef _c4_AMD64_ +# undef _c4_AMD64_ +# undef _AMD64_ +#endif +#ifdef _c4_X86_ +# undef _c4_X86_ +# undef _X86_ +#endif +#ifdef _c4_ARM_ +# undef _c4_ARM_ +# undef _ARM_ +#endif + +#ifdef _c4_NOMINMAX +# undef _c4_NOMINMAX +# undef NOMINMAX +#endif + +#ifdef NOGDI +# undef _c4_NOGDI +# undef NOGDI +#endif + +#ifdef VC_EXTRALEAN +# undef _c4_VC_EXTRALEAN +# undef VC_EXTRALEAN +#endif + +#ifdef WIN32_LEAN_AND_MEAN +# undef _c4_WIN32_LEAN_AND_MEAN +# undef WIN32_LEAN_AND_MEAN +#endif + +#endif /* defined(_WIN64) || defined(_WIN32) */ + +#endif /* _C4_WINDOWS_POP_HPP_ */ + + +// (end https://github.com/biojppm/c4core/src/c4/windows_pop.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/error.cpp +// https://github.com/biojppm/c4core/src/c4/error.cpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifdef C4CORE_SINGLE_HDR_DEFINE_NOW +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/error.hpp +//#include "c4/error.hpp" +#if !defined(C4_ERROR_HPP_) && !defined(_C4_ERROR_HPP_) +#error "amalgamate: file c4/error.hpp must have been included at this point" +#endif /* C4_ERROR_HPP_ */ + + +//included above: +//#include +//included above: +//#include +//included above: +//#include + +#define C4_LOGF_ERR(...) fprintf(stderr, __VA_ARGS__); fflush(stderr) +#define C4_LOGF_WARN(...) fprintf(stderr, __VA_ARGS__); fflush(stderr) +#define C4_LOGP(msg, ...) printf(msg) + +#if defined(C4_XBOX) || (defined(C4_WIN) && defined(C4_MSVC)) +// amalgamate: removed include of +// https://github.com/biojppm/c4core/src/c4/windows.hpp +//# include "c4/windows.hpp" +#if !defined(C4_WINDOWS_HPP_) && !defined(_C4_WINDOWS_HPP_) +#error "amalgamate: file c4/windows.hpp must have been included at this point" +#endif /* C4_WINDOWS_HPP_ */ + +#elif defined(C4_PS4) +# include +#elif defined(C4_UNIX) || defined(C4_LINUX) +# include +//included above: +//# include +# include +#elif defined(C4_MACOS) || defined(C4_IOS) +//included above: +//# include +# include +# include +# include +#endif +// the amalgamation tool is dumb and was omitting this include under MACOS. +// So do it only once: +#if defined(C4_UNIX) || defined(C4_LINUX) || defined(C4_MACOS) || defined(C4_IOS) +# include +#endif + +#if defined(C4_EXCEPTIONS_ENABLED) && defined(C4_ERROR_THROWS_EXCEPTION) +# include +#endif + +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wformat-nonliteral" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wformat-nonliteral" +#endif + + +//----------------------------------------------------------------------------- +namespace c4 { + +static error_flags s_error_flags = ON_ERROR_DEFAULTS; +static error_callback_type s_error_callback = nullptr; + +//----------------------------------------------------------------------------- + +error_flags get_error_flags() +{ + return s_error_flags; +} +void set_error_flags(error_flags flags) +{ + s_error_flags = flags; +} + +error_callback_type get_error_callback() +{ + return s_error_callback; +} +/** Set the function which is called when an error occurs. 
*/ +void set_error_callback(error_callback_type cb) +{ + s_error_callback = cb; +} + +//----------------------------------------------------------------------------- + +void handle_error(srcloc where, const char *fmt, ...) +{ + char buf[1024]; + size_t msglen = 0; + if(s_error_flags & (ON_ERROR_LOG|ON_ERROR_CALLBACK)) + { + va_list args; + va_start(args, fmt); + int ilen = vsnprintf(buf, sizeof(buf), fmt, args); // ss.vprintf(fmt, args); + va_end(args); + msglen = ilen >= 0 && ilen < (int)sizeof(buf) ? static_cast(ilen) : sizeof(buf)-1; + } + + if(s_error_flags & ON_ERROR_LOG) + { + C4_LOGF_ERR("\n"); +#if defined(C4_ERROR_SHOWS_FILELINE) && defined(C4_ERROR_SHOWS_FUNC) + C4_LOGF_ERR("%s:%d: ERROR: %s\n", where.file, where.line, buf); + C4_LOGF_ERR("%s:%d: ERROR here: %s\n", where.file, where.line, where.func); +#elif defined(C4_ERROR_SHOWS_FILELINE) + C4_LOGF_ERR("%s:%d: ERROR: %s\n", where.file, where.line, buf); +#elif ! defined(C4_ERROR_SHOWS_FUNC) + C4_LOGF_ERR("ERROR: %s\n", buf); +#endif + } + + if(s_error_flags & ON_ERROR_CALLBACK) + { + if(s_error_callback) + { + s_error_callback(buf, msglen/*ss.c_strp(), ss.tellp()*/); + } + } + + if(s_error_flags & ON_ERROR_ABORT) + { + abort(); + } + + if(s_error_flags & ON_ERROR_THROW) + { +#if defined(C4_EXCEPTIONS_ENABLED) && defined(C4_ERROR_THROWS_EXCEPTION) + throw Exception(buf); +#else + abort(); +#endif + } +} + +//----------------------------------------------------------------------------- + +void handle_warning(srcloc where, const char *fmt, ...) +{ + va_list args; + char buf[1024]; //sstream ss; + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + C4_LOGF_WARN("\n"); +#if defined(C4_ERROR_SHOWS_FILELINE) && defined(C4_ERROR_SHOWS_FUNC) + C4_LOGF_WARN("%s:%d: WARNING: %s\n", where.file, where.line, buf/*ss.c_strp()*/); + C4_LOGF_WARN("%s:%d: WARNING: here: %s\n", where.file, where.line, where.func); +#elif defined(C4_ERROR_SHOWS_FILELINE) + C4_LOGF_WARN("%s:%d: WARNING: %s\n", where.file, where.line, buf/*ss.c_strp()*/); +#elif ! defined(C4_ERROR_SHOWS_FUNC) + C4_LOGF_WARN("WARNING: %s\n", buf/*ss.c_strp()*/); +#endif + //c4::log.flush(); +} + +//----------------------------------------------------------------------------- +bool is_debugger_attached() +{ +#if defined(C4_UNIX) || defined(C4_LINUX) + static bool first_call = true; + static bool first_call_result = false; + if(first_call) + { + first_call = false; + //! @see http://stackoverflow.com/questions/3596781/how-to-detect-if-the-current-process-is-being-run-by-gdb + //! (this answer: http://stackoverflow.com/a/24969863/3968589 ) + char buf[1024] = ""; + + int status_fd = open("/proc/self/status", O_RDONLY); + if (status_fd == -1) + { + return 0; + } + + ssize_t num_read = ::read(status_fd, buf, sizeof(buf)); + + if (num_read > 0) + { + static const char TracerPid[] = "TracerPid:"; + char *tracer_pid; + + if(num_read < 1024) + { + buf[num_read] = 0; + } + tracer_pid = strstr(buf, TracerPid); + if (tracer_pid) + { + first_call_result = !!::atoi(tracer_pid + sizeof(TracerPid) - 1); + } + } + } + return first_call_result; +#elif defined(C4_PS4) + return (sceDbgIsDebuggerAttached() != 0); +#elif defined(C4_XBOX) || (defined(C4_WIN) && defined(C4_MSVC)) + return IsDebuggerPresent() != 0; +#elif defined(C4_MACOS) || defined(C4_IOS) + // https://stackoverflow.com/questions/2200277/detecting-debugger-on-mac-os-x + // Returns true if the current process is being debugged (either + // running under the debugger or has a debugger attached post facto). 
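+    // (this follows the approach documented by Apple in Technical Q&A QA1361:
+    // query the kernel with sysctl(KERN_PROC, KERN_PROC_PID) and test whether
+    // the P_TRACED flag is set on this process.)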
+ [[maybe_unused]] int junk; + int mib[4]; + struct kinfo_proc info; + size_t size; + + // Initialize the flags so that, if sysctl fails for some bizarre + // reason, we get a predictable result. + + info.kp_proc.p_flag = 0; + + // Initialize mib, which tells sysctl the info we want, in this case + // we're looking for information about a specific process ID. + + mib[0] = CTL_KERN; + mib[1] = KERN_PROC; + mib[2] = KERN_PROC_PID; + mib[3] = getpid(); + + // Call sysctl. + + size = sizeof(info); + junk = sysctl(mib, sizeof(mib) / sizeof(*mib), &info, &size, NULL, 0); + assert(junk == 0); + + // We're being debugged if the P_TRACED flag is set. + return ((info.kp_proc.p_flag & P_TRACED) != 0); +#else + return false; +#endif +} // is_debugger_attached() + +} // namespace c4 + + +#ifdef __clang__ +# pragma clang diagnostic pop +#elif defined(__GNUC__) +# pragma GCC diagnostic pop +#endif + +#endif /* C4CORE_SINGLE_HDR_DEFINE_NOW */ + + +// (end https://github.com/biojppm/c4core/src/c4/error.cpp) + +#endif /* _C4CORE_SINGLE_HEADER_AMALGAMATED_HPP_ */ + + + +// (end https://github.com/biojppm/rapidyaml/src/c4/c4core_all.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/export.hpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/export.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef C4_YML_EXPORT_HPP_ +#define C4_YML_EXPORT_HPP_ + +#ifdef _WIN32 + #ifdef RYML_SHARED + #ifdef RYML_EXPORTS + #define RYML_EXPORT __declspec(dllexport) + #else + #define RYML_EXPORT __declspec(dllimport) + #endif + #else + #define RYML_EXPORT + #endif +#else + #define RYML_EXPORT +#endif + +#endif /* C4_YML_EXPORT_HPP_ */ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/export.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/common.hpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/common.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_YML_COMMON_HPP_ +#define _C4_YML_COMMON_HPP_ + +//included above: +//#include +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/substr.hpp +//#include +#if !defined(C4_SUBSTR_HPP_) && !defined(_C4_SUBSTR_HPP_) +#error "amalgamate: file c4/substr.hpp must have been included at this point" +#endif /* C4_SUBSTR_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/export.hpp +//#include +#if !defined(C4_YML_EXPORT_HPP_) && !defined(_C4_YML_EXPORT_HPP_) +#error "amalgamate: file c4/yml/export.hpp must have been included at this point" +#endif /* C4_YML_EXPORT_HPP_ */ + + + +#ifndef RYML_USE_ASSERT +# define RYML_USE_ASSERT C4_USE_ASSERT +#endif + + +#if RYML_USE_ASSERT +# define RYML_ASSERT(cond) RYML_CHECK(cond) +# define RYML_ASSERT_MSG(cond, msg) RYML_CHECK_MSG(cond, msg) +#else +# define RYML_ASSERT(cond) +# define RYML_ASSERT_MSG(cond, msg) +#endif + + +#if defined(NDEBUG) || defined(C4_NO_DEBUG_BREAK) +# define RYML_DEBUG_BREAK() +#else +# define RYML_DEBUG_BREAK() \ + { \ + if(c4::get_error_flags() & c4::ON_ERROR_DEBUGBREAK) \ + { \ + 
C4_DEBUG_BREAK(); \ + } \ + } +#endif + + +#define RYML_CHECK(cond) \ + do { \ + if(!(cond)) \ + { \ + RYML_DEBUG_BREAK() \ + c4::yml::error("check failed: " #cond, c4::yml::Location(__FILE__, __LINE__, 0)); \ + } \ + } while(0) + +#define RYML_CHECK_MSG(cond, msg) \ + do \ + { \ + if(!(cond)) \ + { \ + RYML_DEBUG_BREAK() \ + c4::yml::error(msg ": check failed: " #cond, c4::yml::Location(__FILE__, __LINE__, 0)); \ + } \ + } while(0) + + +#if C4_CPP >= 14 +# define RYML_DEPRECATED(msg) [[deprecated(msg)]] +#else +# if defined(_MSC_VER) +# define RYML_DEPRECATED(msg) __declspec(deprecated(msg)) +# else // defined(__GNUC__) || defined(__clang__) +# define RYML_DEPRECATED(msg) __attribute__((deprecated(msg))) +# endif +#endif + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +namespace c4 { +namespace yml { + +enum : size_t { + /** a null position */ + npos = size_t(-1), + /** an index to none */ + NONE = size_t(-1) +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +//! holds a position into a source buffer +struct RYML_EXPORT LineCol +{ + //! number of bytes from the beginning of the source buffer + size_t offset; + //! line + size_t line; + //! column + size_t col; + + LineCol() : offset(), line(), col() {} + //! construct from line and column + LineCol(size_t l, size_t c) : offset(0), line(l), col(c) {} + //! construct from offset, line and column + LineCol(size_t o, size_t l, size_t c) : offset(o), line(l), col(c) {} +}; + + +//! a source file position +struct RYML_EXPORT Location : public LineCol +{ + csubstr name; + + operator bool () const { return !name.empty() || line != 0 || offset != 0; } + + Location() : LineCol(), name() {} + Location( size_t l, size_t c) : LineCol{ l, c}, name( ) {} + Location( csubstr n, size_t l, size_t c) : LineCol{ l, c}, name(n) {} + Location( csubstr n, size_t b, size_t l, size_t c) : LineCol{b, l, c}, name(n) {} + Location(const char *n, size_t l, size_t c) : LineCol{ l, c}, name(to_csubstr(n)) {} + Location(const char *n, size_t b, size_t l, size_t c) : LineCol{b, l, c}, name(to_csubstr(n)) {} +}; + + +//----------------------------------------------------------------------------- + +/** the type of the function used to report errors. This function must + * interrupt execution, either by raising an exception or calling + * std::abort(). + * + * @warning the error callback must never return: it must either abort + * or throw an exception. Otherwise, the parser will enter into an + * infinite loop, or the program may crash. */ +using pfn_error = void (*)(const char* msg, size_t msg_len, Location location, void *user_data); +/** the type of the function used to allocate memory */ +using pfn_allocate = void* (*)(size_t len, void* hint, void *user_data); +/** the type of the function used to free memory */ +using pfn_free = void (*)(void* mem, size_t size, void *user_data); + +/** trigger an error: call the current error callback. 
*/ +RYML_EXPORT void error(const char *msg, size_t msg_len, Location loc); +/** @overload error */ +inline void error(const char *msg, size_t msg_len) +{ + error(msg, msg_len, Location{}); +} +/** @overload error */ +template +inline void error(const char (&msg)[N], Location loc) +{ + error(msg, N-1, loc); +} +/** @overload error */ +template +inline void error(const char (&msg)[N]) +{ + error(msg, N-1, Location{}); +} + +//----------------------------------------------------------------------------- + +/** a c-style callbacks class + * + * @warning the error callback must never return: it must either abort + * or throw an exception. Otherwise, the parser will enter into an + * infinite loop, or the program may crash. */ +struct RYML_EXPORT Callbacks +{ + void * m_user_data; + pfn_allocate m_allocate; + pfn_free m_free; + pfn_error m_error; + + Callbacks(); + Callbacks(void *user_data, pfn_allocate alloc, pfn_free free, pfn_error error_); + + bool operator!= (Callbacks const& that) const { return !operator==(that); } + bool operator== (Callbacks const& that) const + { + return (m_user_data == that.m_user_data && + m_allocate == that.m_allocate && + m_free == that.m_free && + m_error == that.m_error); + } +}; + +/** set the global callbacks. + * + * @warning the error callback must never return: it must either abort + * or throw an exception. Otherwise, the parser will enter into an + * infinite loop, or the program may crash. */ +RYML_EXPORT void set_callbacks(Callbacks const& c); +/// get the global callbacks +RYML_EXPORT Callbacks const& get_callbacks(); +/// set the global callbacks back to their defaults +RYML_EXPORT void reset_callbacks(); + +/// @cond dev +#define _RYML_CB_ERR(cb, msg_literal) \ +do \ +{ \ + const char msg[] = msg_literal; \ + RYML_DEBUG_BREAK() \ + (cb).m_error(msg, sizeof(msg), c4::yml::Location(__FILE__, 0, __LINE__, 0), (cb).m_user_data); \ +} while(0) +#define _RYML_CB_CHECK(cb, cond) \ + do \ + { \ + if(!(cond)) \ + { \ + const char msg[] = "check failed: " #cond; \ + RYML_DEBUG_BREAK() \ + (cb).m_error(msg, sizeof(msg), c4::yml::Location(__FILE__, 0, __LINE__, 0), (cb).m_user_data); \ + } \ + } while(0) +#ifdef RYML_USE_ASSERT +#define _RYML_CB_ASSERT(cb, cond) _RYML_CB_CHECK((cb), (cond)) +#else +#define _RYML_CB_ASSERT(cb, cond) do {} while(0) +#endif +#define _RYML_CB_ALLOC_HINT(cb, T, num, hint) (T*) (cb).m_allocate((num) * sizeof(T), (hint), (cb).m_user_data) +#define _RYML_CB_ALLOC(cb, T, num) _RYML_CB_ALLOC_HINT((cb), (T), (num), nullptr) +#define _RYML_CB_FREE(cb, buf, T, num) \ + do { \ + (cb).m_free((buf), (num) * sizeof(T), (cb).m_user_data); \ + (buf) = nullptr; \ + } while(0) + + + +namespace detail { +template +struct _charconstant_t + : public std::conditional::value, + std::integral_constant, + std::integral_constant>::type +{}; +#define _RYML_CHCONST(signedval, unsignedval) ::c4::yml::detail::_charconstant_t::value +} // namespace detail + + +namespace detail { +struct _SubstrWriter +{ + substr buf; + size_t pos; + _SubstrWriter(substr buf_, size_t pos_=0) : buf(buf_), pos(pos_) {} + void append(csubstr s) + { + C4_ASSERT(!s.overlaps(buf)); + if(pos + s.len <= buf.len) + memcpy(buf.str + pos, s.str, s.len); + pos += s.len; + } + void append(char c) + { + if(pos < buf.len) + buf.str[pos] = c; + ++pos; + } + void append_n(char c, size_t numtimes) + { + if(pos + numtimes < buf.len) + memset(buf.str + pos, c, numtimes); + pos += numtimes; + } + size_t slack() const { return pos <= buf.len ? 
buf.len - pos : 0; } + size_t excess() const { return pos > buf.len ? pos - buf.len : 0; } + //! get the part written so far + csubstr curr() const { return pos <= buf.len ? buf.first(pos) : buf; } + //! get the part that is still free to write to (the remainder) + substr rem() { return pos < buf.len ? buf.sub(pos) : buf.last(0); } + + size_t advance(size_t more) { pos += more; return pos; } +}; +} // namespace detail + +/// @endcond + +} // namespace yml +} // namespace c4 + +#endif /* _C4_YML_COMMON_HPP_ */ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/common.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/tree.hpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/tree.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_YML_TREE_HPP_ +#define _C4_YML_TREE_HPP_ + + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/error.hpp +//#include "c4/error.hpp" +#if !defined(C4_ERROR_HPP_) && !defined(_C4_ERROR_HPP_) +#error "amalgamate: file c4/error.hpp must have been included at this point" +#endif /* C4_ERROR_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/types.hpp +//#include "c4/types.hpp" +#if !defined(C4_TYPES_HPP_) && !defined(_C4_TYPES_HPP_) +#error "amalgamate: file c4/types.hpp must have been included at this point" +#endif /* C4_TYPES_HPP_ */ + +#ifndef _C4_YML_COMMON_HPP_ +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/common.hpp +//#include "c4/yml/common.hpp" +#if !defined(C4_YML_COMMON_HPP_) && !defined(_C4_YML_COMMON_HPP_) +#error "amalgamate: file c4/yml/common.hpp must have been included at this point" +#endif /* C4_YML_COMMON_HPP_ */ + +#endif + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/charconv.hpp +//#include +#if !defined(C4_CHARCONV_HPP_) && !defined(_C4_CHARCONV_HPP_) +#error "amalgamate: file c4/charconv.hpp must have been included at this point" +#endif /* C4_CHARCONV_HPP_ */ + +//included above: +//#include +//included above: +//#include + + +C4_SUPPRESS_WARNING_MSVC_PUSH +C4_SUPPRESS_WARNING_MSVC(4251) // needs to have dll-interface to be used by clients of struct +C4_SUPPRESS_WARNING_MSVC(4296) // expression is always 'boolean_value' +C4_SUPPRESS_WARNING_GCC_CLANG_PUSH +C4_SUPPRESS_WARNING_GCC("-Wtype-limits") + + +namespace c4 { +namespace yml { + +struct NodeScalar; +struct NodeInit; +struct NodeData; +class NodeRef; +class ConstNodeRef; +class Tree; + + +/** encode a floating point value to a string. */ +template +size_t to_chars_float(substr buf, T val) +{ + C4_SUPPRESS_WARNING_GCC_CLANG_WITH_PUSH("-Wfloat-equal"); + static_assert(std::is_floating_point::value, "must be floating point"); + if(C4_UNLIKELY(std::isnan(val))) + return to_chars(buf, csubstr(".nan")); + else if(C4_UNLIKELY(val == std::numeric_limits::infinity())) + return to_chars(buf, csubstr(".inf")); + else if(C4_UNLIKELY(val == -std::numeric_limits::infinity())) + return to_chars(buf, csubstr("-.inf")); + return to_chars(buf, val); + C4_SUPPRESS_WARNING_GCC_CLANG_POP +} + + +/** decode a floating point from string. 
Accepts special values: .nan, + * .inf, -.inf */ +template +bool from_chars_float(csubstr buf, T *C4_RESTRICT val) +{ + static_assert(std::is_floating_point::value, "must be floating point"); + if(C4_LIKELY(from_chars(buf, val))) + { + return true; + } + else if(C4_UNLIKELY(buf == ".nan" || buf == ".NaN" || buf == ".NAN")) + { + *val = std::numeric_limits::quiet_NaN(); + return true; + } + else if(C4_UNLIKELY(buf == ".inf" || buf == ".Inf" || buf == ".INF")) + { + *val = std::numeric_limits::infinity(); + return true; + } + else if(C4_UNLIKELY(buf == "-.inf" || buf == "-.Inf" || buf == "-.INF")) + { + *val = -std::numeric_limits::infinity(); + return true; + } + else + { + return false; + } +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** the integral type necessary to cover all the bits marking node tags */ +using tag_bits = uint16_t; + +/** a bit mask for marking tags for types */ +typedef enum : tag_bits { + // container types + TAG_NONE = 0, + TAG_MAP = 1, /**< !!map Unordered set of key: value pairs without duplicates. @see https://yaml.org/type/map.html */ + TAG_OMAP = 2, /**< !!omap Ordered sequence of key: value pairs without duplicates. @see https://yaml.org/type/omap.html */ + TAG_PAIRS = 3, /**< !!pairs Ordered sequence of key: value pairs allowing duplicates. @see https://yaml.org/type/pairs.html */ + TAG_SET = 4, /**< !!set Unordered set of non-equal values. @see https://yaml.org/type/set.html */ + TAG_SEQ = 5, /**< !!seq Sequence of arbitrary values. @see https://yaml.org/type/seq.html */ + // scalar types + TAG_BINARY = 6, /**< !!binary A sequence of zero or more octets (8 bit values). @see https://yaml.org/type/binary.html */ + TAG_BOOL = 7, /**< !!bool Mathematical Booleans. @see https://yaml.org/type/bool.html */ + TAG_FLOAT = 8, /**< !!float Floating-point approximation to real numbers. https://yaml.org/type/float.html */ + TAG_INT = 9, /**< !!float Mathematical integers. https://yaml.org/type/int.html */ + TAG_MERGE = 10, /**< !!merge Specify one or more mapping to be merged with the current one. https://yaml.org/type/merge.html */ + TAG_NULL = 11, /**< !!null Devoid of value. https://yaml.org/type/null.html */ + TAG_STR = 12, /**< !!str A sequence of zero or more Unicode characters. https://yaml.org/type/str.html */ + TAG_TIMESTAMP = 13, /**< !!timestamp A point in time https://yaml.org/type/timestamp.html */ + TAG_VALUE = 14, /**< !!value Specify the default value of a mapping https://yaml.org/type/value.html */ + TAG_YAML = 15, /**< !!yaml Specify the default value of a mapping https://yaml.org/type/yaml.html */ +} YamlTag_e; + +YamlTag_e to_tag(csubstr tag); +csubstr from_tag(YamlTag_e tag); +csubstr from_tag_long(YamlTag_e tag); +csubstr normalize_tag(csubstr tag); +csubstr normalize_tag_long(csubstr tag); + +struct TagDirective +{ + /** Eg `!e!` in `%TAG !e! tag:example.com,2000:app/` */ + csubstr handle; + /** Eg `tag:example.com,2000:app/` in `%TAG !e! 
tag:example.com,2000:app/` */ + csubstr prefix; + /** The next node to which this tag directive applies */ + size_t next_node_id; +}; + +#ifndef RYML_MAX_TAG_DIRECTIVES +/** the maximum number of tag directives in a Tree */ +#define RYML_MAX_TAG_DIRECTIVES 4 +#endif + + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + + +/** the integral type necessary to cover all the bits marking node types */ +using type_bits = uint64_t; + + +/** a bit mask for marking node types */ +typedef enum : type_bits { + // a convenience define, undefined below + #define c4bit(v) (type_bits(1) << v) + NOTYPE = 0, ///< no node type is set + VAL = c4bit(0), ///< a leaf node, has a (possibly empty) value + KEY = c4bit(1), ///< is member of a map, must have non-empty key + MAP = c4bit(2), ///< a map: a parent of keyvals + SEQ = c4bit(3), ///< a seq: a parent of vals + DOC = c4bit(4), ///< a document + STREAM = c4bit(5)|SEQ, ///< a stream: a seq of docs + KEYREF = c4bit(6), ///< a *reference: the key references an &anchor + VALREF = c4bit(7), ///< a *reference: the val references an &anchor + KEYANCH = c4bit(8), ///< the key has an &anchor + VALANCH = c4bit(9), ///< the val has an &anchor + KEYTAG = c4bit(10), ///< the key has an explicit tag/type + VALTAG = c4bit(11), ///< the val has an explicit tag/type + _TYMASK = c4bit(12)-1, // all the bits up to here + VALQUO = c4bit(12), ///< the val is quoted by '', "", > or | + KEYQUO = c4bit(13), ///< the key is quoted by '', "", > or | + KEYVAL = KEY|VAL, + KEYSEQ = KEY|SEQ, + KEYMAP = KEY|MAP, + DOCMAP = DOC|MAP, + DOCSEQ = DOC|SEQ, + DOCVAL = DOC|VAL, + _KEYMASK = KEY | KEYQUO | KEYANCH | KEYREF | KEYTAG, + _VALMASK = VAL | VALQUO | VALANCH | VALREF | VALTAG, + // these flags are from a work in progress and should not be used yet + _WIP_STYLE_FLOW_SL = c4bit(14), ///< mark container with single-line flow format (seqs as '[val1,val2], maps as '{key: val, key2: val2}') + _WIP_STYLE_FLOW_ML = c4bit(15), ///< mark container with multi-line flow format (seqs as '[val1,\nval2], maps as '{key: val,\nkey2: val2}') + _WIP_STYLE_BLOCK = c4bit(16), ///< mark container with block format (seqs as '- val\n', maps as 'key: val') + _WIP_KEY_LITERAL = c4bit(17), ///< mark key scalar as multiline, block literal | + _WIP_VAL_LITERAL = c4bit(18), ///< mark val scalar as multiline, block literal | + _WIP_KEY_FOLDED = c4bit(19), ///< mark key scalar as multiline, block folded > + _WIP_VAL_FOLDED = c4bit(20), ///< mark val scalar as multiline, block folded > + _WIP_KEY_SQUO = c4bit(21), ///< mark key scalar as single quoted + _WIP_VAL_SQUO = c4bit(22), ///< mark val scalar as single quoted + _WIP_KEY_DQUO = c4bit(23), ///< mark key scalar as double quoted + _WIP_VAL_DQUO = c4bit(24), ///< mark val scalar as double quoted + _WIP_KEY_PLAIN = c4bit(25), ///< mark key scalar as plain scalar (unquoted, even when multiline) + _WIP_VAL_PLAIN = c4bit(26), ///< mark val scalar as plain scalar (unquoted, even when multiline) + _WIP_KEY_STYLE = _WIP_KEY_LITERAL|_WIP_KEY_FOLDED|_WIP_KEY_SQUO|_WIP_KEY_DQUO|_WIP_KEY_PLAIN, + _WIP_VAL_STYLE = _WIP_VAL_LITERAL|_WIP_VAL_FOLDED|_WIP_VAL_SQUO|_WIP_VAL_DQUO|_WIP_VAL_PLAIN, + _WIP_KEY_FT_NL = c4bit(27), ///< features: mark key scalar as having \n in its contents + _WIP_VAL_FT_NL = c4bit(28), ///< features: mark val scalar as having \n in its contents + _WIP_KEY_FT_SQ = 
c4bit(29), ///< features: mark key scalar as having single quotes in its contents + _WIP_VAL_FT_SQ = c4bit(30), ///< features: mark val scalar as having single quotes in its contents + _WIP_KEY_FT_DQ = c4bit(31), ///< features: mark key scalar as having double quotes in its contents + _WIP_VAL_FT_DQ = c4bit(32), ///< features: mark val scalar as having double quotes in its contents + #undef c4bit +} NodeType_e; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** wraps a NodeType_e element with some syntactic sugar and predicates */ +struct NodeType +{ +public: + + NodeType_e type; + +public: + + C4_ALWAYS_INLINE NodeType() : type(NOTYPE) {} + C4_ALWAYS_INLINE NodeType(NodeType_e t) : type(t) {} + C4_ALWAYS_INLINE NodeType(type_bits t) : type((NodeType_e)t) {} + + C4_ALWAYS_INLINE const char *type_str() const { return type_str(type); } + static const char* type_str(NodeType_e t); + + C4_ALWAYS_INLINE void set(NodeType_e t) { type = t; } + C4_ALWAYS_INLINE void set(type_bits t) { type = (NodeType_e)t; } + + C4_ALWAYS_INLINE void add(NodeType_e t) { type = (NodeType_e)(type|t); } + C4_ALWAYS_INLINE void add(type_bits t) { type = (NodeType_e)(type|t); } + + C4_ALWAYS_INLINE void rem(NodeType_e t) { type = (NodeType_e)(type & ~t); } + C4_ALWAYS_INLINE void rem(type_bits t) { type = (NodeType_e)(type & ~t); } + + C4_ALWAYS_INLINE void clear() { type = NOTYPE; } + +public: + + C4_ALWAYS_INLINE operator NodeType_e & C4_RESTRICT () { return type; } + C4_ALWAYS_INLINE operator NodeType_e const& C4_RESTRICT () const { return type; } + + C4_ALWAYS_INLINE bool operator== (NodeType_e t) const { return type == t; } + C4_ALWAYS_INLINE bool operator!= (NodeType_e t) const { return type != t; } + +public: + + #if defined(__clang__) + # pragma clang diagnostic push + # pragma clang diagnostic ignored "-Wnull-dereference" + #elif defined(__GNUC__) + # pragma GCC diagnostic push + # if __GNUC__ >= 6 + # pragma GCC diagnostic ignored "-Wnull-dereference" + # endif + #endif + + C4_ALWAYS_INLINE bool is_notype() const { return type == NOTYPE; } + C4_ALWAYS_INLINE bool is_stream() const { return ((type & STREAM) == STREAM) != 0; } + C4_ALWAYS_INLINE bool is_doc() const { return (type & DOC) != 0; } + C4_ALWAYS_INLINE bool is_container() const { return (type & (MAP|SEQ|STREAM)) != 0; } + C4_ALWAYS_INLINE bool is_map() const { return (type & MAP) != 0; } + C4_ALWAYS_INLINE bool is_seq() const { return (type & SEQ) != 0; } + C4_ALWAYS_INLINE bool has_key() const { return (type & KEY) != 0; } + C4_ALWAYS_INLINE bool has_val() const { return (type & VAL) != 0; } + C4_ALWAYS_INLINE bool is_val() const { return (type & KEYVAL) == VAL; } + C4_ALWAYS_INLINE bool is_keyval() const { return (type & KEYVAL) == KEYVAL; } + C4_ALWAYS_INLINE bool has_key_tag() const { return (type & (KEY|KEYTAG)) == (KEY|KEYTAG); } + C4_ALWAYS_INLINE bool has_val_tag() const { return ((type & VALTAG) && (type & (VAL|MAP|SEQ))); } + C4_ALWAYS_INLINE bool has_key_anchor() const { return (type & (KEY|KEYANCH)) == (KEY|KEYANCH); } + C4_ALWAYS_INLINE bool is_key_anchor() const { return (type & (KEY|KEYANCH)) == (KEY|KEYANCH); } + C4_ALWAYS_INLINE bool has_val_anchor() const { return (type & VALANCH) != 0 && (type & (VAL|SEQ|MAP)) != 0; } + C4_ALWAYS_INLINE bool is_val_anchor() const { return (type & VALANCH) != 0 && (type & (VAL|SEQ|MAP)) != 0; } + 
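+    // note: is_key_anchor()/is_val_anchor() above and is_anchor() below are
+    // exact synonyms of the corresponding has_*_anchor()/has_anchor() forms.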
C4_ALWAYS_INLINE bool has_anchor() const { return (type & (KEYANCH|VALANCH)) != 0; } + C4_ALWAYS_INLINE bool is_anchor() const { return (type & (KEYANCH|VALANCH)) != 0; } + C4_ALWAYS_INLINE bool is_key_ref() const { return (type & KEYREF) != 0; } + C4_ALWAYS_INLINE bool is_val_ref() const { return (type & VALREF) != 0; } + C4_ALWAYS_INLINE bool is_ref() const { return (type & (KEYREF|VALREF)) != 0; } + C4_ALWAYS_INLINE bool is_anchor_or_ref() const { return (type & (KEYANCH|VALANCH|KEYREF|VALREF)) != 0; } + C4_ALWAYS_INLINE bool is_key_quoted() const { return (type & (KEY|KEYQUO)) == (KEY|KEYQUO); } + C4_ALWAYS_INLINE bool is_val_quoted() const { return (type & (VAL|VALQUO)) == (VAL|VALQUO); } + C4_ALWAYS_INLINE bool is_quoted() const { return (type & (KEY|KEYQUO)) == (KEY|KEYQUO) || (type & (VAL|VALQUO)) == (VAL|VALQUO); } + + // these predicates are a work in progress and subject to change. Don't use yet. + C4_ALWAYS_INLINE bool default_block() const { return (type & (_WIP_STYLE_BLOCK|_WIP_STYLE_FLOW_ML|_WIP_STYLE_FLOW_SL)) == 0; } + C4_ALWAYS_INLINE bool marked_block() const { return (type & (_WIP_STYLE_BLOCK)) != 0; } + C4_ALWAYS_INLINE bool marked_flow_sl() const { return (type & (_WIP_STYLE_FLOW_SL)) != 0; } + C4_ALWAYS_INLINE bool marked_flow_ml() const { return (type & (_WIP_STYLE_FLOW_ML)) != 0; } + C4_ALWAYS_INLINE bool marked_flow() const { return (type & (_WIP_STYLE_FLOW_ML|_WIP_STYLE_FLOW_SL)) != 0; } + C4_ALWAYS_INLINE bool key_marked_literal() const { return (type & (_WIP_KEY_LITERAL)) != 0; } + C4_ALWAYS_INLINE bool val_marked_literal() const { return (type & (_WIP_VAL_LITERAL)) != 0; } + C4_ALWAYS_INLINE bool key_marked_folded() const { return (type & (_WIP_KEY_FOLDED)) != 0; } + C4_ALWAYS_INLINE bool val_marked_folded() const { return (type & (_WIP_VAL_FOLDED)) != 0; } + C4_ALWAYS_INLINE bool key_marked_squo() const { return (type & (_WIP_KEY_SQUO)) != 0; } + C4_ALWAYS_INLINE bool val_marked_squo() const { return (type & (_WIP_VAL_SQUO)) != 0; } + C4_ALWAYS_INLINE bool key_marked_dquo() const { return (type & (_WIP_KEY_DQUO)) != 0; } + C4_ALWAYS_INLINE bool val_marked_dquo() const { return (type & (_WIP_VAL_DQUO)) != 0; } + C4_ALWAYS_INLINE bool key_marked_plain() const { return (type & (_WIP_KEY_PLAIN)) != 0; } + C4_ALWAYS_INLINE bool val_marked_plain() const { return (type & (_WIP_VAL_PLAIN)) != 0; } + + #if defined(__clang__) + # pragma clang diagnostic pop + #elif defined(__GNUC__) + # pragma GCC diagnostic pop + #endif + +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** a node scalar is a csubstr, which may be tagged and anchored. 
*/ +struct NodeScalar +{ + csubstr tag; + csubstr scalar; + csubstr anchor; + +public: + + /// initialize as an empty scalar + inline NodeScalar() noexcept : tag(), scalar(), anchor() {} + + /// initialize as an untagged scalar + template + inline NodeScalar(const char (&s)[N]) noexcept : tag(), scalar(s), anchor() {} + inline NodeScalar(csubstr s ) noexcept : tag(), scalar(s), anchor() {} + + /// initialize as a tagged scalar + template + inline NodeScalar(const char (&t)[N], const char (&s)[N]) noexcept : tag(t), scalar(s), anchor() {} + inline NodeScalar(csubstr t , csubstr s ) noexcept : tag(t), scalar(s), anchor() {} + +public: + + ~NodeScalar() noexcept = default; + NodeScalar(NodeScalar &&) noexcept = default; + NodeScalar(NodeScalar const&) noexcept = default; + NodeScalar& operator= (NodeScalar &&) noexcept = default; + NodeScalar& operator= (NodeScalar const&) noexcept = default; + +public: + + bool empty() const noexcept { return tag.empty() && scalar.empty() && anchor.empty(); } + + void clear() noexcept { tag.clear(); scalar.clear(); anchor.clear(); } + + void set_ref_maybe_replacing_scalar(csubstr ref, bool has_scalar) noexcept + { + csubstr trimmed = ref.begins_with('*') ? ref.sub(1) : ref; + anchor = trimmed; + if((!has_scalar) || !scalar.ends_with(trimmed)) + scalar = ref; + } +}; +C4_MUST_BE_TRIVIAL_COPY(NodeScalar); + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** convenience class to initialize nodes */ +struct NodeInit +{ + + NodeType type; + NodeScalar key; + NodeScalar val; + +public: + + /// initialize as an empty node + NodeInit() : type(NOTYPE), key(), val() {} + /// initialize as a typed node + NodeInit(NodeType_e t) : type(t), key(), val() {} + /// initialize as a sequence member + NodeInit(NodeScalar const& v) : type(VAL), key(), val(v) { _add_flags(); } + /// initialize as a mapping member + NodeInit( NodeScalar const& k, NodeScalar const& v) : type(KEYVAL), key(k.tag, k.scalar), val(v.tag, v.scalar) { _add_flags(); } + /// initialize as a mapping member with explicit type + NodeInit(NodeType_e t, NodeScalar const& k, NodeScalar const& v) : type(t ), key(k.tag, k.scalar), val(v.tag, v.scalar) { _add_flags(); } + /// initialize as a mapping member with explicit type (eg SEQ or MAP) + NodeInit(NodeType_e t, NodeScalar const& k ) : type(t ), key(k.tag, k.scalar), val( ) { _add_flags(KEY); } + +public: + + void clear() + { + type.clear(); + key.clear(); + val.clear(); + } + + void _add_flags(type_bits more_flags=0) + { + type = (type|more_flags); + if( ! key.tag.empty()) + type = (type|KEYTAG); + if( ! val.tag.empty()) + type = (type|VALTAG); + if( ! key.anchor.empty()) + type = (type|KEYANCH); + if( ! val.anchor.empty()) + type = (type|VALANCH); + } + + bool _check() const + { + // key cannot be empty + RYML_ASSERT(key.scalar.empty() == ((type & KEY) == 0)); + // key tag cannot be empty + RYML_ASSERT(key.tag.empty() == ((type & KEYTAG) == 0)); + // val may be empty even though VAL is set. 
But when VAL is not set, val must be empty + RYML_ASSERT(((type & VAL) != 0) || val.scalar.empty()); + // val tag cannot be empty + RYML_ASSERT(val.tag.empty() == ((type & VALTAG) == 0)); + return true; + } +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** contains the data for each YAML node. */ +struct NodeData +{ + NodeType m_type; + + NodeScalar m_key; + NodeScalar m_val; + + size_t m_parent; + size_t m_first_child; + size_t m_last_child; + size_t m_next_sibling; + size_t m_prev_sibling; +}; +C4_MUST_BE_TRIVIAL_COPY(NodeData); + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +class RYML_EXPORT Tree +{ +public: + + /** @name construction and assignment */ + /** @{ */ + + Tree() : Tree(get_callbacks()) {} + Tree(Callbacks const& cb); + Tree(size_t node_capacity, size_t arena_capacity=0) : Tree(node_capacity, arena_capacity, get_callbacks()) {} + Tree(size_t node_capacity, size_t arena_capacity, Callbacks const& cb); + + ~Tree(); + + Tree(Tree const& that) noexcept; + Tree(Tree && that) noexcept; + + Tree& operator= (Tree const& that) noexcept; + Tree& operator= (Tree && that) noexcept; + + /** @} */ + +public: + + /** @name memory and sizing */ + /** @{ */ + + void reserve(size_t node_capacity); + + /** clear the tree and zero every node + * @note does NOT clear the arena + * @see clear_arena() */ + void clear(); + inline void clear_arena() { m_arena_pos = 0; } + + inline bool empty() const { return m_size == 0; } + + inline size_t size() const { return m_size; } + inline size_t capacity() const { return m_cap; } + inline size_t slack() const { RYML_ASSERT(m_cap >= m_size); return m_cap - m_size; } + + Callbacks const& callbacks() const { return m_callbacks; } + void callbacks(Callbacks const& cb) { m_callbacks = cb; } + + /** @} */ + +public: + + /** @name node getters */ + /** @{ */ + + //! get the index of a node belonging to this tree. + //! @p n can be nullptr, in which case a + size_t id(NodeData const* n) const + { + if( ! n) + { + return NONE; + } + RYML_ASSERT(n >= m_buf && n < m_buf + m_cap); + return static_cast(n - m_buf); + } + + //! get a pointer to a node's NodeData. + //! i can be NONE, in which case a nullptr is returned + inline NodeData *get(size_t i) + { + if(i == NONE) + return nullptr; + RYML_ASSERT(i >= 0 && i < m_cap); + return m_buf + i; + } + //! get a pointer to a node's NodeData. + //! i can be NONE, in which case a nullptr is returned. + inline NodeData const *get(size_t i) const + { + if(i == NONE) + return nullptr; + RYML_ASSERT(i >= 0 && i < m_cap); + return m_buf + i; + } + + //! An if-less form of get() that demands a valid node index. + //! This function is implementation only; use at your own risk. + inline NodeData * _p(size_t i) { RYML_ASSERT(i != NONE && i >= 0 && i < m_cap); return m_buf + i; } + //! An if-less form of get() that demands a valid node index. + //! This function is implementation only; use at your own risk. + inline NodeData const * _p(size_t i) const { RYML_ASSERT(i != NONE && i >= 0 && i < m_cap); return m_buf + i; } + + //! 
Get the id of the root node + size_t root_id() { if(m_cap == 0) { reserve(16); } RYML_ASSERT(m_cap > 0 && m_size > 0); return 0; } + //! Get the id of the root node + size_t root_id() const { RYML_ASSERT(m_cap > 0 && m_size > 0); return 0; } + + //! Get a NodeRef of a node by id + NodeRef ref(size_t id); + //! Get a NodeRef of a node by id + ConstNodeRef ref(size_t id) const; + //! Get a NodeRef of a node by id + ConstNodeRef cref(size_t id); + //! Get a NodeRef of a node by id + ConstNodeRef cref(size_t id) const; + + //! Get the root as a NodeRef + NodeRef rootref(); + //! Get the root as a NodeRef + ConstNodeRef rootref() const; + //! Get the root as a NodeRef + ConstNodeRef crootref(); + //! Get the root as a NodeRef + ConstNodeRef crootref() const; + + //! find a root child by name, return it as a NodeRef + //! @note requires the root to be a map. + NodeRef operator[] (csubstr key); + //! find a root child by name, return it as a NodeRef + //! @note requires the root to be a map. + ConstNodeRef operator[] (csubstr key) const; + + //! find a root child by index: return the root node's @p i-th child as a NodeRef + //! @note @i is NOT the node id, but the child's position + NodeRef operator[] (size_t i); + //! find a root child by index: return the root node's @p i-th child as a NodeRef + //! @note @i is NOT the node id, but the child's position + ConstNodeRef operator[] (size_t i) const; + + //! get the i-th document of the stream + //! @note @i is NOT the node id, but the doc position within the stream + NodeRef docref(size_t i); + //! get the i-th document of the stream + //! @note @i is NOT the node id, but the doc position within the stream + ConstNodeRef docref(size_t i) const; + + /** @} */ + +public: + + /** @name node property getters */ + /** @{ */ + + NodeType type(size_t node) const { return _p(node)->m_type; } + const char* type_str(size_t node) const { return NodeType::type_str(_p(node)->m_type); } + + csubstr const& key (size_t node) const { RYML_ASSERT(has_key(node)); return _p(node)->m_key.scalar; } + csubstr const& key_tag (size_t node) const { RYML_ASSERT(has_key_tag(node)); return _p(node)->m_key.tag; } + csubstr const& key_ref (size_t node) const { RYML_ASSERT(is_key_ref(node) && ! has_key_anchor(node)); return _p(node)->m_key.anchor; } + csubstr const& key_anchor(size_t node) const { RYML_ASSERT( ! is_key_ref(node) && has_key_anchor(node)); return _p(node)->m_key.anchor; } + NodeScalar const& keysc (size_t node) const { RYML_ASSERT(has_key(node)); return _p(node)->m_key; } + + csubstr const& val (size_t node) const { RYML_ASSERT(has_val(node)); return _p(node)->m_val.scalar; } + csubstr const& val_tag (size_t node) const { RYML_ASSERT(has_val_tag(node)); return _p(node)->m_val.tag; } + csubstr const& val_ref (size_t node) const { RYML_ASSERT(is_val_ref(node) && ! has_val_anchor(node)); return _p(node)->m_val.anchor; } + csubstr const& val_anchor(size_t node) const { RYML_ASSERT( ! 
is_val_ref(node) && has_val_anchor(node)); return _p(node)->m_val.anchor; } + NodeScalar const& valsc (size_t node) const { RYML_ASSERT(has_val(node)); return _p(node)->m_val; } + + /** @} */ + +public: + + /** @name node predicates */ + /** @{ */ + + C4_ALWAYS_INLINE bool is_stream(size_t node) const { return _p(node)->m_type.is_stream(); } + C4_ALWAYS_INLINE bool is_doc(size_t node) const { return _p(node)->m_type.is_doc(); } + C4_ALWAYS_INLINE bool is_container(size_t node) const { return _p(node)->m_type.is_container(); } + C4_ALWAYS_INLINE bool is_map(size_t node) const { return _p(node)->m_type.is_map(); } + C4_ALWAYS_INLINE bool is_seq(size_t node) const { return _p(node)->m_type.is_seq(); } + C4_ALWAYS_INLINE bool has_key(size_t node) const { return _p(node)->m_type.has_key(); } + C4_ALWAYS_INLINE bool has_val(size_t node) const { return _p(node)->m_type.has_val(); } + C4_ALWAYS_INLINE bool is_val(size_t node) const { return _p(node)->m_type.is_val(); } + C4_ALWAYS_INLINE bool is_keyval(size_t node) const { return _p(node)->m_type.is_keyval(); } + C4_ALWAYS_INLINE bool has_key_tag(size_t node) const { return _p(node)->m_type.has_key_tag(); } + C4_ALWAYS_INLINE bool has_val_tag(size_t node) const { return _p(node)->m_type.has_val_tag(); } + C4_ALWAYS_INLINE bool has_key_anchor(size_t node) const { return _p(node)->m_type.has_key_anchor(); } + C4_ALWAYS_INLINE bool is_key_anchor(size_t node) const { return _p(node)->m_type.is_key_anchor(); } + C4_ALWAYS_INLINE bool has_val_anchor(size_t node) const { return _p(node)->m_type.has_val_anchor(); } + C4_ALWAYS_INLINE bool is_val_anchor(size_t node) const { return _p(node)->m_type.is_val_anchor(); } + C4_ALWAYS_INLINE bool has_anchor(size_t node) const { return _p(node)->m_type.has_anchor(); } + C4_ALWAYS_INLINE bool is_anchor(size_t node) const { return _p(node)->m_type.is_anchor(); } + C4_ALWAYS_INLINE bool is_key_ref(size_t node) const { return _p(node)->m_type.is_key_ref(); } + C4_ALWAYS_INLINE bool is_val_ref(size_t node) const { return _p(node)->m_type.is_val_ref(); } + C4_ALWAYS_INLINE bool is_ref(size_t node) const { return _p(node)->m_type.is_ref(); } + C4_ALWAYS_INLINE bool is_anchor_or_ref(size_t node) const { return _p(node)->m_type.is_anchor_or_ref(); } + C4_ALWAYS_INLINE bool is_key_quoted(size_t node) const { return _p(node)->m_type.is_key_quoted(); } + C4_ALWAYS_INLINE bool is_val_quoted(size_t node) const { return _p(node)->m_type.is_val_quoted(); } + C4_ALWAYS_INLINE bool is_quoted(size_t node) const { return _p(node)->m_type.is_quoted(); } + + C4_ALWAYS_INLINE bool parent_is_seq(size_t node) const { RYML_ASSERT(has_parent(node)); return is_seq(_p(node)->m_parent); } + C4_ALWAYS_INLINE bool parent_is_map(size_t node) const { RYML_ASSERT(has_parent(node)); return is_map(_p(node)->m_parent); } + + /** true when key and val are empty, and has no children */ + C4_ALWAYS_INLINE bool empty(size_t node) const { return ! has_children(node) && _p(node)->m_key.empty() && (( ! 
(_p(node)->m_type & VAL)) || _p(node)->m_val.empty()); } + /** true when the node has an anchor named a */ + C4_ALWAYS_INLINE bool has_anchor(size_t node, csubstr a) const { return _p(node)->m_key.anchor == a || _p(node)->m_val.anchor == a; } + + C4_ALWAYS_INLINE bool key_is_null(size_t node) const { RYML_ASSERT(has_key(node)); NodeData const* C4_RESTRICT n = _p(node); return !n->m_type.is_key_quoted() && _is_null(n->m_key.scalar); } + C4_ALWAYS_INLINE bool val_is_null(size_t node) const { RYML_ASSERT(has_val(node)); NodeData const* C4_RESTRICT n = _p(node); return !n->m_type.is_val_quoted() && _is_null(n->m_val.scalar); } + static bool _is_null(csubstr s) noexcept + { + return s.str == nullptr || + s == "~" || + s == "null" || + s == "Null" || + s == "NULL"; + } + + /** @} */ + +public: + + /** @name hierarchy predicates */ + /** @{ */ + + bool is_root(size_t node) const { RYML_ASSERT(_p(node)->m_parent != NONE || node == 0); return _p(node)->m_parent == NONE; } + + bool has_parent(size_t node) const { return _p(node)->m_parent != NONE; } + + /** true if @p node has a child with id @p ch */ + bool has_child(size_t node, size_t ch) const { return _p(ch)->m_parent == node; } + /** true if @p node has a child with key @p key */ + bool has_child(size_t node, csubstr key) const { return find_child(node, key) != npos; } + /** true if @p node has any children key */ + bool has_children(size_t node) const { return _p(node)->m_first_child != NONE; } + + /** true if @p node has a sibling with id @p sib */ + bool has_sibling(size_t node, size_t sib) const { return _p(node)->m_parent == _p(sib)->m_parent; } + /** true if one of the node's siblings has the given key */ + bool has_sibling(size_t node, csubstr key) const { return find_sibling(node, key) != npos; } + /** true if node is not a single child */ + bool has_other_siblings(size_t node) const + { + NodeData const *n = _p(node); + if(C4_LIKELY(n->m_parent != NONE)) + { + n = _p(n->m_parent); + return n->m_first_child != n->m_last_child; + } + return false; + } + + RYML_DEPRECATED("use has_other_siblings()") bool has_siblings(size_t /*node*/) const { return true; } + + /** @} */ + +public: + + /** @name hierarchy getters */ + /** @{ */ + + size_t parent(size_t node) const { return _p(node)->m_parent; } + + size_t prev_sibling(size_t node) const { return _p(node)->m_prev_sibling; } + size_t next_sibling(size_t node) const { return _p(node)->m_next_sibling; } + + /** O(#num_children) */ + size_t num_children(size_t node) const; + size_t child_pos(size_t node, size_t ch) const; + size_t first_child(size_t node) const { return _p(node)->m_first_child; } + size_t last_child(size_t node) const { return _p(node)->m_last_child; } + size_t child(size_t node, size_t pos) const; + size_t find_child(size_t node, csubstr const& key) const; + + /** O(#num_siblings) */ + /** counts with this */ + size_t num_siblings(size_t node) const { return is_root(node) ? 1 : num_children(_p(node)->m_parent); } + /** does not count with this */ + size_t num_other_siblings(size_t node) const { size_t ns = num_siblings(node); RYML_ASSERT(ns > 0); return ns-1; } + size_t sibling_pos(size_t node, size_t sib) const { RYML_ASSERT( ! is_root(node) || node == root_id()); return child_pos(_p(node)->m_parent, sib); } + size_t first_sibling(size_t node) const { return is_root(node) ? node : _p(_p(node)->m_parent)->m_first_child; } + size_t last_sibling(size_t node) const { return is_root(node) ? 
node : _p(_p(node)->m_parent)->m_last_child; } + size_t sibling(size_t node, size_t pos) const { return child(_p(node)->m_parent, pos); } + size_t find_sibling(size_t node, csubstr const& key) const { return find_child(_p(node)->m_parent, key); } + + size_t doc(size_t i) const { size_t rid = root_id(); RYML_ASSERT(is_stream(rid)); return child(rid, i); } //!< gets the @p i document node index. requires that the root node is a stream. + + /** @} */ + +public: + + /** @name node modifiers */ + /** @{ */ + + void to_keyval(size_t node, csubstr key, csubstr val, type_bits more_flags=0); + void to_map(size_t node, csubstr key, type_bits more_flags=0); + void to_seq(size_t node, csubstr key, type_bits more_flags=0); + void to_val(size_t node, csubstr val, type_bits more_flags=0); + void to_map(size_t node, type_bits more_flags=0); + void to_seq(size_t node, type_bits more_flags=0); + void to_doc(size_t node, type_bits more_flags=0); + void to_stream(size_t node, type_bits more_flags=0); + + void set_key(size_t node, csubstr key) { RYML_ASSERT(has_key(node)); _p(node)->m_key.scalar = key; } + void set_val(size_t node, csubstr val) { RYML_ASSERT(has_val(node)); _p(node)->m_val.scalar = val; } + + void set_key_tag(size_t node, csubstr tag) { RYML_ASSERT(has_key(node)); _p(node)->m_key.tag = tag; _add_flags(node, KEYTAG); } + void set_val_tag(size_t node, csubstr tag) { RYML_ASSERT(has_val(node) || is_container(node)); _p(node)->m_val.tag = tag; _add_flags(node, VALTAG); } + + void set_key_anchor(size_t node, csubstr anchor) { RYML_ASSERT( ! is_key_ref(node)); _p(node)->m_key.anchor = anchor.triml('&'); _add_flags(node, KEYANCH); } + void set_val_anchor(size_t node, csubstr anchor) { RYML_ASSERT( ! is_val_ref(node)); _p(node)->m_val.anchor = anchor.triml('&'); _add_flags(node, VALANCH); } + void set_key_ref (size_t node, csubstr ref ) { RYML_ASSERT( ! has_key_anchor(node)); NodeData* C4_RESTRICT n = _p(node); n->m_key.set_ref_maybe_replacing_scalar(ref, n->m_type.has_key()); _add_flags(node, KEY|KEYREF); } + void set_val_ref (size_t node, csubstr ref ) { RYML_ASSERT( ! has_val_anchor(node)); NodeData* C4_RESTRICT n = _p(node); n->m_val.set_ref_maybe_replacing_scalar(ref, n->m_type.has_val()); _add_flags(node, VAL|VALREF); } + + void rem_key_anchor(size_t node) { _p(node)->m_key.anchor.clear(); _rem_flags(node, KEYANCH); } + void rem_val_anchor(size_t node) { _p(node)->m_val.anchor.clear(); _rem_flags(node, VALANCH); } + void rem_key_ref (size_t node) { _p(node)->m_key.anchor.clear(); _rem_flags(node, KEYREF); } + void rem_val_ref (size_t node) { _p(node)->m_val.anchor.clear(); _rem_flags(node, VALREF); } + void rem_anchor_ref(size_t node) { _p(node)->m_key.anchor.clear(); _p(node)->m_val.anchor.clear(); _rem_flags(node, KEYANCH|VALANCH|KEYREF|VALREF); } + + /** @} */ + +public: + + /** @name tree modifiers */ + /** @{ */ + + /** reorder the tree in memory so that all the nodes are stored + * in a linear sequence when visited in depth-first order. + * This will invalidate existing ids, since the node id is its + * position in the node array. */ + void reorder(); + + /** Resolve references (aliases <- anchors) in the tree. + * + * Dereferencing is opt-in; after parsing, Tree::resolve() + * has to be called explicitly for obtaining resolved references in the + * tree. This method will resolve all references and substitute the + * anchored values in place of the reference. 
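+ *
+ * For illustration, a minimal sketch (variable names are arbitrary; only
+ * members of this class are used): starting from an empty tree, create an
+ * anchored value and an alias to it, then resolve:
+ *
+ *   Tree t;
+ *   size_t root = t.root_id();   // ensures the root node exists
+ *   t.to_seq(root);              // make the root a sequence
+ *   size_t a = t.append_child(root);
+ *   t.to_val(a, "42");
+ *   t.set_val_anchor(a, "anc");  // yaml: - &anc 42
+ *   size_t r = t.append_child(root);
+ *   t.to_val(r, "~");
+ *   t.set_val_ref(r, "*anc");    // yaml: - *anc
+ *   t.resolve();                 // the alias node now holds the anchored value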
+ * + * This method first does a full traversal of the tree to gather all + * anchors and references in a separate collection, then it goes through + * that collection to locate the names, which it does by obeying the YAML + * standard diktat that "an alias node refers to the most recent node in + * the serialization having the specified anchor" + * + * So, depending on the number of anchor/alias nodes, this is a + * potentially expensive operation, with a best-case linear complexity + * (from the initial traversal). This potential cost is the reason for + * requiring an explicit call. + */ + void resolve(); + + /** @} */ + +public: + + /** @name tag directives */ + /** @{ */ + + void resolve_tags(); + + size_t num_tag_directives() const; + size_t add_tag_directive(TagDirective const& td); + void clear_tag_directives(); + + size_t resolve_tag(substr output, csubstr tag, size_t node_id) const; + csubstr resolve_tag_sub(substr output, csubstr tag, size_t node_id) const + { + size_t needed = resolve_tag(output, tag, node_id); + return needed <= output.len ? output.first(needed) : output; + } + + using tag_directive_const_iterator = TagDirective const*; + tag_directive_const_iterator begin_tag_directives() const { return m_tag_directives; } + tag_directive_const_iterator end_tag_directives() const { return m_tag_directives + num_tag_directives(); } + + struct TagDirectiveProxy + { + tag_directive_const_iterator b, e; + tag_directive_const_iterator begin() const { return b; } + tag_directive_const_iterator end() const { return e; } + }; + + TagDirectiveProxy tag_directives() const { return TagDirectiveProxy{begin_tag_directives(), end_tag_directives()}; } + + /** @} */ + +public: + + /** @name modifying hierarchy */ + /** @{ */ + + /** create and insert a new child of @p parent. insert after the (to-be) + * sibling @p after, which must be a child of @p parent. To insert as the + * first child, set after to NONE */ + C4_ALWAYS_INLINE size_t insert_child(size_t parent, size_t after) + { + RYML_ASSERT(parent != NONE); + RYML_ASSERT(is_container(parent) || is_root(parent)); + RYML_ASSERT(after == NONE || (_p(after)->m_parent == parent)); + size_t child = _claim(); + _set_hierarchy(child, parent, after); + return child; + } + /** create and insert a node as the first child of @p parent */ + C4_ALWAYS_INLINE size_t prepend_child(size_t parent) { return insert_child(parent, NONE); } + /** create and insert a node as the last child of @p parent */ + C4_ALWAYS_INLINE size_t append_child(size_t parent) { return insert_child(parent, _p(parent)->m_last_child); } + +public: + + #if defined(__clang__) + # pragma clang diagnostic push + # pragma clang diagnostic ignored "-Wnull-dereference" + #elif defined(__GNUC__) + # pragma GCC diagnostic push + # if __GNUC__ >= 6 + # pragma GCC diagnostic ignored "-Wnull-dereference" + # endif + #endif + + //! create and insert a new sibling of n. 
insert after "after" + C4_ALWAYS_INLINE size_t insert_sibling(size_t node, size_t after) + { + return insert_child(_p(node)->m_parent, after); + } + /** create and insert a node as the first node of @p parent */ + C4_ALWAYS_INLINE size_t prepend_sibling(size_t node) { return prepend_child(_p(node)->m_parent); } + C4_ALWAYS_INLINE size_t append_sibling(size_t node) { return append_child(_p(node)->m_parent); } + +public: + + /** remove an entire branch at once: ie remove the children and the node itself */ + inline void remove(size_t node) + { + remove_children(node); + _release(node); + } + + /** remove all the node's children, but keep the node itself */ + void remove_children(size_t node); + + /** change the @p type of the node to one of MAP, SEQ or VAL. @p + * type must have one and only one of MAP,SEQ,VAL; @p type may + * possibly have KEY, but if it does, then the @p node must also + * have KEY. Changing to the same type is a no-op. Otherwise, + * changing to a different type will initialize the node with an + * empty value of the desired type: changing to VAL will + * initialize with a null scalar (~), changing to MAP will + * initialize with an empty map ({}), and changing to SEQ will + * initialize with an empty seq ([]). */ + bool change_type(size_t node, NodeType type); + + bool change_type(size_t node, type_bits type) + { + return change_type(node, (NodeType)type); + } + + #if defined(__clang__) + # pragma clang diagnostic pop + #elif defined(__GNUC__) + # pragma GCC diagnostic pop + #endif + +public: + + /** change the node's position in the parent */ + void move(size_t node, size_t after); + + /** change the node's parent and position */ + void move(size_t node, size_t new_parent, size_t after); + + /** change the node's parent and position to a different tree + * @return the index of the new node in the destination tree */ + size_t move(Tree * src, size_t node, size_t new_parent, size_t after); + + /** ensure the first node is a stream. Eg, change this tree + * + * DOCMAP + * MAP + * KEYVAL + * KEYVAL + * SEQ + * VAL + * + * to + * + * STREAM + * DOCMAP + * MAP + * KEYVAL + * KEYVAL + * SEQ + * VAL + * + * If the root is already a stream, this is a no-op. + */ + void set_root_as_stream(); + +public: + + /** recursively duplicate a node from this tree into a new parent, + * placing it after one of its children + * @return the index of the copy */ + size_t duplicate(size_t node, size_t new_parent, size_t after); + /** recursively duplicate a node from a different tree into a new parent, + * placing it after one of its children + * @return the index of the copy */ + size_t duplicate(Tree const* src, size_t node, size_t new_parent, size_t after); + + /** recursively duplicate the node's children (but not the node) + * @return the index of the last duplicated child */ + size_t duplicate_children(size_t node, size_t parent, size_t after); + /** recursively duplicate the node's children (but not the node), where + * the node is from a different tree + * @return the index of the last duplicated child */ + size_t duplicate_children(Tree const* src, size_t node, size_t parent, size_t after); + + void duplicate_contents(size_t node, size_t where); + void duplicate_contents(Tree const* src, size_t node, size_t where); + + /** duplicate the node's children (but not the node) in a new parent, but + * omit repetitions where a duplicated node has the same key (in maps) or + * value (in seqs). 
If one of the duplicated children has the same key + * (in maps) or value (in seqs) as one of the parent's children, the one + * that is placed closest to the end will prevail. */ + size_t duplicate_children_no_rep(size_t node, size_t parent, size_t after); + size_t duplicate_children_no_rep(Tree const* src, size_t node, size_t parent, size_t after); + +public: + + void merge_with(Tree const* src, size_t src_node=NONE, size_t dst_root=NONE); + + /** @} */ + +public: + + /** @name internal string arena */ + /** @{ */ + + /** get the current size of the tree's internal arena */ + RYML_DEPRECATED("use arena_size() instead") size_t arena_pos() const { return m_arena_pos; } + /** get the current size of the tree's internal arena */ + inline size_t arena_size() const { return m_arena_pos; } + /** get the current capacity of the tree's internal arena */ + inline size_t arena_capacity() const { return m_arena.len; } + /** get the current slack of the tree's internal arena */ + inline size_t arena_slack() const { RYML_ASSERT(m_arena.len >= m_arena_pos); return m_arena.len - m_arena_pos; } + + /** get the current arena */ + substr arena() const { return m_arena.first(m_arena_pos); } + + /** return true if the given substring is part of the tree's string arena */ + bool in_arena(csubstr s) const + { + return m_arena.is_super(s); + } + + /** serialize the given floating-point variable to the tree's + * arena, growing it as needed to accomodate the serialization. + * + * @note Growing the arena may cause relocation of the entire + * existing arena, and thus change the contents of individual + * nodes, and thus cost O(numnodes)+O(arenasize). To avoid this + * cost, ensure that the arena is reserved to an appropriate size + * using .reserve_arena() + * + * @see alloc_arena() */ + template + typename std::enable_if::value, csubstr>::type + to_arena(T const& C4_RESTRICT a) + { + substr rem(m_arena.sub(m_arena_pos)); + size_t num = to_chars_float(rem, a); + if(num > rem.len) + { + rem = _grow_arena(num); + num = to_chars_float(rem, a); + RYML_ASSERT(num <= rem.len); + } + rem = _request_span(num); + return rem; + } + + /** serialize the given non-floating-point variable to the tree's + * arena, growing it as needed to accomodate the serialization. + * + * @note Growing the arena may cause relocation of the entire + * existing arena, and thus change the contents of individual + * nodes, and thus cost O(numnodes)+O(arenasize). To avoid this + * cost, ensure that the arena is reserved to an appropriate size + * using .reserve_arena() + * + * @see alloc_arena() */ + template + typename std::enable_if::value, csubstr>::type + to_arena(T const& C4_RESTRICT a) + { + substr rem(m_arena.sub(m_arena_pos)); + size_t num = to_chars(rem, a); + if(num > rem.len) + { + rem = _grow_arena(num); + num = to_chars(rem, a); + RYML_ASSERT(num <= rem.len); + } + rem = _request_span(num); + return rem; + } + + /** serialize the given csubstr to the tree's arena, growing the + * arena as needed to accomodate the serialization. + * + * @note Growing the arena may cause relocation of the entire + * existing arena, and thus change the contents of individual + * nodes, and thus cost O(numnodes)+O(arenasize). 
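+ *
+ * For illustration, a brief sketch (given a Tree object t; the capacity
+ * value is arbitrary): reserving up front and then serializing into the
+ * arena:
+ *
+ *   t.reserve_arena(1024);                // grow once, ahead of time
+ *   csubstr s = t.to_arena("some text");  // s now points into the arena
+ *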
To avoid this + * cost, ensure that the arena is reserved to an appropriate size + * using .reserve_arena() + * + * @see alloc_arena() */ + csubstr to_arena(csubstr a) + { + if(a.len > 0) + { + substr rem(m_arena.sub(m_arena_pos)); + size_t num = to_chars(rem, a); + if(num > rem.len) + { + rem = _grow_arena(num); + num = to_chars(rem, a); + RYML_ASSERT(num <= rem.len); + } + return _request_span(num); + } + else + { + if(a.str == nullptr) + { + return csubstr{}; + } + else if(m_arena.str == nullptr) + { + // Arena is empty and we want to store a non-null + // zero-length string. + // Even though the string has zero length, we need + // some "memory" to store a non-nullptr string + _grow_arena(1); + } + return _request_span(0); + } + } + C4_ALWAYS_INLINE csubstr to_arena(const char *s) + { + return to_arena(to_csubstr(s)); + } + C4_ALWAYS_INLINE csubstr to_arena(std::nullptr_t) + { + return csubstr{}; + } + + /** copy the given substr to the tree's arena, growing it by the + * required size + * + * @note Growing the arena may cause relocation of the entire + * existing arena, and thus change the contents of individual + * nodes, and thus cost O(numnodes)+O(arenasize). To avoid this + * cost, ensure that the arena is reserved to an appropriate size + * using .reserve_arena() + * + * @see alloc_arena() */ + substr copy_to_arena(csubstr s) + { + substr cp = alloc_arena(s.len); + RYML_ASSERT(cp.len == s.len); + RYML_ASSERT(!s.overlaps(cp)); + #if (!defined(__clang__)) && (defined(__GNUC__) && __GNUC__ >= 10) + C4_SUPPRESS_WARNING_GCC_PUSH + C4_SUPPRESS_WARNING_GCC("-Wstringop-overflow=") // no need for terminating \0 + C4_SUPPRESS_WARNING_GCC( "-Wrestrict") // there's an assert to ensure no violation of restrict behavior + #endif + if(s.len) + memcpy(cp.str, s.str, s.len); + #if (!defined(__clang__)) && (defined(__GNUC__) && __GNUC__ >= 10) + C4_SUPPRESS_WARNING_GCC_POP + #endif + return cp; + } + + /** grow the tree's string arena by the given size and return a substr + * of the added portion + * + * @note Growing the arena may cause relocation of the entire + * existing arena, and thus change the contents of individual + * nodes, and thus cost O(numnodes)+O(arenasize). To avoid this + * cost, ensure that the arena is reserved to an appropriate size + * using .reserve_arena(). + * + * @see reserve_arena() */ + substr alloc_arena(size_t sz) + { + if(sz > arena_slack()) + _grow_arena(sz - arena_slack()); + substr s = _request_span(sz); + return s; + } + + /** ensure the tree's internal string arena is at least the given capacity + * @note This operation has a potential complexity of O(numNodes)+O(arenasize). + * Growing the arena may cause relocation of the entire + * existing arena, and thus change the contents of individual nodes. */ + void reserve_arena(size_t arena_cap) + { + if(arena_cap > m_arena.len) + { + substr buf; + buf.str = (char*) m_callbacks.m_allocate(arena_cap, m_arena.str, m_callbacks.m_user_data); + buf.len = arena_cap; + if(m_arena.str) + { + RYML_ASSERT(m_arena.len >= 0); + _relocate(buf); // does a memcpy and changes nodes using the arena + m_callbacks.m_free(m_arena.str, m_arena.len, m_callbacks.m_user_data); + } + m_arena = buf; + } + } + + /** @} */ + +private: + + substr _grow_arena(size_t more) + { + size_t cap = m_arena.len + more; + cap = cap < 2 * m_arena.len ? 2 * m_arena.len : cap; + cap = cap < 64 ? 
64 : cap; + reserve_arena(cap); + return m_arena.sub(m_arena_pos); + } + + substr _request_span(size_t sz) + { + substr s; + s = m_arena.sub(m_arena_pos, sz); + m_arena_pos += sz; + return s; + } + + substr _relocated(csubstr s, substr next_arena) const + { + RYML_ASSERT(m_arena.is_super(s)); + RYML_ASSERT(m_arena.sub(0, m_arena_pos).is_super(s)); + auto pos = (s.str - m_arena.str); + substr r(next_arena.str + pos, s.len); + RYML_ASSERT(r.str - next_arena.str == pos); + RYML_ASSERT(next_arena.sub(0, m_arena_pos).is_super(r)); + return r; + } + +public: + + /** @name lookup */ + /** @{ */ + + struct lookup_result + { + size_t target; + size_t closest; + size_t path_pos; + csubstr path; + + inline operator bool() const { return target != NONE; } + + lookup_result() : target(NONE), closest(NONE), path_pos(0), path() {} + lookup_result(csubstr path_, size_t start) : target(NONE), closest(start), path_pos(0), path(path_) {} + + /** get the part ot the input path that was resolved */ + csubstr resolved() const; + /** get the part ot the input path that was unresolved */ + csubstr unresolved() const; + }; + + /** for example foo.bar[0].baz */ + lookup_result lookup_path(csubstr path, size_t start=NONE) const; + + /** defaulted lookup: lookup @p path; if the lookup fails, recursively modify + * the tree so that the corresponding lookup_path() would return the + * default value. + * @see lookup_path() */ + size_t lookup_path_or_modify(csubstr default_value, csubstr path, size_t start=NONE); + + /** defaulted lookup: lookup @p path; if the lookup fails, recursively modify + * the tree so that the corresponding lookup_path() would return the + * branch @p src_node (from the tree @p src). + * @see lookup_path() */ + size_t lookup_path_or_modify(Tree const *src, size_t src_node, csubstr path, size_t start=NONE); + + /** @} */ + +private: + + struct _lookup_path_token + { + csubstr value; + NodeType type; + _lookup_path_token() : value(), type() {} + _lookup_path_token(csubstr v, NodeType t) : value(v), type(t) {} + inline operator bool() const { return type != NOTYPE; } + bool is_index() const { return value.begins_with('[') && value.ends_with(']'); } + }; + + size_t _lookup_path_or_create(csubstr path, size_t start); + + void _lookup_path (lookup_result *r) const; + void _lookup_path_modify(lookup_result *r); + + size_t _next_node (lookup_result *r, _lookup_path_token *parent) const; + size_t _next_node_modify(lookup_result *r, _lookup_path_token *parent); + + void _advance(lookup_result *r, size_t more) const; + + _lookup_path_token _next_token(lookup_result *r, _lookup_path_token const& parent) const; + +private: + + void _clear(); + void _free(); + void _copy(Tree const& that); + void _move(Tree & that); + + void _relocate(substr next_arena); + +public: + + #if ! 
RYML_USE_ASSERT + C4_ALWAYS_INLINE void _check_next_flags(size_t, type_bits) {} + #else + void _check_next_flags(size_t node, type_bits f) + { + auto n = _p(node); + type_bits o = n->m_type; // old + C4_UNUSED(o); + if(f & MAP) + { + RYML_ASSERT_MSG((f & SEQ) == 0, "cannot mark simultaneously as map and seq"); + RYML_ASSERT_MSG((f & VAL) == 0, "cannot mark simultaneously as map and val"); + RYML_ASSERT_MSG((o & SEQ) == 0, "cannot turn a seq into a map; clear first"); + RYML_ASSERT_MSG((o & VAL) == 0, "cannot turn a val into a map; clear first"); + } + else if(f & SEQ) + { + RYML_ASSERT_MSG((f & MAP) == 0, "cannot mark simultaneously as seq and map"); + RYML_ASSERT_MSG((f & VAL) == 0, "cannot mark simultaneously as seq and val"); + RYML_ASSERT_MSG((o & MAP) == 0, "cannot turn a map into a seq; clear first"); + RYML_ASSERT_MSG((o & VAL) == 0, "cannot turn a val into a seq; clear first"); + } + if(f & KEY) + { + RYML_ASSERT(!is_root(node)); + auto pid = parent(node); C4_UNUSED(pid); + RYML_ASSERT(is_map(pid)); + } + if((f & VAL) && !is_root(node)) + { + auto pid = parent(node); C4_UNUSED(pid); + RYML_ASSERT(is_map(pid) || is_seq(pid)); + } + } + #endif + + inline void _set_flags(size_t node, NodeType_e f) { _check_next_flags(node, f); _p(node)->m_type = f; } + inline void _set_flags(size_t node, type_bits f) { _check_next_flags(node, f); _p(node)->m_type = f; } + + inline void _add_flags(size_t node, NodeType_e f) { NodeData *d = _p(node); type_bits fb = f | d->m_type; _check_next_flags(node, fb); d->m_type = (NodeType_e) fb; } + inline void _add_flags(size_t node, type_bits f) { NodeData *d = _p(node); f |= d->m_type; _check_next_flags(node, f); d->m_type = f; } + + inline void _rem_flags(size_t node, NodeType_e f) { NodeData *d = _p(node); type_bits fb = d->m_type & ~f; _check_next_flags(node, fb); d->m_type = (NodeType_e) fb; } + inline void _rem_flags(size_t node, type_bits f) { NodeData *d = _p(node); f = d->m_type & ~f; _check_next_flags(node, f); d->m_type = f; } + + void _set_key(size_t node, csubstr key, type_bits more_flags=0) + { + _p(node)->m_key.scalar = key; + _add_flags(node, KEY|more_flags); + } + void _set_key(size_t node, NodeScalar const& key, type_bits more_flags=0) + { + _p(node)->m_key = key; + _add_flags(node, KEY|more_flags); + } + + void _set_val(size_t node, csubstr val, type_bits more_flags=0) + { + RYML_ASSERT(num_children(node) == 0); + RYML_ASSERT(!is_seq(node) && !is_map(node)); + _p(node)->m_val.scalar = val; + _add_flags(node, VAL|more_flags); + } + void _set_val(size_t node, NodeScalar const& val, type_bits more_flags=0) + { + RYML_ASSERT(num_children(node) == 0); + RYML_ASSERT( ! is_container(node)); + _p(node)->m_val = val; + _add_flags(node, VAL|more_flags); + } + + void _set(size_t node, NodeInit const& i) + { + RYML_ASSERT(i._check()); + NodeData *n = _p(node); + RYML_ASSERT(n->m_key.scalar.empty() || i.key.scalar.empty() || i.key.scalar == n->m_key.scalar); + _add_flags(node, i.type); + if(n->m_key.scalar.empty()) + { + if( ! i.key.scalar.empty()) + { + _set_key(node, i.key.scalar); + } + } + n->m_key.tag = i.key.tag; + n->m_val = i.val; + } + + void _set_parent_as_container_if_needed(size_t in) + { + NodeData const* n = _p(in); + size_t ip = parent(in); + if(ip != NONE) + { + if( ! (is_seq(ip) || is_map(ip))) + { + if((in == first_child(ip)) && (in == last_child(ip))) + { + if( ! 
n->m_key.empty() || has_key(in)) + { + _add_flags(ip, MAP); + } + else + { + _add_flags(ip, SEQ); + } + } + } + } + } + + void _seq2map(size_t node) + { + RYML_ASSERT(is_seq(node)); + for(size_t i = first_child(node); i != NONE; i = next_sibling(i)) + { + NodeData *C4_RESTRICT ch = _p(i); + if(ch->m_type.is_keyval()) + continue; + ch->m_type.add(KEY); + ch->m_key = ch->m_val; + } + auto *C4_RESTRICT n = _p(node); + n->m_type.rem(SEQ); + n->m_type.add(MAP); + } + + size_t _do_reorder(size_t *node, size_t count); + + void _swap(size_t n_, size_t m_); + void _swap_props(size_t n_, size_t m_); + void _swap_hierarchy(size_t n_, size_t m_); + void _copy_hierarchy(size_t dst_, size_t src_); + + inline void _copy_props(size_t dst_, size_t src_) + { + _copy_props(dst_, this, src_); + } + + inline void _copy_props_wo_key(size_t dst_, size_t src_) + { + _copy_props_wo_key(dst_, this, src_); + } + + void _copy_props(size_t dst_, Tree const* that_tree, size_t src_) + { + auto & C4_RESTRICT dst = *_p(dst_); + auto const& C4_RESTRICT src = *that_tree->_p(src_); + dst.m_type = src.m_type; + dst.m_key = src.m_key; + dst.m_val = src.m_val; + } + + void _copy_props_wo_key(size_t dst_, Tree const* that_tree, size_t src_) + { + auto & C4_RESTRICT dst = *_p(dst_); + auto const& C4_RESTRICT src = *that_tree->_p(src_); + dst.m_type = (src.m_type & ~_KEYMASK) | (dst.m_type & _KEYMASK); + dst.m_val = src.m_val; + } + + inline void _clear_type(size_t node) + { + _p(node)->m_type = NOTYPE; + } + + inline void _clear(size_t node) + { + auto *C4_RESTRICT n = _p(node); + n->m_type = NOTYPE; + n->m_key.clear(); + n->m_val.clear(); + n->m_parent = NONE; + n->m_first_child = NONE; + n->m_last_child = NONE; + } + + inline void _clear_key(size_t node) + { + _p(node)->m_key.clear(); + _rem_flags(node, KEY); + } + + inline void _clear_val(size_t node) + { + _p(node)->m_val.clear(); + _rem_flags(node, VAL); + } + +private: + + void _clear_range(size_t first, size_t num); + + size_t _claim(); + void _claim_root(); + void _release(size_t node); + void _free_list_add(size_t node); + void _free_list_rem(size_t node); + + void _set_hierarchy(size_t node, size_t parent, size_t after_sibling); + void _rem_hierarchy(size_t node); + +public: + + // members are exposed, but you should NOT access them directly + + NodeData * m_buf; + size_t m_cap; + + size_t m_size; + + size_t m_free_head; + size_t m_free_tail; + + substr m_arena; + size_t m_arena_pos; + + Callbacks m_callbacks; + + TagDirective m_tag_directives[RYML_MAX_TAG_DIRECTIVES]; + +}; + +} // namespace yml +} // namespace c4 + + +C4_SUPPRESS_WARNING_MSVC_POP +C4_SUPPRESS_WARNING_GCC_CLANG_POP + + +#endif /* _C4_YML_TREE_HPP_ */ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/tree.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/node.hpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/node.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_YML_NODE_HPP_ +#define _C4_YML_NODE_HPP_ + +/** @file node.hpp + * @see NodeRef */ + +//included above: +//#include + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/tree.hpp +//#include "c4/yml/tree.hpp" +#if !defined(C4_YML_TREE_HPP_) && !defined(_C4_YML_TREE_HPP_) +#error "amalgamate: file c4/yml/tree.hpp must 
have been included at this point" +#endif /* C4_YML_TREE_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/base64.hpp +//#include "c4/base64.hpp" +#if !defined(C4_BASE64_HPP_) && !defined(_C4_BASE64_HPP_) +#error "amalgamate: file c4/base64.hpp must have been included at this point" +#endif /* C4_BASE64_HPP_ */ + + +#ifdef __GNUC__ +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wtype-limits" +#endif + +#if defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable: 4251/*needs to have dll-interface to be used by clients of struct*/) +# pragma warning(disable: 4296/*expression is always 'boolean_value'*/) +#endif + +namespace c4 { +namespace yml { + +template struct Key { K & k; }; +template<> struct Key { fmt::const_base64_wrapper wrapper; }; +template<> struct Key { fmt::base64_wrapper wrapper; }; + +template C4_ALWAYS_INLINE Key key(K & k) { return Key{k}; } +C4_ALWAYS_INLINE Key key(fmt::const_base64_wrapper w) { return {w}; } +C4_ALWAYS_INLINE Key key(fmt::base64_wrapper w) { return {w}; } + +template void write(NodeRef *n, T const& v); + +template +typename std::enable_if< ! std::is_floating_point::value, bool>::type +read(NodeRef const& n, T *v); + +template +typename std::enable_if< std::is_floating_point::value, bool>::type +read(NodeRef const& n, T *v); + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +// forward decls +class NodeRef; +class ConstNodeRef; + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +namespace detail { + +template +struct child_iterator +{ + using value_type = NodeRefType; + using tree_type = typename NodeRefType::tree_type; + + tree_type * C4_RESTRICT m_tree; + size_t m_child_id; + + child_iterator(tree_type * t, size_t id) : m_tree(t), m_child_id(id) {} + + child_iterator& operator++ () { RYML_ASSERT(m_child_id != NONE); m_child_id = m_tree->next_sibling(m_child_id); return *this; } + child_iterator& operator-- () { RYML_ASSERT(m_child_id != NONE); m_child_id = m_tree->prev_sibling(m_child_id); return *this; } + + NodeRefType operator* () const { return NodeRefType(m_tree, m_child_id); } + NodeRefType operator-> () const { return NodeRefType(m_tree, m_child_id); } + + bool operator!= (child_iterator that) const { RYML_ASSERT(m_tree == that.m_tree); return m_child_id != that.m_child_id; } + bool operator== (child_iterator that) const { RYML_ASSERT(m_tree == that.m_tree); return m_child_id == that.m_child_id; } +}; + +template +struct children_view_ +{ + using n_iterator = child_iterator; + + n_iterator b, e; + + inline children_view_(n_iterator const& C4_RESTRICT b_, + n_iterator const& C4_RESTRICT e_) : b(b_), e(e_) {} + + inline n_iterator begin() const { return b; } + inline n_iterator end () const { return e; } +}; + +template +bool _visit(NodeRefType &node, Visitor fn, size_t indentation_level, bool skip_root=false) +{ + size_t increment = 0; + if( ! 
(node.is_root() && skip_root)) + { + if(fn(node, indentation_level)) + return true; + ++increment; + } + if(node.has_children()) + { + for(auto ch : node.children()) + { + if(_visit(ch, fn, indentation_level + increment, false)) // no need to forward skip_root as it won't be root + { + return true; + } + } + } + return false; +} + +template +bool _visit_stacked(NodeRefType &node, Visitor fn, size_t indentation_level, bool skip_root=false) +{ + size_t increment = 0; + if( ! (node.is_root() && skip_root)) + { + if(fn(node, indentation_level)) + { + return true; + } + ++increment; + } + if(node.has_children()) + { + fn.push(node, indentation_level); + for(auto ch : node.children()) + { + if(_visit_stacked(ch, fn, indentation_level + increment, false)) // no need to forward skip_root as it won't be root + { + fn.pop(node, indentation_level); + return true; + } + } + fn.pop(node, indentation_level); + } + return false; +} + + +//----------------------------------------------------------------------------- + +/** a CRTP base for read-only node methods */ +template +struct RoNodeMethods +{ + C4_SUPPRESS_WARNING_GCC_CLANG_WITH_PUSH("-Wcast-align") + // helper CRTP macros, undefined at the end + #define tree_ ((ConstImpl const* C4_RESTRICT)this)->m_tree + #define id_ ((ConstImpl const* C4_RESTRICT)this)->m_id + #define tree__ ((Impl const* C4_RESTRICT)this)->m_tree + #define id__ ((Impl const* C4_RESTRICT)this)->m_id + // require valid + #define _C4RV() \ + RYML_ASSERT(tree_ != nullptr); \ + _RYML_CB_ASSERT(tree_->m_callbacks, id_ != NONE) + #define _C4_IF_MUTABLE(ty) typename std::enable_if::value, ty>::type + +public: + + /** @name node property getters */ + /** @{ */ + + /** returns the data or null when the id is NONE */ + C4_ALWAYS_INLINE C4_PURE NodeData const* get() const noexcept { RYML_ASSERT(tree_ != nullptr); return tree_->get(id_); } + /** returns the data or null when the id is NONE */ + template + C4_ALWAYS_INLINE C4_PURE auto get() noexcept -> _C4_IF_MUTABLE(NodeData*) { RYML_ASSERT(tree_ != nullptr); return tree__->get(id__); } + + C4_ALWAYS_INLINE C4_PURE NodeType type() const noexcept { _C4RV(); return tree_->type(id_); } + C4_ALWAYS_INLINE C4_PURE const char* type_str() const noexcept { return tree_->type_str(id_); } + + C4_ALWAYS_INLINE C4_PURE csubstr key() const noexcept { _C4RV(); return tree_->key(id_); } + C4_ALWAYS_INLINE C4_PURE csubstr key_tag() const noexcept { _C4RV(); return tree_->key_tag(id_); } + C4_ALWAYS_INLINE C4_PURE csubstr key_ref() const noexcept { _C4RV(); return tree_->key_ref(id_); } + C4_ALWAYS_INLINE C4_PURE csubstr key_anchor() const noexcept { _C4RV(); return tree_->key_anchor(id_); } + + C4_ALWAYS_INLINE C4_PURE csubstr val() const noexcept { _C4RV(); return tree_->val(id_); } + C4_ALWAYS_INLINE C4_PURE csubstr val_tag() const noexcept { _C4RV(); return tree_->val_tag(id_); } + C4_ALWAYS_INLINE C4_PURE csubstr val_ref() const noexcept { _C4RV(); return tree_->val_ref(id_); } + C4_ALWAYS_INLINE C4_PURE csubstr val_anchor() const noexcept { _C4RV(); return tree_->val_anchor(id_); } + + C4_ALWAYS_INLINE C4_PURE NodeScalar const& keysc() const noexcept { _C4RV(); return tree_->keysc(id_); } + C4_ALWAYS_INLINE C4_PURE NodeScalar const& valsc() const noexcept { _C4RV(); return tree_->valsc(id_); } + + C4_ALWAYS_INLINE C4_PURE bool key_is_null() const noexcept { _C4RV(); return tree_->key_is_null(id_); } + C4_ALWAYS_INLINE C4_PURE bool val_is_null() const noexcept { _C4RV(); return tree_->val_is_null(id_); } + + /** @} */ + +public: + + /** @name node 
property predicates */ + /** @{ */ + + C4_ALWAYS_INLINE C4_PURE bool empty() const noexcept { _C4RV(); return tree_->empty(id_); } + C4_ALWAYS_INLINE C4_PURE bool is_stream() const noexcept { _C4RV(); return tree_->is_stream(id_); } + C4_ALWAYS_INLINE C4_PURE bool is_doc() const noexcept { _C4RV(); return tree_->is_doc(id_); } + C4_ALWAYS_INLINE C4_PURE bool is_container() const noexcept { _C4RV(); return tree_->is_container(id_); } + C4_ALWAYS_INLINE C4_PURE bool is_map() const noexcept { _C4RV(); return tree_->is_map(id_); } + C4_ALWAYS_INLINE C4_PURE bool is_seq() const noexcept { _C4RV(); return tree_->is_seq(id_); } + C4_ALWAYS_INLINE C4_PURE bool has_val() const noexcept { _C4RV(); return tree_->has_val(id_); } + C4_ALWAYS_INLINE C4_PURE bool has_key() const noexcept { _C4RV(); return tree_->has_key(id_); } + C4_ALWAYS_INLINE C4_PURE bool is_val() const noexcept { _C4RV(); return tree_->is_val(id_); } + C4_ALWAYS_INLINE C4_PURE bool is_keyval() const noexcept { _C4RV(); return tree_->is_keyval(id_); } + C4_ALWAYS_INLINE C4_PURE bool has_key_tag() const noexcept { _C4RV(); return tree_->has_key_tag(id_); } + C4_ALWAYS_INLINE C4_PURE bool has_val_tag() const noexcept { _C4RV(); return tree_->has_val_tag(id_); } + C4_ALWAYS_INLINE C4_PURE bool has_key_anchor() const noexcept { _C4RV(); return tree_->has_key_anchor(id_); } + C4_ALWAYS_INLINE C4_PURE bool is_key_anchor() const noexcept { _C4RV(); return tree_->is_key_anchor(id_); } + C4_ALWAYS_INLINE C4_PURE bool has_val_anchor() const noexcept { _C4RV(); return tree_->has_val_anchor(id_); } + C4_ALWAYS_INLINE C4_PURE bool is_val_anchor() const noexcept { _C4RV(); return tree_->is_val_anchor(id_); } + C4_ALWAYS_INLINE C4_PURE bool has_anchor() const noexcept { _C4RV(); return tree_->has_anchor(id_); } + C4_ALWAYS_INLINE C4_PURE bool is_anchor() const noexcept { _C4RV(); return tree_->is_anchor(id_); } + C4_ALWAYS_INLINE C4_PURE bool is_key_ref() const noexcept { _C4RV(); return tree_->is_key_ref(id_); } + C4_ALWAYS_INLINE C4_PURE bool is_val_ref() const noexcept { _C4RV(); return tree_->is_val_ref(id_); } + C4_ALWAYS_INLINE C4_PURE bool is_ref() const noexcept { _C4RV(); return tree_->is_ref(id_); } + C4_ALWAYS_INLINE C4_PURE bool is_anchor_or_ref() const noexcept { _C4RV(); return tree_->is_anchor_or_ref(id_); } + C4_ALWAYS_INLINE C4_PURE bool is_key_quoted() const noexcept { _C4RV(); return tree_->is_key_quoted(id_); } + C4_ALWAYS_INLINE C4_PURE bool is_val_quoted() const noexcept { _C4RV(); return tree_->is_val_quoted(id_); } + C4_ALWAYS_INLINE C4_PURE bool is_quoted() const noexcept { _C4RV(); return tree_->is_quoted(id_); } + C4_ALWAYS_INLINE C4_PURE bool parent_is_seq() const noexcept { _C4RV(); return tree_->parent_is_seq(id_); } + C4_ALWAYS_INLINE C4_PURE bool parent_is_map() const noexcept { _C4RV(); return tree_->parent_is_map(id_); } + + /** @} */ + +public: + + /** @name hierarchy predicates */ + /** @{ */ + + C4_ALWAYS_INLINE C4_PURE bool is_root() const noexcept { _C4RV(); return tree_->is_root(id_); } + C4_ALWAYS_INLINE C4_PURE bool has_parent() const noexcept { _C4RV(); return tree_->has_parent(id_); } + + C4_ALWAYS_INLINE C4_PURE bool has_child(ConstImpl const& ch) const noexcept { _C4RV(); return tree_->has_child(id_, ch.m_id); } + C4_ALWAYS_INLINE C4_PURE bool has_child(csubstr name) const noexcept { _C4RV(); return tree_->has_child(id_, name); } + C4_ALWAYS_INLINE C4_PURE bool has_children() const noexcept { _C4RV(); return tree_->has_children(id_); } + + C4_ALWAYS_INLINE C4_PURE bool has_sibling(ConstImpl const& n) 
const noexcept { _C4RV(); return tree_->has_sibling(id_, n.m_id); } + C4_ALWAYS_INLINE C4_PURE bool has_sibling(csubstr name) const noexcept { _C4RV(); return tree_->has_sibling(id_, name); } + /** counts with this */ + C4_ALWAYS_INLINE C4_PURE bool has_siblings() const noexcept { _C4RV(); return tree_->has_siblings(id_); } + /** does not count with this */ + C4_ALWAYS_INLINE C4_PURE bool has_other_siblings() const noexcept { _C4RV(); return tree_->has_other_siblings(id_); } + + /** @} */ + +public: + + /** @name hierarchy getters */ + /** @{ */ + + + template + C4_ALWAYS_INLINE C4_PURE auto doc(size_t num) noexcept -> _C4_IF_MUTABLE(Impl) { _C4RV(); return {tree__, tree__->doc(num)}; } + C4_ALWAYS_INLINE C4_PURE ConstImpl doc(size_t num) const noexcept { _C4RV(); return {tree_, tree_->doc(num)}; } + + + template + C4_ALWAYS_INLINE C4_PURE auto parent() noexcept -> _C4_IF_MUTABLE(Impl) { _C4RV(); return {tree__, tree__->parent(id__)}; } + C4_ALWAYS_INLINE C4_PURE ConstImpl parent() const noexcept { _C4RV(); return {tree_, tree_->parent(id_)}; } + + + /** O(#num_children) */ + C4_ALWAYS_INLINE C4_PURE size_t child_pos(ConstImpl const& n) const noexcept { _C4RV(); return tree_->child_pos(id_, n.m_id); } + C4_ALWAYS_INLINE C4_PURE size_t num_children() const noexcept { _C4RV(); return tree_->num_children(id_); } + + template + C4_ALWAYS_INLINE C4_PURE auto first_child() noexcept -> _C4_IF_MUTABLE(Impl) { _C4RV(); return {tree__, tree__->first_child(id__)}; } + C4_ALWAYS_INLINE C4_PURE ConstImpl first_child() const noexcept { _C4RV(); return {tree_, tree_->first_child(id_)}; } + + template + C4_ALWAYS_INLINE C4_PURE auto last_child() noexcept -> _C4_IF_MUTABLE(Impl) { _C4RV(); return {tree__, tree__->last_child(id__)}; } + C4_ALWAYS_INLINE C4_PURE ConstImpl last_child () const noexcept { _C4RV(); return {tree_, tree_->last_child (id_)}; } + + template + C4_ALWAYS_INLINE C4_PURE auto child(size_t pos) noexcept -> _C4_IF_MUTABLE(Impl) { _C4RV(); return {tree__, tree__->child(id__, pos)}; } + C4_ALWAYS_INLINE C4_PURE ConstImpl child(size_t pos) const noexcept { _C4RV(); return {tree_, tree_->child(id_, pos)}; } + + template + C4_ALWAYS_INLINE C4_PURE auto find_child(csubstr name) noexcept -> _C4_IF_MUTABLE(Impl) { _C4RV(); return {tree__, tree__->find_child(id__, name)}; } + C4_ALWAYS_INLINE C4_PURE ConstImpl find_child(csubstr name) const noexcept { _C4RV(); return {tree_, tree_->find_child(id_, name)}; } + + + /** O(#num_siblings) */ + C4_ALWAYS_INLINE C4_PURE size_t num_siblings() const noexcept { _C4RV(); return tree_->num_siblings(id_); } + C4_ALWAYS_INLINE C4_PURE size_t num_other_siblings() const noexcept { _C4RV(); return tree_->num_other_siblings(id_); } + C4_ALWAYS_INLINE C4_PURE size_t sibling_pos(ConstImpl const& n) const noexcept { _C4RV(); return tree_->child_pos(tree_->parent(id_), n.m_id); } + + template + C4_ALWAYS_INLINE C4_PURE auto prev_sibling() noexcept -> _C4_IF_MUTABLE(Impl) { _C4RV(); return {tree__, tree__->prev_sibling(id__)}; } + C4_ALWAYS_INLINE C4_PURE ConstImpl prev_sibling() const noexcept { _C4RV(); return {tree_, tree_->prev_sibling(id_)}; } + + template + C4_ALWAYS_INLINE C4_PURE auto next_sibling() noexcept -> _C4_IF_MUTABLE(Impl) { _C4RV(); return {tree__, tree__->next_sibling(id__)}; } + C4_ALWAYS_INLINE C4_PURE ConstImpl next_sibling() const noexcept { _C4RV(); return {tree_, tree_->next_sibling(id_)}; } + + template + C4_ALWAYS_INLINE C4_PURE auto first_sibling() noexcept -> _C4_IF_MUTABLE(Impl) { _C4RV(); return {tree__, tree__->first_sibling(id__)}; } + 
C4_ALWAYS_INLINE C4_PURE ConstImpl first_sibling() const noexcept { _C4RV(); return {tree_, tree_->first_sibling(id_)}; } + + template + C4_ALWAYS_INLINE C4_PURE auto last_sibling() noexcept -> _C4_IF_MUTABLE(Impl) { _C4RV(); return {tree__, tree__->last_sibling(id__)}; } + C4_ALWAYS_INLINE C4_PURE ConstImpl last_sibling () const noexcept { _C4RV(); return {tree_, tree_->last_sibling(id_)}; } + + template + C4_ALWAYS_INLINE C4_PURE auto sibling(size_t pos) noexcept -> _C4_IF_MUTABLE(Impl) { _C4RV(); return {tree__, tree__->sibling(id__, pos)}; } + C4_ALWAYS_INLINE C4_PURE ConstImpl sibling(size_t pos) const noexcept { _C4RV(); return {tree_, tree_->sibling(id_, pos)}; } + + template + C4_ALWAYS_INLINE C4_PURE auto find_sibling(csubstr name) noexcept -> _C4_IF_MUTABLE(Impl) { _C4RV(); return {tree__, tree__->find_sibling(id__, name)}; } + C4_ALWAYS_INLINE C4_PURE ConstImpl find_sibling(csubstr name) const noexcept { _C4RV(); return {tree_, tree_->find_sibling(id_, name)}; } + + + /** O(num_children) */ + C4_ALWAYS_INLINE C4_PURE ConstImpl operator[] (csubstr k) const noexcept + { + _C4RV(); + size_t ch = tree_->find_child(id_, k); + _RYML_CB_ASSERT(tree_->m_callbacks, ch != NONE); + return {tree_, ch}; + } + /** Find child by key. O(num_children). returns a seed node if no such child is found. */ + template + C4_ALWAYS_INLINE C4_PURE auto operator[] (csubstr k) noexcept -> _C4_IF_MUTABLE(Impl) + { + _C4RV(); + size_t ch = tree__->find_child(id__, k); + return ch != NONE ? Impl(tree__, ch) : NodeRef(tree__, id__, k); + } + + /** O(num_children) */ + C4_ALWAYS_INLINE C4_PURE ConstImpl operator[] (size_t pos) const noexcept + { + _C4RV(); + size_t ch = tree_->child(id_, pos); + _RYML_CB_ASSERT(tree_->m_callbacks, ch != NONE); + return {tree_, ch}; + } + + /** Find child by position. O(pos). returns a seed node if no such child is found. */ + template + C4_ALWAYS_INLINE C4_PURE auto operator[] (size_t pos) noexcept -> _C4_IF_MUTABLE(Impl) + { + _C4RV(); + size_t ch = tree__->child(id__, pos); + return ch != NONE ? Impl(tree__, ch) : NodeRef(tree__, id__, pos); + } + + /** @} */ + +public: + + /** deserialization */ + /** @{ */ + + template + ConstImpl const& operator>> (T &v) const + { + _C4RV(); + if( ! read((ConstImpl const&)*this, &v)) + _RYML_CB_ERR(tree_->m_callbacks, "could not deserialize value"); + return *((ConstImpl const*)this); + } + + /** deserialize the node's key to the given variable */ + template + ConstImpl const& operator>> (Key v) const + { + _C4RV(); + if( ! 
from_chars(key(), &v.k)) + _RYML_CB_ERR(tree_->m_callbacks, "could not deserialize key"); + return *((ConstImpl const*)this); + } + + /** deserialize the node's key as base64 */ + ConstImpl const& operator>> (Key w) const + { + deserialize_key(w.wrapper); + return *((ConstImpl const*)this); + } + + /** deserialize the node's val as base64 */ + ConstImpl const& operator>> (fmt::base64_wrapper w) const + { + deserialize_val(w); + return *((ConstImpl const*)this); + } + + /** decode the base64-encoded key and assign the + * decoded blob to the given buffer/ + * @return the size of base64-decoded blob */ + size_t deserialize_key(fmt::base64_wrapper v) const + { + _C4RV(); + return from_chars(key(), &v); + } + /** decode the base64-encoded key and assign the + * decoded blob to the given buffer/ + * @return the size of base64-decoded blob */ + size_t deserialize_val(fmt::base64_wrapper v) const + { + _C4RV(); + return from_chars(val(), &v); + }; + + template + bool get_if(csubstr name, T *var) const + { + auto ch = find_child(name); + if(!ch.valid()) + return false; + ch >> *var; + return true; + } + + template + bool get_if(csubstr name, T *var, T const& fallback) const + { + auto ch = find_child(name); + if(ch.valid()) + { + ch >> *var; + return true; + } + else + { + *var = fallback; + return false; + } + } + + template + T as() const + { + T val{}; + tree__->cref(id__) >> val; + return val; + } + + /** @} */ + +public: + + #if defined(__clang__) + # pragma clang diagnostic push + # pragma clang diagnostic ignored "-Wnull-dereference" + #elif defined(__GNUC__) + # pragma GCC diagnostic push + # if __GNUC__ >= 6 + # pragma GCC diagnostic ignored "-Wnull-dereference" + # endif + #endif + + /** @name iteration */ + /** @{ */ + + using iterator = detail::child_iterator; + using const_iterator = detail::child_iterator; + using children_view = detail::children_view_; + using const_children_view = detail::children_view_; + + template + C4_ALWAYS_INLINE C4_PURE auto begin() noexcept -> _C4_IF_MUTABLE(iterator) { _C4RV(); return iterator(tree__, tree__->first_child(id__)); } + C4_ALWAYS_INLINE C4_PURE const_iterator begin() const noexcept { _C4RV(); return const_iterator(tree_, tree_->first_child(id_)); } + C4_ALWAYS_INLINE C4_PURE const_iterator cbegin() const noexcept { _C4RV(); return const_iterator(tree_, tree_->first_child(id_)); } + + template + C4_ALWAYS_INLINE C4_PURE auto end() noexcept -> _C4_IF_MUTABLE(iterator) { _C4RV(); return iterator(tree__, NONE); } + C4_ALWAYS_INLINE C4_PURE const_iterator end() const noexcept { _C4RV(); return const_iterator(tree_, NONE); } + C4_ALWAYS_INLINE C4_PURE const_iterator cend() const noexcept { _C4RV(); return const_iterator(tree_, tree_->first_child(id_)); } + + /** get an iterable view over children */ + template + C4_ALWAYS_INLINE C4_PURE auto children() noexcept -> _C4_IF_MUTABLE(children_view) { _C4RV(); return children_view(begin(), end()); } + /** get an iterable view over children */ + C4_ALWAYS_INLINE C4_PURE const_children_view children() const noexcept { _C4RV(); return const_children_view(begin(), end()); } + /** get an iterable view over children */ + C4_ALWAYS_INLINE C4_PURE const_children_view cchildren() const noexcept { _C4RV(); return const_children_view(begin(), end()); } + + /** get an iterable view over all siblings (including the calling node) */ + template + C4_ALWAYS_INLINE C4_PURE auto siblings() noexcept -> _C4_IF_MUTABLE(children_view) + { + _C4RV(); + NodeData const *nd = tree__->get(id__); + return (nd->m_parent != NONE) ? 
// does it have a parent? + children_view(iterator(tree__, tree_->get(nd->m_parent)->m_first_child), iterator(tree__, NONE)) + : + children_view(end(), end()); + } + /** get an iterable view over all siblings (including the calling node) */ + C4_ALWAYS_INLINE C4_PURE const_children_view siblings() const noexcept + { + _C4RV(); + NodeData const *nd = tree_->get(id_); + return (nd->m_parent != NONE) ? // does it have a parent? + const_children_view(const_iterator(tree_, tree_->get(nd->m_parent)->m_first_child), const_iterator(tree_, NONE)) + : + const_children_view(end(), end()); + } + /** get an iterable view over all siblings (including the calling node) */ + C4_ALWAYS_INLINE C4_PURE const_children_view csiblings() const noexcept { return siblings(); } + + /** visit every child node calling fn(node) */ + template + C4_ALWAYS_INLINE C4_PURE bool visit(Visitor fn, size_t indentation_level=0, bool skip_root=true) const noexcept + { + return detail::_visit(*(ConstImpl*)this, fn, indentation_level, skip_root); + } + /** visit every child node calling fn(node) */ + template + auto visit(Visitor fn, size_t indentation_level=0, bool skip_root=true) noexcept + -> _C4_IF_MUTABLE(bool) + { + return detail::_visit(*(Impl*)this, fn, indentation_level, skip_root); + } + + /** visit every child node calling fn(node, level) */ + template + C4_ALWAYS_INLINE C4_PURE bool visit_stacked(Visitor fn, size_t indentation_level=0, bool skip_root=true) const noexcept + { + return detail::_visit_stacked(*(ConstImpl*)this, fn, indentation_level, skip_root); + } + /** visit every child node calling fn(node, level) */ + template + auto visit_stacked(Visitor fn, size_t indentation_level=0, bool skip_root=true) noexcept + -> _C4_IF_MUTABLE(bool) + { + return detail::_visit_stacked(*(Impl*)this, fn, indentation_level, skip_root); + } + + /** @} */ + + #if defined(__clang__) + # pragma clang diagnostic pop + #elif defined(__GNUC__) + # pragma GCC diagnostic pop + #endif + + #undef _C4_IF_MUTABLE + #undef _C4RV + #undef tree_ + #undef tree__ + #undef id_ + #undef id__ + + C4_SUPPRESS_WARNING_GCC_CLANG_POP +}; + +} // namespace detail + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +class RYML_EXPORT ConstNodeRef : public detail::RoNodeMethods +{ +public: + + using tree_type = Tree const; + +public: + + Tree const* C4_RESTRICT m_tree; + size_t m_id; + + friend NodeRef; + friend struct detail::RoNodeMethods; + +public: + + /** @name construction */ + /** @{ */ + + ConstNodeRef() : m_tree(nullptr), m_id(NONE) {} + ConstNodeRef(Tree const &t) : m_tree(&t), m_id(t .root_id()) {} + ConstNodeRef(Tree const *t) : m_tree(t ), m_id(t->root_id()) {} + ConstNodeRef(Tree const *t, size_t id) : m_tree(t), m_id(id) {} + ConstNodeRef(std::nullptr_t) : m_tree(nullptr), m_id(NONE) {} + + ConstNodeRef(ConstNodeRef const&) = default; + ConstNodeRef(ConstNodeRef &&) = default; + + ConstNodeRef(NodeRef const&); + ConstNodeRef(NodeRef &&); + + /** @} */ + +public: + + /** @name assignment */ + /** @{ */ + + ConstNodeRef& operator= (std::nullptr_t) { m_tree = nullptr; m_id = NONE; return *this; } + + ConstNodeRef& operator= (ConstNodeRef const&) = default; + ConstNodeRef& operator= (ConstNodeRef &&) = default; + + ConstNodeRef& operator= (NodeRef const&); + ConstNodeRef& operator= (NodeRef &&); + + + /** @} */ + +public: + + /** @name state queries */ 
+ /** @{ */ + + C4_ALWAYS_INLINE C4_PURE bool valid() const noexcept { return m_tree != nullptr && m_id != NONE; } + + /** @} */ + +public: + + /** @name member getters */ + /** @{ */ + + C4_ALWAYS_INLINE C4_PURE Tree const* tree() const noexcept { return m_tree; } + C4_ALWAYS_INLINE C4_PURE size_t id() const noexcept { return m_id; } + + /** @} */ + +public: + + /** @name comparisons */ + /** @{ */ + + C4_ALWAYS_INLINE C4_PURE bool operator== (ConstNodeRef const& that) const noexcept { RYML_ASSERT(that.m_tree == m_tree); return m_id == that.m_id; } + C4_ALWAYS_INLINE C4_PURE bool operator!= (ConstNodeRef const& that) const noexcept { RYML_ASSERT(that.m_tree == m_tree); return ! this->operator==(that); } + + C4_ALWAYS_INLINE C4_PURE bool operator== (std::nullptr_t) const noexcept { return m_tree == nullptr || m_id == NONE; } + C4_ALWAYS_INLINE C4_PURE bool operator!= (std::nullptr_t) const noexcept { return ! this->operator== (nullptr); } + + C4_ALWAYS_INLINE C4_PURE bool operator== (csubstr val) const noexcept { RYML_ASSERT(has_val()); return m_tree->val(m_id) == val; } + C4_ALWAYS_INLINE C4_PURE bool operator!= (csubstr val) const noexcept { RYML_ASSERT(has_val()); return m_tree->val(m_id) != val; } + + /** @} */ + +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** a reference to a node in an existing yaml tree, offering a more + * convenient API than the index-based API used in the tree. */ +class RYML_EXPORT NodeRef : public detail::RoNodeMethods +{ +public: + + using tree_type = Tree; + using base_type = detail::RoNodeMethods; + +private: + + Tree *C4_RESTRICT m_tree; + size_t m_id; + + /** This member is used to enable lazy operator[] writing. When a child + * with a key or index is not found, m_id is set to the id of the parent + * and the asked-for key or index are stored in this member until a write + * does happen. Then it is given as key or index for creating the child. + * When a key is used, the csubstr stores it (so the csubstr's string is + * non-null and the csubstr's size is different from NONE). When an index is + * used instead, the csubstr's string is set to null, and only the csubstr's + * size is set to a value different from NONE. Otherwise, when operator[] + * does find the child then this member is empty: the string is null and + * the size is NONE. 
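+ *
+ * As a rough sketch of the mechanism described above (illustrative only; the
+ * subscript operators that produce seed refs are declared elsewhere in this
+ * header):
+ *
+ *     c4::yml::Tree t = c4::yml::parse_in_arena("{a: 1}");
+ *     c4::yml::NodeRef b = t["b"]; // no such child yet: b is a seed ref,
+ *                                  // nothing has been added to the tree
+ *     b = "2";                     // the write applies the seed: child "b"
+ *                                  // is created and assigned the value "2"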
*/ + csubstr m_seed; + + friend ConstNodeRef; + friend struct detail::RoNodeMethods; + + // require valid: a helper macro, undefined at the end + #define _C4RV() \ + RYML_ASSERT(m_tree != nullptr); \ + _RYML_CB_ASSERT(m_tree->m_callbacks, m_id != NONE && !is_seed()) + +public: + + /** @name construction */ + /** @{ */ + + NodeRef() : m_tree(nullptr), m_id(NONE), m_seed() { _clear_seed(); } + NodeRef(Tree &t) : m_tree(&t), m_id(t .root_id()), m_seed() { _clear_seed(); } + NodeRef(Tree *t) : m_tree(t ), m_id(t->root_id()), m_seed() { _clear_seed(); } + NodeRef(Tree *t, size_t id) : m_tree(t), m_id(id), m_seed() { _clear_seed(); } + NodeRef(Tree *t, size_t id, size_t seed_pos) : m_tree(t), m_id(id), m_seed() { m_seed.str = nullptr; m_seed.len = seed_pos; } + NodeRef(Tree *t, size_t id, csubstr seed_key) : m_tree(t), m_id(id), m_seed(seed_key) {} + NodeRef(std::nullptr_t) : m_tree(nullptr), m_id(NONE), m_seed() {} + + /** @} */ + +public: + + /** @name assignment */ + /** @{ */ + + NodeRef(NodeRef const&) = default; + NodeRef(NodeRef &&) = default; + + NodeRef& operator= (NodeRef const&) = default; + NodeRef& operator= (NodeRef &&) = default; + + /** @} */ + +public: + + /** @name state queries */ + /** @{ */ + + inline bool valid() const { return m_tree != nullptr && m_id != NONE; } + inline bool is_seed() const { return m_seed.str != nullptr || m_seed.len != NONE; } + + inline void _clear_seed() { /*do this manually or an assert is triggered*/ m_seed.str = nullptr; m_seed.len = NONE; } + + /** @} */ + +public: + + /** @name comparisons */ + /** @{ */ + + inline bool operator== (NodeRef const& that) const { _C4RV(); RYML_ASSERT(that.valid() && !that.is_seed()); RYML_ASSERT(that.m_tree == m_tree); return m_id == that.m_id; } + inline bool operator!= (NodeRef const& that) const { return ! this->operator==(that); } + + inline bool operator== (ConstNodeRef const& that) const { _C4RV(); RYML_ASSERT(that.valid()); RYML_ASSERT(that.m_tree == m_tree); return m_id == that.m_id; } + inline bool operator!= (ConstNodeRef const& that) const { return ! 
this->operator==(that); } + + inline bool operator== (std::nullptr_t) const { return m_tree == nullptr || m_id == NONE || is_seed(); } + inline bool operator!= (std::nullptr_t) const { return m_tree != nullptr && m_id != NONE && !is_seed(); } + + inline bool operator== (csubstr val) const { _C4RV(); RYML_ASSERT(has_val()); return m_tree->val(m_id) == val; } + inline bool operator!= (csubstr val) const { _C4RV(); RYML_ASSERT(has_val()); return m_tree->val(m_id) != val; } + + //inline operator bool () const { return m_tree == nullptr || m_id == NONE || is_seed(); } + + /** @} */ + +public: + + /** @name node property getters */ + /** @{ */ + + C4_ALWAYS_INLINE C4_PURE Tree * tree() noexcept { return m_tree; } + C4_ALWAYS_INLINE C4_PURE Tree const* tree() const noexcept { return m_tree; } + + C4_ALWAYS_INLINE C4_PURE size_t id() const noexcept { return m_id; } + + /** @} */ + +public: + + /** @name node modifiers */ + /** @{ */ + + void change_type(NodeType t) { _C4RV(); m_tree->change_type(m_id, t); } + + void set_type(NodeType t) { _C4RV(); m_tree->_set_flags(m_id, t); } + void set_key(csubstr key) { _C4RV(); m_tree->_set_key(m_id, key); } + void set_val(csubstr val) { _C4RV(); m_tree->_set_val(m_id, val); } + void set_key_tag(csubstr key_tag) { _C4RV(); m_tree->set_key_tag(m_id, key_tag); } + void set_val_tag(csubstr val_tag) { _C4RV(); m_tree->set_val_tag(m_id, val_tag); } + void set_key_anchor(csubstr key_anchor) { _C4RV(); m_tree->set_key_anchor(m_id, key_anchor); } + void set_val_anchor(csubstr val_anchor) { _C4RV(); m_tree->set_val_anchor(m_id, val_anchor); } + void set_key_ref(csubstr key_ref) { _C4RV(); m_tree->set_key_ref(m_id, key_ref); } + void set_val_ref(csubstr val_ref) { _C4RV(); m_tree->set_val_ref(m_id, val_ref); } + + template + size_t set_key_serialized(T const& C4_RESTRICT k) + { + _C4RV(); + csubstr s = m_tree->to_arena(k); + m_tree->_set_key(m_id, s); + return s.len; + } + template + size_t set_val_serialized(T const& C4_RESTRICT v) + { + _C4RV(); + csubstr s = m_tree->to_arena(v); + m_tree->_set_val(m_id, s); + return s.len; + } + size_t set_val_serialized(std::nullptr_t) + { + _C4RV(); + m_tree->_set_val(m_id, csubstr{}); + return 0; + } + + /** encode a blob as base64, then assign the result to the node's key + * @return the size of base64-encoded blob */ + size_t set_key_serialized(fmt::const_base64_wrapper w); + /** encode a blob as base64, then assign the result to the node's val + * @return the size of base64-encoded blob */ + size_t set_val_serialized(fmt::const_base64_wrapper w); + +public: + + inline void clear() + { + if(is_seed()) + return; + m_tree->remove_children(m_id); + m_tree->_clear(m_id); + } + + inline void clear_key() + { + if(is_seed()) + return; + m_tree->_clear_key(m_id); + } + + inline void clear_val() + { + if(is_seed()) + return; + m_tree->_clear_val(m_id); + } + + inline void clear_children() + { + if(is_seed()) + return; + m_tree->remove_children(m_id); + } + + void create() { _apply_seed(); } + + inline void operator= (NodeType_e t) + { + _apply_seed(); + m_tree->_add_flags(m_id, t); + } + + inline void operator|= (NodeType_e t) + { + _apply_seed(); + m_tree->_add_flags(m_id, t); + } + + inline void operator= (NodeInit const& v) + { + _apply_seed(); + _apply(v); + } + + inline void operator= (NodeScalar const& v) + { + _apply_seed(); + _apply(v); + } + + inline void operator= (std::nullptr_t) + { + _apply_seed(); + _apply(csubstr{}); + } + + inline void operator= (csubstr v) + { + _apply_seed(); + _apply(v); + } + + template + inline void 
operator= (const char (&v)[N]) + { + _apply_seed(); + csubstr sv; + sv.assign(v); + _apply(sv); + } + + /** @} */ + +public: + + /** @name serialization */ + /** @{ */ + + /** serialize a variable to the arena */ + template + inline csubstr to_arena(T const& C4_RESTRICT s) + { + _C4RV(); + return m_tree->to_arena(s); + } + + /** serialize a variable, then assign the result to the node's val */ + inline NodeRef& operator<< (csubstr s) + { + // this overload is needed to prevent ambiguity (there's also + // operator<< for writing a substr to a stream) + _apply_seed(); + write(this, s); + RYML_ASSERT(val() == s); + return *this; + } + + template + inline NodeRef& operator<< (T const& C4_RESTRICT v) + { + _apply_seed(); + write(this, v); + return *this; + } + + /** serialize a variable, then assign the result to the node's key */ + template + inline NodeRef& operator<< (Key const& C4_RESTRICT v) + { + _apply_seed(); + set_key_serialized(v.k); + return *this; + } + + /** serialize a variable, then assign the result to the node's key */ + template + inline NodeRef& operator<< (Key const& C4_RESTRICT v) + { + _apply_seed(); + set_key_serialized(v.k); + return *this; + } + + NodeRef& operator<< (Key w) + { + set_key_serialized(w.wrapper); + return *this; + } + + NodeRef& operator<< (fmt::const_base64_wrapper w) + { + set_val_serialized(w); + return *this; + } + + /** @} */ + +private: + + void _apply_seed() + { + if(m_seed.str) // we have a seed key: use it to create the new child + { + //RYML_ASSERT(i.key.scalar.empty() || m_key == i.key.scalar || m_key.empty()); + m_id = m_tree->append_child(m_id); + m_tree->_set_key(m_id, m_seed); + m_seed.str = nullptr; + m_seed.len = NONE; + } + else if(m_seed.len != NONE) // we have a seed index: create a child at that position + { + RYML_ASSERT(m_tree->num_children(m_id) == m_seed.len); + m_id = m_tree->append_child(m_id); + m_seed.str = nullptr; + m_seed.len = NONE; + } + else + { + RYML_ASSERT(valid()); + } + } + + inline void _apply(csubstr v) + { + m_tree->_set_val(m_id, v); + } + + inline void _apply(NodeScalar const& v) + { + m_tree->_set_val(m_id, v); + } + + inline void _apply(NodeInit const& i) + { + m_tree->_set(m_id, i); + } + +public: + + /** @name modification of hierarchy */ + /** @{ */ + + inline NodeRef insert_child(NodeRef after) + { + _C4RV(); + RYML_ASSERT(after.m_tree == m_tree); + NodeRef r(m_tree, m_tree->insert_child(m_id, after.m_id)); + return r; + } + + inline NodeRef insert_child(NodeInit const& i, NodeRef after) + { + _C4RV(); + RYML_ASSERT(after.m_tree == m_tree); + NodeRef r(m_tree, m_tree->insert_child(m_id, after.m_id)); + r._apply(i); + return r; + } + + inline NodeRef prepend_child() + { + _C4RV(); + NodeRef r(m_tree, m_tree->insert_child(m_id, NONE)); + return r; + } + + inline NodeRef prepend_child(NodeInit const& i) + { + _C4RV(); + NodeRef r(m_tree, m_tree->insert_child(m_id, NONE)); + r._apply(i); + return r; + } + + inline NodeRef append_child() + { + _C4RV(); + NodeRef r(m_tree, m_tree->append_child(m_id)); + return r; + } + + inline NodeRef append_child(NodeInit const& i) + { + _C4RV(); + NodeRef r(m_tree, m_tree->append_child(m_id)); + r._apply(i); + return r; + } + +public: + + inline NodeRef insert_sibling(ConstNodeRef const& after) + { + _C4RV(); + RYML_ASSERT(after.m_tree == m_tree); + NodeRef r(m_tree, m_tree->insert_sibling(m_id, after.m_id)); + return r; + } + + inline NodeRef insert_sibling(NodeInit const& i, ConstNodeRef const& after) + { + _C4RV(); + RYML_ASSERT(after.m_tree == m_tree); + NodeRef 
r(m_tree, m_tree->insert_sibling(m_id, after.m_id)); + r._apply(i); + return r; + } + + inline NodeRef prepend_sibling() + { + _C4RV(); + NodeRef r(m_tree, m_tree->prepend_sibling(m_id)); + return r; + } + + inline NodeRef prepend_sibling(NodeInit const& i) + { + _C4RV(); + NodeRef r(m_tree, m_tree->prepend_sibling(m_id)); + r._apply(i); + return r; + } + + inline NodeRef append_sibling() + { + _C4RV(); + NodeRef r(m_tree, m_tree->append_sibling(m_id)); + return r; + } + + inline NodeRef append_sibling(NodeInit const& i) + { + _C4RV(); + NodeRef r(m_tree, m_tree->append_sibling(m_id)); + r._apply(i); + return r; + } + +public: + + inline void remove_child(NodeRef & child) + { + _C4RV(); + RYML_ASSERT(has_child(child)); + RYML_ASSERT(child.parent().id() == id()); + m_tree->remove(child.id()); + child.clear(); + } + + //! remove the nth child of this node + inline void remove_child(size_t pos) + { + _C4RV(); + RYML_ASSERT(pos >= 0 && pos < num_children()); + size_t child = m_tree->child(m_id, pos); + RYML_ASSERT(child != NONE); + m_tree->remove(child); + } + + //! remove a child by name + inline void remove_child(csubstr key) + { + _C4RV(); + size_t child = m_tree->find_child(m_id, key); + RYML_ASSERT(child != NONE); + m_tree->remove(child); + } + +public: + + /** change the node's position within its parent, placing it after + * @p after. To move to the first position in the parent, simply + * pass an empty or default-constructed reference like this: + * `n.move({})`. */ + inline void move(ConstNodeRef const& after) + { + _C4RV(); + m_tree->move(m_id, after.m_id); + } + + /** move the node to a different @p parent (which may belong to a + * different tree), placing it after @p after. When the + * destination parent is in a new tree, then this node's tree + * pointer is reset to the tree of the parent node. */ + inline void move(NodeRef const& parent, ConstNodeRef const& after) + { + _C4RV(); + if(parent.m_tree == m_tree) + { + m_tree->move(m_id, parent.m_id, after.m_id); + } + else + { + parent.m_tree->move(m_tree, m_id, parent.m_id, after.m_id); + m_tree = parent.m_tree; + } + } + + /** duplicate the current node somewhere within its parent, and + * place it after the node @p after. To place into the first + * position of the parent, simply pass an empty or + * default-constructed reference like this: `n.move({})`. */ + inline NodeRef duplicate(ConstNodeRef const& after) const + { + _C4RV(); + RYML_ASSERT(m_tree == after.m_tree || after.m_id == NONE); + size_t dup = m_tree->duplicate(m_id, m_tree->parent(m_id), after.m_id); + NodeRef r(m_tree, dup); + return r; + } + + /** duplicate the current node somewhere into a different @p parent + * (possibly from a different tree), and place it after the node + * @p after. To place into the first position of the parent, + * simply pass an empty or default-constructed reference like + * this: `n.move({})`. 
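+ *
+ * A hedged example of the intended use (src and dst_parent are placeholder
+ * names for nodes obtained elsewhere, possibly from different trees):
+ *
+ *     c4::yml::NodeRef copy = src.duplicate(dst_parent, {}); // copy into first position
+ *     src.move(dst_parent, {});                              // or move src instead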
*/ + inline NodeRef duplicate(NodeRef const& parent, ConstNodeRef const& after) const + { + _C4RV(); + RYML_ASSERT(parent.m_tree == after.m_tree || after.m_id == NONE); + if(parent.m_tree == m_tree) + { + size_t dup = m_tree->duplicate(m_id, parent.m_id, after.m_id); + NodeRef r(m_tree, dup); + return r; + } + else + { + size_t dup = parent.m_tree->duplicate(m_tree, m_id, parent.m_id, after.m_id); + NodeRef r(parent.m_tree, dup); + return r; + } + } + + inline void duplicate_children(NodeRef const& parent, ConstNodeRef const& after) const + { + _C4RV(); + RYML_ASSERT(parent.m_tree == after.m_tree); + if(parent.m_tree == m_tree) + { + m_tree->duplicate_children(m_id, parent.m_id, after.m_id); + } + else + { + parent.m_tree->duplicate_children(m_tree, m_id, parent.m_id, after.m_id); + } + } + + /** @} */ + +#undef _C4RV +}; + + +//----------------------------------------------------------------------------- + +inline ConstNodeRef::ConstNodeRef(NodeRef const& that) + : m_tree(that.m_tree) + , m_id(!that.is_seed() ? that.id() : NONE) +{ +} + +inline ConstNodeRef::ConstNodeRef(NodeRef && that) + : m_tree(that.m_tree) + , m_id(!that.is_seed() ? that.id() : NONE) +{ +} + + +inline ConstNodeRef& ConstNodeRef::operator= (NodeRef const& that) +{ + m_tree = (that.m_tree); + m_id = (!that.is_seed() ? that.id() : NONE); + return *this; +} + +inline ConstNodeRef& ConstNodeRef::operator= (NodeRef && that) +{ + m_tree = (that.m_tree); + m_id = (!that.is_seed() ? that.id() : NONE); + return *this; +} + + +//----------------------------------------------------------------------------- + +template +inline void write(NodeRef *n, T const& v) +{ + n->set_val_serialized(v); +} + +template +typename std::enable_if< ! std::is_floating_point::value, bool>::type +inline read(NodeRef const& n, T *v) +{ + return from_chars(n.val(), v); +} +template +typename std::enable_if< ! 
std::is_floating_point::value, bool>::type +inline read(ConstNodeRef const& n, T *v) +{ + return from_chars(n.val(), v); +} + +template +typename std::enable_if::value, bool>::type +inline read(NodeRef const& n, T *v) +{ + return from_chars_float(n.val(), v); +} +template +typename std::enable_if::value, bool>::type +inline read(ConstNodeRef const& n, T *v) +{ + return from_chars_float(n.val(), v); +} + + +} // namespace yml +} // namespace c4 + + +#if defined(_MSC_VER) +# pragma warning(pop) +#endif + +#ifdef __GNUC__ +# pragma GCC diagnostic pop +#endif + +#endif /* _C4_YML_NODE_HPP_ */ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/node.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/writer.hpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/writer.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_YML_WRITER_HPP_ +#define _C4_YML_WRITER_HPP_ + +#ifndef _C4_YML_COMMON_HPP_ +#include "./common.hpp" +#endif + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/substr.hpp +//#include +#if !defined(C4_SUBSTR_HPP_) && !defined(_C4_SUBSTR_HPP_) +#error "amalgamate: file c4/substr.hpp must have been included at this point" +#endif /* C4_SUBSTR_HPP_ */ + +//included above: +//#include // fwrite(), fputc() +//included above: +//#include // memcpy() + + +namespace c4 { +namespace yml { + + +/** Repeat-Character: a character to be written a number of times. */ +struct RepC +{ + char c; + size_t num_times; +}; +inline RepC indent_to(size_t num_levels) +{ + return {' ', size_t(2) * num_levels}; +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +/** A writer that outputs to a file. Defaults to stdout. */ +struct WriterFile +{ + FILE * m_file; + size_t m_pos; + + WriterFile(FILE *f = nullptr) : m_file(f ? 
f : stdout), m_pos(0) {} + + inline substr _get(bool /*error_on_excess*/) + { + substr sp; + sp.str = nullptr; + sp.len = m_pos; + return sp; + } + + template + inline void _do_write(const char (&a)[N]) + { + fwrite(a, sizeof(char), N - 1, m_file); + m_pos += N - 1; + } + + inline void _do_write(csubstr sp) + { + #if defined(__clang__) + # pragma clang diagnostic push + # pragma GCC diagnostic ignored "-Wsign-conversion" + #elif defined(__GNUC__) + # pragma GCC diagnostic push + # pragma GCC diagnostic ignored "-Wsign-conversion" + #endif + if(sp.empty()) return; + fwrite(sp.str, sizeof(csubstr::char_type), sp.len, m_file); + m_pos += sp.len; + #if defined(__clang__) + # pragma clang diagnostic pop + #elif defined(__GNUC__) + # pragma GCC diagnostic pop + #endif + } + + inline void _do_write(const char c) + { + fputc(c, m_file); + ++m_pos; + } + + inline void _do_write(RepC const rc) + { + for(size_t i = 0; i < rc.num_times; ++i) + { + fputc(rc.c, m_file); + } + m_pos += rc.num_times; + } +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +/** A writer that outputs to an STL-like ostream. */ +template +struct WriterOStream +{ + OStream& m_stream; + size_t m_pos; + + WriterOStream(OStream &s) : m_stream(s), m_pos(0) {} + + inline substr _get(bool /*error_on_excess*/) + { + substr sp; + sp.str = nullptr; + sp.len = m_pos; + return sp; + } + + template + inline void _do_write(const char (&a)[N]) + { + m_stream.write(a, N - 1); + m_pos += N - 1; + } + + inline void _do_write(csubstr sp) + { + #if defined(__clang__) + # pragma clang diagnostic push + # pragma GCC diagnostic ignored "-Wsign-conversion" + #elif defined(__GNUC__) + # pragma GCC diagnostic push + # pragma GCC diagnostic ignored "-Wsign-conversion" + #endif + if(sp.empty()) return; + m_stream.write(sp.str, sp.len); + m_pos += sp.len; + #if defined(__clang__) + # pragma clang diagnostic pop + #elif defined(__GNUC__) + # pragma GCC diagnostic pop + #endif + } + + inline void _do_write(const char c) + { + m_stream.put(c); + ++m_pos; + } + + inline void _do_write(RepC const rc) + { + for(size_t i = 0; i < rc.num_times; ++i) + { + m_stream.put(rc.c); + } + m_pos += rc.num_times; + } +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +/** a writer to a substr */ +struct WriterBuf +{ + substr m_buf; + size_t m_pos; + + WriterBuf(substr sp) : m_buf(sp), m_pos(0) {} + + inline substr _get(bool error_on_excess) + { + if(m_pos <= m_buf.len) + { + return m_buf.first(m_pos); + } + if(error_on_excess) + { + c4::yml::error("not enough space in the given buffer"); + } + substr sp; + sp.str = nullptr; + sp.len = m_pos; + return sp; + } + + template + inline void _do_write(const char (&a)[N]) + { + RYML_ASSERT( ! m_buf.overlaps(a)); + if(m_pos + N-1 <= m_buf.len) + { + memcpy(&(m_buf[m_pos]), a, N-1); + } + m_pos += N-1; + } + + inline void _do_write(csubstr sp) + { + if(sp.empty()) return; + RYML_ASSERT( ! 
sp.overlaps(m_buf)); + if(m_pos + sp.len <= m_buf.len) + { + memcpy(&(m_buf[m_pos]), sp.str, sp.len); + } + m_pos += sp.len; + } + + inline void _do_write(const char c) + { + if(m_pos + 1 <= m_buf.len) + { + m_buf[m_pos] = c; + } + ++m_pos; + } + + inline void _do_write(RepC const rc) + { + if(m_pos + rc.num_times <= m_buf.len) + { + for(size_t i = 0; i < rc.num_times; ++i) + { + m_buf[m_pos + i] = rc.c; + } + } + m_pos += rc.num_times; + } +}; + + +} // namespace yml +} // namespace c4 + +#endif /* _C4_YML_WRITER_HPP_ */ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/writer.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/detail/parser_dbg.hpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/detail/parser_dbg.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_YML_DETAIL_PARSER_DBG_HPP_ +#define _C4_YML_DETAIL_PARSER_DBG_HPP_ + +#ifndef _C4_YML_COMMON_HPP_ +#include "../common.hpp" +#endif +//included above: +//#include + +//----------------------------------------------------------------------------- +// some debugging scaffolds + +#if defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable: 4068/*unknown pragma*/) +#endif + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunknown-pragmas" +//#pragma GCC diagnostic ignored "-Wpragma-system-header-outside-header" +#pragma GCC system_header + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Werror" +#pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" + +// some debugging scaffolds +#ifdef RYML_DBG +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/dump.hpp +//#include +#if !defined(C4_DUMP_HPP_) && !defined(_C4_DUMP_HPP_) +#error "amalgamate: file c4/dump.hpp must have been included at this point" +#endif /* C4_DUMP_HPP_ */ + +namespace c4 { +inline void _dbg_dumper(csubstr s) { fwrite(s.str, 1, s.len, stdout); }; +template +void _dbg_printf(c4::csubstr fmt, Args&& ...args) +{ + static char writebuf[256]; + auto results = c4::format_dump_resume<&_dbg_dumper>(writebuf, fmt, std::forward(args)...); + // resume writing if the results failed to fit the buffer + if(C4_UNLIKELY(results.bufsize > sizeof(writebuf))) // bufsize will be that of the largest element serialized. Eg int(1), will require 1 byte. + { + results = format_dump_resume<&_dbg_dumper>(results, writebuf, fmt, std::forward(args)...); + if(C4_UNLIKELY(results.bufsize > sizeof(writebuf))) + { + results = format_dump_resume<&_dbg_dumper>(results, writebuf, fmt, std::forward(args)...); + } + } +} +} // namespace c4 + +# define _c4dbgt(fmt, ...) this->_dbg ("{}:{}: " fmt , __FILE__, __LINE__, ## __VA_ARGS__) +# define _c4dbgpf(fmt, ...) _dbg_printf("{}:{}: " fmt "\n", __FILE__, __LINE__, ## __VA_ARGS__) +# define _c4dbgp(msg) _dbg_printf("{}:{}: " msg "\n", __FILE__, __LINE__ ) +# define _c4dbgq(msg) _dbg_printf(msg "\n") +# define _c4err(fmt, ...) \ + do { if(c4::is_debugger_attached()) { C4_DEBUG_BREAK(); } \ + this->_err("ERROR:\n" "{}:{}: " fmt, __FILE__, __LINE__, ## __VA_ARGS__); } while(0) +#else +# define _c4dbgt(fmt, ...) +# define _c4dbgpf(fmt, ...) +# define _c4dbgp(msg) +# define _c4dbgq(msg) +# define _c4err(fmt, ...) 
\ + do { if(c4::is_debugger_attached()) { C4_DEBUG_BREAK(); } \ + this->_err("ERROR: " fmt, ## __VA_ARGS__); } while(0) +#endif + +#define _c4prsp(sp) sp +#define _c4presc(s) __c4presc(s.str, s.len) +inline c4::csubstr _c4prc(const char &C4_RESTRICT c) +{ + switch(c) + { + case '\n': return c4::csubstr("\\n"); + case '\t': return c4::csubstr("\\t"); + case '\0': return c4::csubstr("\\0"); + case '\r': return c4::csubstr("\\r"); + case '\f': return c4::csubstr("\\f"); + case '\b': return c4::csubstr("\\b"); + case '\v': return c4::csubstr("\\v"); + case '\a': return c4::csubstr("\\a"); + default: return c4::csubstr(&c, 1); + } +} +inline void __c4presc(const char *s, size_t len) +{ + size_t prev = 0; + for(size_t i = 0; i < len; ++i) + { + switch(s[i]) + { + case '\n' : fwrite(s+prev, 1, i-prev, stdout); putchar('\\'); putchar('n'); putchar('\n'); prev = i+1; break; + case '\t' : fwrite(s+prev, 1, i-prev, stdout); putchar('\\'); putchar('t'); prev = i+1; break; + case '\0' : fwrite(s+prev, 1, i-prev, stdout); putchar('\\'); putchar('0'); prev = i+1; break; + case '\r' : fwrite(s+prev, 1, i-prev, stdout); putchar('\\'); putchar('r'); prev = i+1; break; + case '\f' : fwrite(s+prev, 1, i-prev, stdout); putchar('\\'); putchar('f'); prev = i+1; break; + case '\b' : fwrite(s+prev, 1, i-prev, stdout); putchar('\\'); putchar('b'); prev = i+1; break; + case '\v' : fwrite(s+prev, 1, i-prev, stdout); putchar('\\'); putchar('v'); prev = i+1; break; + case '\a' : fwrite(s+prev, 1, i-prev, stdout); putchar('\\'); putchar('a'); prev = i+1; break; + case '\x1b': fwrite(s+prev, 1, i-prev, stdout); putchar('\\'); putchar('e'); prev = i+1; break; + case -0x3e/*0xc2u*/: + if(i+1 < len) + { + if(s[i+1] == -0x60/*0xa0u*/) + { + fwrite(s+prev, 1, i-prev, stdout); putchar('\\'); putchar('_'); prev = i+2; ++i; + } + else if(s[i+1] == -0x7b/*0x85u*/) + { + fwrite(s+prev, 1, i-prev, stdout); putchar('\\'); putchar('N'); prev = i+2; ++i; + } + break; + } + case -0x1e/*0xe2u*/: + if(i+2 < len && s[i+1] == -0x80/*0x80u*/) + { + if(s[i+2] == -0x58/*0xa8u*/) + { + fwrite(s+prev, 1, i-prev, stdout); putchar('\\'); putchar('L'); prev = i+3; i += 2; + } + else if(s[i+2] == -0x57/*0xa9u*/) + { + fwrite(s+prev, 1, i-prev, stdout); putchar('\\'); putchar('P'); prev = i+3; i += 2; + } + break; + } + } + } + fwrite(s + prev, 1, len - prev, stdout); +} + +#pragma clang diagnostic pop +#pragma GCC diagnostic pop + +#if defined(_MSC_VER) +# pragma warning(pop) +#endif + + +#endif /* _C4_YML_DETAIL_PARSER_DBG_HPP_ */ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/detail/parser_dbg.hpp) + +#define C4_YML_EMIT_DEF_HPP_ + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/emit.hpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/emit.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_YML_EMIT_HPP_ +#define _C4_YML_EMIT_HPP_ + +#ifndef _C4_YML_WRITER_HPP_ +#include "./writer.hpp" +#endif + +#ifndef _C4_YML_TREE_HPP_ +#include "./tree.hpp" +#endif + +#ifndef _C4_YML_NODE_HPP_ +#include "./node.hpp" +#endif + + +#define RYML_DEPRECATE_EMIT \ + RYML_DEPRECATED("use emit_yaml() instead. See https://github.com/biojppm/rapidyaml/issues/120") +#ifdef emit +#error "emit is defined, likely from a Qt include. This will cause a compilation error. 
See https://github.com/biojppm/rapidyaml/issues/120" +#endif +#define RYML_DEPRECATE_EMITRS \ + RYML_DEPRECATED("use emitrs_yaml() instead. See https://github.com/biojppm/rapidyaml/issues/120") + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +namespace c4 { +namespace yml { + +template class Emitter; + +template +using EmitterOStream = Emitter>; +using EmitterFile = Emitter; +using EmitterBuf = Emitter; + +typedef enum { + EMIT_YAML = 0, + EMIT_JSON = 1 +} EmitType_e; + + +/** mark a tree or node to be emitted as json */ +struct as_json +{ + Tree const* tree; + size_t node; + as_json(Tree const& t) : tree(&t), node(t.empty() ? NONE : t.root_id()) {} + as_json(Tree const& t, size_t id) : tree(&t), node(id) {} + as_json(ConstNodeRef const& n) : tree(n.tree()), node(n.id()) {} +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +template +class Emitter : public Writer +{ +public: + + using Writer::Writer; + + /** emit! + * + * When writing to a buffer, returns a substr of the emitted YAML. + * If the given buffer has insufficient space, the returned span will + * be null and its size will be the needed space. No writes are done + * after the end of the buffer. + * + * When writing to a file, the returned substr will be null, but its + * length will be set to the number of bytes written. */ + substr emit_as(EmitType_e type, Tree const& t, size_t id, bool error_on_excess); + /** emit starting at the root node */ + substr emit_as(EmitType_e type, Tree const& t, bool error_on_excess=true); + /** emit the given node */ + substr emit_as(EmitType_e type, ConstNodeRef const& n, bool error_on_excess=true); + +private: + + Tree const* C4_RESTRICT m_tree; + + void _emit_yaml(size_t id); + void _do_visit_flow_sl(size_t id, size_t ilevel=0); + void _do_visit_flow_ml(size_t id, size_t ilevel=0, size_t do_indent=1); + void _do_visit_block(size_t id, size_t ilevel=0, size_t do_indent=1); + void _do_visit_block_container(size_t id, size_t next_level, size_t do_indent); + void _do_visit_json(size_t id); + +private: + + void _write(NodeScalar const& C4_RESTRICT sc, NodeType flags, size_t level); + void _write_json(NodeScalar const& C4_RESTRICT sc, NodeType flags); + + void _write_doc(size_t id); + void _write_scalar(csubstr s, bool was_quoted); + void _write_scalar_json(csubstr s, bool as_key, bool was_quoted); + void _write_scalar_literal(csubstr s, size_t level, bool as_key, bool explicit_indentation=false); + void _write_scalar_folded(csubstr s, size_t level, bool as_key); + void _write_scalar_squo(csubstr s, size_t level); + void _write_scalar_dquo(csubstr s, size_t level); + void _write_scalar_plain(csubstr s, size_t level); + + void _write_tag(csubstr tag) + { + if(!tag.begins_with('!')) + this->Writer::_do_write('!'); + this->Writer::_do_write(tag); + } + + enum : type_bits { + _keysc = (KEY|KEYREF|KEYANCH|KEYQUO|_WIP_KEY_STYLE) | ~(VAL|VALREF|VALANCH|VALQUO|_WIP_VAL_STYLE), + _valsc = ~(KEY|KEYREF|KEYANCH|KEYQUO|_WIP_KEY_STYLE) | (VAL|VALREF|VALANCH|VALQUO|_WIP_VAL_STYLE), + _keysc_json = (KEY) | ~(VAL), + _valsc_json = ~(KEY) | (VAL), + }; + + C4_ALWAYS_INLINE void _writek(size_t id, size_t level) { 
_write(m_tree->keysc(id), m_tree->_p(id)->m_type.type & ~_valsc, level); } + C4_ALWAYS_INLINE void _writev(size_t id, size_t level) { _write(m_tree->valsc(id), m_tree->_p(id)->m_type.type & ~_keysc, level); } + + C4_ALWAYS_INLINE void _writek_json(size_t id) { _write_json(m_tree->keysc(id), m_tree->_p(id)->m_type.type & ~(VAL)); } + C4_ALWAYS_INLINE void _writev_json(size_t id) { _write_json(m_tree->valsc(id), m_tree->_p(id)->m_type.type & ~(KEY)); } + +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** emit YAML to the given file. A null file defaults to stdout. + * Return the number of bytes written. */ +inline size_t emit_yaml(Tree const& t, size_t id, FILE *f) +{ + EmitterFile em(f); + return em.emit_as(EMIT_YAML, t, id, /*error_on_excess*/true).len; +} +RYML_DEPRECATE_EMIT inline size_t emit(Tree const& t, size_t id, FILE *f) +{ + return emit_yaml(t, id, f); +} + +/** emit JSON to the given file. A null file defaults to stdout. + * Return the number of bytes written. */ +inline size_t emit_json(Tree const& t, size_t id, FILE *f) +{ + EmitterFile em(f); + return em.emit_as(EMIT_JSON, t, id, /*error_on_excess*/true).len; +} + + +/** emit YAML to the given file. A null file defaults to stdout. + * Return the number of bytes written. + * @overload */ +inline size_t emit_yaml(Tree const& t, FILE *f=nullptr) +{ + EmitterFile em(f); + return em.emit_as(EMIT_YAML, t, /*error_on_excess*/true).len; +} +RYML_DEPRECATE_EMIT inline size_t emit(Tree const& t, FILE *f=nullptr) +{ + return emit_yaml(t, f); +} + +/** emit JSON to the given file. A null file defaults to stdout. + * Return the number of bytes written. + * @overload */ +inline size_t emit_json(Tree const& t, FILE *f=nullptr) +{ + EmitterFile em(f); + return em.emit_as(EMIT_JSON, t, /*error_on_excess*/true).len; +} + + +/** emit YAML to the given file. A null file defaults to stdout. + * Return the number of bytes written. + * @overload */ +inline size_t emit_yaml(ConstNodeRef const& r, FILE *f=nullptr) +{ + EmitterFile em(f); + return em.emit_as(EMIT_YAML, r, /*error_on_excess*/true).len; +} +RYML_DEPRECATE_EMIT inline size_t emit(ConstNodeRef const& r, FILE *f=nullptr) +{ + return emit_yaml(r, f); +} + +/** emit JSON to the given file. A null file defaults to stdout. + * Return the number of bytes written. + * @overload */ +inline size_t emit_json(ConstNodeRef const& r, FILE *f=nullptr) +{ + EmitterFile em(f); + return em.emit_as(EMIT_JSON, r, /*error_on_excess*/true).len; +} + + +//----------------------------------------------------------------------------- + +/** emit YAML to an STL-like ostream */ +template +inline OStream& operator<< (OStream& s, Tree const& t) +{ + EmitterOStream em(s); + em.emit_as(EMIT_YAML, t); + return s; +} + +/** emit YAML to an STL-like ostream + * @overload */ +template +inline OStream& operator<< (OStream& s, ConstNodeRef const& n) +{ + EmitterOStream em(s); + em.emit_as(EMIT_YAML, n); + return s; +} + +/** emit json to an STL-like stream */ +template +inline OStream& operator<< (OStream& s, as_json const& j) +{ + EmitterOStream em(s); + em.emit_as(EMIT_JSON, *j.tree, j.node, true); + return s; +} + + +//----------------------------------------------------------------------------- + + +/** emit YAML to the given buffer. Return a substr trimmed to the emitted YAML. 
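+ *
+ * A hedged sketch of the usual two-pass idiom when the needed size is not
+ * known in advance (stackbuf/largerbuf are placeholder names; the
+ * emitrs_yaml()/emitrs_json() helpers further below wrap exactly this pattern):
+ *
+ *     char stackbuf[64];
+ *     c4::substr ret = emit_yaml(t, t.root_id(), c4::substr(stackbuf, sizeof(stackbuf)), false);
+ *     if(ret.str == nullptr && ret.len > 0) // insufficient space: ret.len holds the needed size
+ *     {
+ *         largerbuf.resize(ret.len);        // largerbuf: any resizable char container
+ *         ret = emit_yaml(t, t.root_id(), c4::to_substr(largerbuf), true);
+ *     }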
+ * @param error_on_excess Raise an error if the space in the buffer is insufficient. + * @overload */ +inline substr emit_yaml(Tree const& t, size_t id, substr buf, bool error_on_excess=true) +{ + EmitterBuf em(buf); + return em.emit_as(EMIT_YAML, t, id, error_on_excess); +} +RYML_DEPRECATE_EMIT inline substr emit(Tree const& t, size_t id, substr buf, bool error_on_excess=true) +{ + return emit_yaml(t, id, buf, error_on_excess); +} + +/** emit JSON to the given buffer. Return a substr trimmed to the emitted JSON. + * @param error_on_excess Raise an error if the space in the buffer is insufficient. + * @overload */ +inline substr emit_json(Tree const& t, size_t id, substr buf, bool error_on_excess=true) +{ + EmitterBuf em(buf); + return em.emit_as(EMIT_JSON, t, id, error_on_excess); +} + + +/** emit YAML to the given buffer. Return a substr trimmed to the emitted YAML. + * @param error_on_excess Raise an error if the space in the buffer is insufficient. + * @overload */ +inline substr emit_yaml(Tree const& t, substr buf, bool error_on_excess=true) +{ + EmitterBuf em(buf); + return em.emit_as(EMIT_YAML, t, error_on_excess); +} +RYML_DEPRECATE_EMIT inline substr emit(Tree const& t, substr buf, bool error_on_excess=true) +{ + return emit_yaml(t, buf, error_on_excess); +} + +/** emit JSON to the given buffer. Return a substr trimmed to the emitted JSON. + * @param error_on_excess Raise an error if the space in the buffer is insufficient. + * @overload */ +inline substr emit_json(Tree const& t, substr buf, bool error_on_excess=true) +{ + EmitterBuf em(buf); + return em.emit_as(EMIT_JSON, t, error_on_excess); +} + + +/** emit YAML to the given buffer. Return a substr trimmed to the emitted YAML. + * @param error_on_excess Raise an error if the space in the buffer is insufficient. + * @overload + */ +inline substr emit_yaml(ConstNodeRef const& r, substr buf, bool error_on_excess=true) +{ + EmitterBuf em(buf); + return em.emit_as(EMIT_YAML, r, error_on_excess); +} +RYML_DEPRECATE_EMIT inline substr emit(ConstNodeRef const& r, substr buf, bool error_on_excess=true) +{ + return emit_yaml(r, buf, error_on_excess); +} + +/** emit JSON to the given buffer. Return a substr trimmed to the emitted JSON. + * @param error_on_excess Raise an error if the space in the buffer is insufficient. + * @overload + */ +inline substr emit_json(ConstNodeRef const& r, substr buf, bool error_on_excess=true) +{ + EmitterBuf em(buf); + return em.emit_as(EMIT_JSON, r, error_on_excess); +} + + +//----------------------------------------------------------------------------- + +/** emit+resize: emit YAML to the given std::string/std::vector-like + * container, resizing it as needed to fit the emitted YAML. */ +template +substr emitrs_yaml(Tree const& t, size_t id, CharOwningContainer * cont) +{ + substr buf = to_substr(*cont); + substr ret = emit_yaml(t, id, buf, /*error_on_excess*/false); + if(ret.str == nullptr && ret.len > 0) + { + cont->resize(ret.len); + buf = to_substr(*cont); + ret = emit_yaml(t, id, buf, /*error_on_excess*/true); + } + return ret; +} +template +RYML_DEPRECATE_EMITRS substr emitrs(Tree const& t, size_t id, CharOwningContainer * cont) +{ + return emitrs_yaml(t, id, cont); +} + +/** emit+resize: emit JSON to the given std::string/std::vector-like + * container, resizing it as needed to fit the emitted JSON. 
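+ *
+ * For instance (illustrative; using std::string as the container assumes the
+ * matching c4 to_substr()/container support is available):
+ *
+ *     std::string buf;
+ *     c4::substr written = emitrs_json(t, t.root_id(), &buf); // reuse buf across calls
+ *     std::string json = emitrs_json<std::string>(t, t.root_id()); // or get a fresh container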
*/ +template +substr emitrs_json(Tree const& t, size_t id, CharOwningContainer * cont) +{ + substr buf = to_substr(*cont); + substr ret = emit_json(t, id, buf, /*error_on_excess*/false); + if(ret.str == nullptr && ret.len > 0) + { + cont->resize(ret.len); + buf = to_substr(*cont); + ret = emit_json(t, id, buf, /*error_on_excess*/true); + } + return ret; +} + + +/** emit+resize: emit YAML to the given std::string/std::vector-like + * container, resizing it as needed to fit the emitted YAML. */ +template +CharOwningContainer emitrs_yaml(Tree const& t, size_t id) +{ + CharOwningContainer c; + emitrs_yaml(t, id, &c); + return c; +} +template +RYML_DEPRECATE_EMITRS CharOwningContainer emitrs(Tree const& t, size_t id) +{ + CharOwningContainer c; + emitrs_yaml(t, id, &c); + return c; +} + +/** emit+resize: emit JSON to the given std::string/std::vector-like + * container, resizing it as needed to fit the emitted JSON. */ +template +CharOwningContainer emitrs_json(Tree const& t, size_t id) +{ + CharOwningContainer c; + emitrs_json(t, id, &c); + return c; +} + + +/** emit+resize: YAML to the given std::string/std::vector-like + * container, resizing it as needed to fit the emitted YAML. */ +template +substr emitrs_yaml(Tree const& t, CharOwningContainer * cont) +{ + if(t.empty()) + return {}; + return emitrs_yaml(t, t.root_id(), cont); +} +template +RYML_DEPRECATE_EMITRS substr emitrs(Tree const& t, CharOwningContainer * cont) +{ + return emitrs_yaml(t, cont); +} + +/** emit+resize: JSON to the given std::string/std::vector-like + * container, resizing it as needed to fit the emitted JSON. */ +template +substr emitrs_json(Tree const& t, CharOwningContainer * cont) +{ + if(t.empty()) + return {}; + return emitrs_json(t, t.root_id(), cont); +} + + +/** emit+resize: YAML to the given std::string/std::vector-like container, + * resizing it as needed to fit the emitted YAML. */ +template +CharOwningContainer emitrs_yaml(Tree const& t) +{ + CharOwningContainer c; + if(t.empty()) + return c; + emitrs_yaml(t, t.root_id(), &c); + return c; +} +template +RYML_DEPRECATE_EMITRS CharOwningContainer emitrs(Tree const& t) +{ + return emitrs_yaml(t); +} + +/** emit+resize: JSON to the given std::string/std::vector-like container, + * resizing it as needed to fit the emitted JSON. */ +template +CharOwningContainer emitrs_json(Tree const& t) +{ + CharOwningContainer c; + if(t.empty()) + return c; + emitrs_json(t, t.root_id(), &c); + return c; +} + + +/** emit+resize: YAML to the given std::string/std::vector-like container, + * resizing it as needed to fit the emitted YAML. */ +template +substr emitrs_yaml(ConstNodeRef const& n, CharOwningContainer * cont) +{ + _RYML_CB_CHECK(n.tree()->callbacks(), n.valid()); + return emitrs_yaml(*n.tree(), n.id(), cont); +} +template +RYML_DEPRECATE_EMITRS substr emitrs(ConstNodeRef const& n, CharOwningContainer * cont) +{ + return emitrs_yaml(n, cont); +} + +/** emit+resize: JSON to the given std::string/std::vector-like container, + * resizing it as needed to fit the emitted JSON. */ +template +substr emitrs_json(ConstNodeRef const& n, CharOwningContainer * cont) +{ + _RYML_CB_CHECK(n.tree()->callbacks(), n.valid()); + return emitrs_json(*n.tree(), n.id(), cont); +} + + +/** emit+resize: YAML to the given std::string/std::vector-like container, + * resizing it as needed to fit the emitted YAML. 
*/ +template +CharOwningContainer emitrs_yaml(ConstNodeRef const& n) +{ + _RYML_CB_CHECK(n.tree()->callbacks(), n.valid()); + CharOwningContainer c; + emitrs_yaml(*n.tree(), n.id(), &c); + return c; +} +template +RYML_DEPRECATE_EMITRS CharOwningContainer emitrs(ConstNodeRef const& n) +{ + return emitrs_yaml(n); +} + +/** emit+resize: JSON to the given std::string/std::vector-like container, + * resizing it as needed to fit the emitted JSON. */ +template +CharOwningContainer emitrs_json(ConstNodeRef const& n) +{ + _RYML_CB_CHECK(n.tree()->callbacks(), n.valid()); + CharOwningContainer c; + emitrs_json(*n.tree(), n.id(), &c); + return c; +} + +} // namespace yml +} // namespace c4 + +#undef RYML_DEPRECATE_EMIT +#undef RYML_DEPRECATE_EMITRS + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/emit.def.hpp +//#include "c4/yml/emit.def.hpp" +#if !defined(C4_YML_EMIT_DEF_HPP_) && !defined(_C4_YML_EMIT_DEF_HPP_) +#error "amalgamate: file c4/yml/emit.def.hpp must have been included at this point" +#endif /* C4_YML_EMIT_DEF_HPP_ */ + + +#endif /* _C4_YML_EMIT_HPP_ */ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/emit.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/emit.def.hpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/emit.def.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_YML_EMIT_DEF_HPP_ +#define _C4_YML_EMIT_DEF_HPP_ + +#ifndef _C4_YML_EMIT_HPP_ +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/emit.hpp +//#include "c4/yml/emit.hpp" +#if !defined(C4_YML_EMIT_HPP_) && !defined(_C4_YML_EMIT_HPP_) +#error "amalgamate: file c4/yml/emit.hpp must have been included at this point" +#endif /* C4_YML_EMIT_HPP_ */ + +#endif + +namespace c4 { +namespace yml { + +template +substr Emitter::emit_as(EmitType_e type, Tree const& t, size_t id, bool error_on_excess) +{ + if(t.empty()) + { + _RYML_CB_ASSERT(t.callbacks(), id == NONE); + return {}; + } + _RYML_CB_CHECK(t.callbacks(), id < t.capacity()); + m_tree = &t; + if(type == EMIT_YAML) + _emit_yaml(id); + else if(type == EMIT_JSON) + _do_visit_json(id); + else + _RYML_CB_ERR(m_tree->callbacks(), "unknown emit type"); + return this->Writer::_get(error_on_excess); +} + +template +substr Emitter::emit_as(EmitType_e type, Tree const& t, bool error_on_excess) +{ + if(t.empty()) + return {}; + return this->emit_as(type, t, t.root_id(), error_on_excess); +} + +template +substr Emitter::emit_as(EmitType_e type, ConstNodeRef const& n, bool error_on_excess) +{ + _RYML_CB_CHECK(n.tree()->callbacks(), n.valid()); + return this->emit_as(type, *n.tree(), n.id(), error_on_excess); +} + + +//----------------------------------------------------------------------------- + +template +void Emitter::_emit_yaml(size_t id) +{ + // save branches in the visitor by doing the initial stream/doc + // logic here, sparing the need to check stream/val/keyval inside + // the visitor functions + auto dispatch = [this](size_t node){ + NodeType ty = m_tree->type(node); + if(ty.marked_flow_sl()) + _do_visit_flow_sl(node, 0); + else if(ty.marked_flow_ml()) + _do_visit_flow_ml(node, 0); + else + { + _do_visit_block(node, 0); + } + }; + if(!m_tree->is_root(id)) + { + if(m_tree->is_container(id) && 
!m_tree->type(id).marked_flow()) + { + size_t ilevel = 0; + if(m_tree->has_key(id)) + { + this->Writer::_do_write(m_tree->key(id)); + this->Writer::_do_write(":\n"); + ++ilevel; + } + _do_visit_block_container(id, ilevel, ilevel); + return; + } + } + + auto *btd = m_tree->tag_directives().b; + auto *etd = m_tree->tag_directives().e; + auto write_tag_directives = [&btd, etd, this](size_t next_node){ + auto end = btd; + while(end < etd) + { + if(end->next_node_id > next_node) + break; + ++end; + } + for( ; btd != end; ++btd) + { + if(next_node != m_tree->first_child(m_tree->parent(next_node))) + this->Writer::_do_write("...\n"); + this->Writer::_do_write("%TAG "); + this->Writer::_do_write(btd->handle); + this->Writer::_do_write(' '); + this->Writer::_do_write(btd->prefix); + this->Writer::_do_write('\n'); + } + }; + if(m_tree->is_stream(id)) + { + if(m_tree->first_child(id) != NONE) + write_tag_directives(m_tree->first_child(id)); + for(size_t child = m_tree->first_child(id); child != NONE; child = m_tree->next_sibling(child)) + { + dispatch(child); + if(m_tree->next_sibling(child) != NONE) + write_tag_directives(m_tree->next_sibling(child)); + } + } + else if(m_tree->is_container(id)) + { + dispatch(id); + } + else if(m_tree->is_doc(id)) + { + _RYML_CB_ASSERT(m_tree->callbacks(), !m_tree->is_container(id)); // checked above + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->is_val(id)); // so it must be a val + _write_doc(id); + } + else if(m_tree->is_keyval(id)) + { + _writek(id, 0); + this->Writer::_do_write(": "); + _writev(id, 0); + if(!m_tree->type(id).marked_flow()) + this->Writer::_do_write('\n'); + } + else if(m_tree->is_val(id)) + { + //this->Writer::_do_write("- "); + _writev(id, 0); + if(!m_tree->type(id).marked_flow()) + this->Writer::_do_write('\n'); + } + else if(m_tree->type(id) == NOTYPE) + { + ; + } + else + { + _RYML_CB_ERR(m_tree->callbacks(), "unknown type"); + } +} + +template +void Emitter::_write_doc(size_t id) +{ + RYML_ASSERT(m_tree->is_doc(id)); + if(!m_tree->is_root(id)) + { + RYML_ASSERT(m_tree->is_stream(m_tree->parent(id))); + this->Writer::_do_write("---"); + } + if(!m_tree->has_val(id)) // this is more frequent + { + if(m_tree->has_val_tag(id)) + { + if(!m_tree->is_root(id)) + this->Writer::_do_write(' '); + _write_tag(m_tree->val_tag(id)); + } + if(m_tree->has_val_anchor(id)) + { + if(!m_tree->is_root(id)) + this->Writer::_do_write(' '); + this->Writer::_do_write('&'); + this->Writer::_do_write(m_tree->val_anchor(id)); + } + } + else // docval + { + RYML_ASSERT(m_tree->has_val(id)); + RYML_ASSERT(!m_tree->has_key(id)); + if(!m_tree->is_root(id)) + this->Writer::_do_write(' '); + _writev(id, 0); + } + this->Writer::_do_write('\n'); +} + +template +void Emitter::_do_visit_flow_sl(size_t node, size_t ilevel) +{ + RYML_ASSERT(!m_tree->is_stream(node)); + RYML_ASSERT(m_tree->is_container(node) || m_tree->is_doc(node)); + RYML_ASSERT(m_tree->is_root(node) || (m_tree->parent_is_map(node) || m_tree->parent_is_seq(node))); + + if(m_tree->is_doc(node)) + { + _write_doc(node); + if(!m_tree->has_children(node)) + return; + } + else if(m_tree->is_container(node)) + { + RYML_ASSERT(m_tree->is_map(node) || m_tree->is_seq(node)); + + bool spc = false; // write a space + + if(m_tree->has_key(node)) + { + _writek(node, ilevel); + this->Writer::_do_write(':'); + spc = true; + } + + if(m_tree->has_val_tag(node)) + { + if(spc) + this->Writer::_do_write(' '); + _write_tag(m_tree->val_tag(node)); + spc = true; + } + + if(m_tree->has_val_anchor(node)) + { + if(spc) + 
this->Writer::_do_write(' '); + this->Writer::_do_write('&'); + this->Writer::_do_write(m_tree->val_anchor(node)); + spc = true; + } + + if(spc) + this->Writer::_do_write(' '); + + if(m_tree->is_map(node)) + { + this->Writer::_do_write('{'); + } + else + { + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->is_seq(node)); + this->Writer::_do_write('['); + } + } // container + + for(size_t child = m_tree->first_child(node), count = 0; child != NONE; child = m_tree->next_sibling(child)) + { + if(count++) + this->Writer::_do_write(','); + if(m_tree->is_keyval(child)) + { + _writek(child, ilevel); + this->Writer::_do_write(": "); + _writev(child, ilevel); + } + else if(m_tree->is_val(child)) + { + _writev(child, ilevel); + } + else + { + // with single-line flow, we can never go back to block + _do_visit_flow_sl(child, ilevel + 1); + } + } + + if(m_tree->is_map(node)) + { + this->Writer::_do_write('}'); + } + else if(m_tree->is_seq(node)) + { + this->Writer::_do_write(']'); + } +} + +template +void Emitter::_do_visit_flow_ml(size_t id, size_t ilevel, size_t do_indent) +{ + C4_UNUSED(id); + C4_UNUSED(ilevel); + C4_UNUSED(do_indent); + RYML_CHECK(false/*not implemented*/); +} + +template +void Emitter::_do_visit_block_container(size_t node, size_t next_level, size_t do_indent) +{ + RepC ind = indent_to(do_indent * next_level); + + if(m_tree->is_seq(node)) + { + for(size_t child = m_tree->first_child(node); child != NONE; child = m_tree->next_sibling(child)) + { + _RYML_CB_ASSERT(m_tree->callbacks(), !m_tree->has_key(child)); + if(m_tree->is_val(child)) + { + this->Writer::_do_write(ind); + this->Writer::_do_write("- "); + _writev(child, next_level); + this->Writer::_do_write('\n'); + } + else + { + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->is_container(child)); + NodeType ty = m_tree->type(child); + if(ty.marked_flow_sl()) + { + this->Writer::_do_write(ind); + this->Writer::_do_write("- "); + _do_visit_flow_sl(child, 0u); + this->Writer::_do_write('\n'); + } + else if(ty.marked_flow_ml()) + { + this->Writer::_do_write(ind); + this->Writer::_do_write("- "); + _do_visit_flow_ml(child, next_level, do_indent); + this->Writer::_do_write('\n'); + } + else + { + _do_visit_block(child, next_level, do_indent); + } + } + do_indent = true; + ind = indent_to(do_indent * next_level); + } + } + else // map + { + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->is_map(node)); + for(size_t ich = m_tree->first_child(node); ich != NONE; ich = m_tree->next_sibling(ich)) + { + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->has_key(ich)); + if(m_tree->is_keyval(ich)) + { + this->Writer::_do_write(ind); + _writek(ich, next_level); + this->Writer::_do_write(": "); + _writev(ich, next_level); + this->Writer::_do_write('\n'); + } + else + { + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->is_container(ich)); + NodeType ty = m_tree->type(ich); + if(ty.marked_flow_sl()) + { + this->Writer::_do_write(ind); + _do_visit_flow_sl(ich, 0u); + this->Writer::_do_write('\n'); + } + else if(ty.marked_flow_ml()) + { + this->Writer::_do_write(ind); + _do_visit_flow_ml(ich, 0u); + this->Writer::_do_write('\n'); + } + else + { + _do_visit_block(ich, next_level, do_indent); + } + } + do_indent = true; + ind = indent_to(do_indent * next_level); + } + } +} + +template +void Emitter::_do_visit_block(size_t node, size_t ilevel, size_t do_indent) +{ + RYML_ASSERT(!m_tree->is_stream(node)); + RYML_ASSERT(m_tree->is_container(node) || m_tree->is_doc(node)); + RYML_ASSERT(m_tree->is_root(node) || (m_tree->parent_is_map(node) || 
m_tree->parent_is_seq(node))); + RepC ind = indent_to(do_indent * ilevel); + + if(m_tree->is_doc(node)) + { + _write_doc(node); + if(!m_tree->has_children(node)) + return; + } + else if(m_tree->is_container(node)) + { + RYML_ASSERT(m_tree->is_map(node) || m_tree->is_seq(node)); + + bool spc = false; // write a space + bool nl = false; // write a newline + + if(m_tree->has_key(node)) + { + this->Writer::_do_write(ind); + _writek(node, ilevel); + this->Writer::_do_write(':'); + spc = true; + } + else if(!m_tree->is_root(node)) + { + this->Writer::_do_write(ind); + this->Writer::_do_write('-'); + spc = true; + } + + if(m_tree->has_val_tag(node)) + { + if(spc) + this->Writer::_do_write(' '); + _write_tag(m_tree->val_tag(node)); + spc = true; + nl = true; + } + + if(m_tree->has_val_anchor(node)) + { + if(spc) + this->Writer::_do_write(' '); + this->Writer::_do_write('&'); + this->Writer::_do_write(m_tree->val_anchor(node)); + spc = true; + nl = true; + } + + if(m_tree->has_children(node)) + { + if(m_tree->has_key(node)) + nl = true; + else + if(!m_tree->is_root(node) && !nl) + spc = true; + } + else + { + if(m_tree->is_seq(node)) + this->Writer::_do_write(" []\n"); + else if(m_tree->is_map(node)) + this->Writer::_do_write(" {}\n"); + return; + } + + if(spc && !nl) + this->Writer::_do_write(' '); + + do_indent = 0; + if(nl) + { + this->Writer::_do_write('\n'); + do_indent = 1; + } + } // container + + size_t next_level = ilevel + 1; + if(m_tree->is_root(node) || m_tree->is_doc(node)) + next_level = ilevel; // do not indent at top level + + _do_visit_block_container(node, next_level, do_indent); +} + +template +void Emitter::_do_visit_json(size_t id) +{ + _RYML_CB_CHECK(m_tree->callbacks(), !m_tree->is_stream(id)); // JSON does not have streams + if(m_tree->is_keyval(id)) + { + _writek_json(id); + this->Writer::_do_write(": "); + _writev_json(id); + } + else if(m_tree->is_val(id)) + { + _writev_json(id); + } + else if(m_tree->is_container(id)) + { + if(m_tree->has_key(id)) + { + _writek_json(id); + this->Writer::_do_write(": "); + } + if(m_tree->is_seq(id)) + this->Writer::_do_write('['); + else if(m_tree->is_map(id)) + this->Writer::_do_write('{'); + } // container + + for(size_t ich = m_tree->first_child(id); ich != NONE; ich = m_tree->next_sibling(ich)) + { + if(ich != m_tree->first_child(id)) + this->Writer::_do_write(','); + _do_visit_json(ich); + } + + if(m_tree->is_seq(id)) + this->Writer::_do_write(']'); + else if(m_tree->is_map(id)) + this->Writer::_do_write('}'); +} + +template +void Emitter::_write(NodeScalar const& C4_RESTRICT sc, NodeType flags, size_t ilevel) +{ + if( ! sc.tag.empty()) + { + _write_tag(sc.tag); + this->Writer::_do_write(' '); + } + if(flags.has_anchor()) + { + RYML_ASSERT(flags.is_ref() != flags.has_anchor()); + RYML_ASSERT( ! 
sc.anchor.empty()); + this->Writer::_do_write('&'); + this->Writer::_do_write(sc.anchor); + this->Writer::_do_write(' '); + } + else if(flags.is_ref()) + { + if(sc.anchor != "<<") + this->Writer::_do_write('*'); + this->Writer::_do_write(sc.anchor); + return; + } + + // ensure the style flags only have one of KEY or VAL + _RYML_CB_ASSERT(m_tree->callbacks(), ((flags & (_WIP_KEY_STYLE|_WIP_VAL_STYLE)) == 0) || (((flags&_WIP_KEY_STYLE) == 0) != ((flags&_WIP_VAL_STYLE) == 0))); + + auto style_marks = flags & (_WIP_KEY_STYLE|_WIP_VAL_STYLE); + if(style_marks & (_WIP_KEY_LITERAL|_WIP_VAL_LITERAL)) + { + _write_scalar_literal(sc.scalar, ilevel, flags.has_key()); + } + else if(style_marks & (_WIP_KEY_FOLDED|_WIP_VAL_FOLDED)) + { + _write_scalar_folded(sc.scalar, ilevel, flags.has_key()); + } + else if(style_marks & (_WIP_KEY_SQUO|_WIP_VAL_SQUO)) + { + _write_scalar_squo(sc.scalar, ilevel); + } + else if(style_marks & (_WIP_KEY_DQUO|_WIP_VAL_DQUO)) + { + _write_scalar_dquo(sc.scalar, ilevel); + } + else if(style_marks & (_WIP_KEY_PLAIN|_WIP_VAL_PLAIN)) + { + _write_scalar_plain(sc.scalar, ilevel); + } + else if(!style_marks) + { + size_t first_non_nl = sc.scalar.first_not_of('\n'); + bool all_newlines = first_non_nl == npos; + bool has_leading_ws = (!all_newlines) && sc.scalar.sub(first_non_nl).begins_with_any(" \t"); + bool do_literal = ((!sc.scalar.empty() && all_newlines) || (has_leading_ws && !sc.scalar.trim(' ').empty())); + if(do_literal) + { + _write_scalar_literal(sc.scalar, ilevel, flags.has_key(), /*explicit_indentation*/has_leading_ws); + } + else + { + for(size_t i = 0; i < sc.scalar.len; ++i) + { + if(sc.scalar.str[i] == '\n') + { + _write_scalar_literal(sc.scalar, ilevel, flags.has_key(), /*explicit_indentation*/has_leading_ws); + goto wrote_special; + } + // todo: check for escaped characters requiring double quotes + } + _write_scalar(sc.scalar, flags.is_quoted()); + wrote_special: + ; + } + } + else + { + _RYML_CB_ERR(m_tree->callbacks(), "not implemented"); + } +} +template +void Emitter::_write_json(NodeScalar const& C4_RESTRICT sc, NodeType flags) +{ + if(C4_UNLIKELY( ! sc.tag.empty())) + _RYML_CB_ERR(m_tree->callbacks(), "JSON does not have tags"); + if(C4_UNLIKELY(flags.has_anchor())) + _RYML_CB_ERR(m_tree->callbacks(), "JSON does not have anchors"); + _write_scalar_json(sc.scalar, flags.has_key(), flags.is_quoted()); +} + +#define _rymlindent_nextline() for(size_t lv = 0; lv < ilevel+1; ++lv) { this->Writer::_do_write(' '); this->Writer::_do_write(' '); } + +template +void Emitter::_write_scalar_literal(csubstr s, size_t ilevel, bool explicit_key, bool explicit_indentation) +{ + if(explicit_key) + this->Writer::_do_write("? 
"); + csubstr trimmed = s.trimr("\n\r"); + size_t numnewlines_at_end = s.len - trimmed.len - s.sub(trimmed.len).count('\r'); + // + if(!explicit_indentation) + this->Writer::_do_write('|'); + else + this->Writer::_do_write("|2"); + // + if(numnewlines_at_end > 1 || (trimmed.len == 0 && s.len > 0)/*only newlines*/) + this->Writer::_do_write("+\n"); + else if(numnewlines_at_end == 1) + this->Writer::_do_write('\n'); + else + this->Writer::_do_write("-\n"); + // + if(trimmed.len) + { + size_t pos = 0; // tracks the last character that was already written + for(size_t i = 0; i < trimmed.len; ++i) + { + if(trimmed[i] != '\n') + continue; + // write everything up to this point + csubstr since_pos = trimmed.range(pos, i+1); // include the newline + _rymlindent_nextline() + this->Writer::_do_write(since_pos); + pos = i+1; // already written + } + if(pos < trimmed.len) + { + _rymlindent_nextline() + this->Writer::_do_write(trimmed.sub(pos)); + } + if(numnewlines_at_end) + { + this->Writer::_do_write('\n'); + --numnewlines_at_end; + } + } + for(size_t i = 0; i < numnewlines_at_end; ++i) + { + _rymlindent_nextline() + if(i+1 < numnewlines_at_end || explicit_key) + this->Writer::_do_write('\n'); + } + if(explicit_key && !numnewlines_at_end) + this->Writer::_do_write('\n'); +} + +template +void Emitter::_write_scalar_folded(csubstr s, size_t ilevel, bool explicit_key) +{ + if(explicit_key) + { + this->Writer::_do_write("? "); + } + RYML_ASSERT(s.find("\r") == csubstr::npos); + csubstr trimmed = s.trimr('\n'); + size_t numnewlines_at_end = s.len - trimmed.len; + if(numnewlines_at_end == 0) + { + this->Writer::_do_write(">-\n"); + } + else if(numnewlines_at_end == 1) + { + this->Writer::_do_write(">\n"); + } + else if(numnewlines_at_end > 1) + { + this->Writer::_do_write(">+\n"); + } + if(trimmed.len) + { + size_t pos = 0; // tracks the last character that was already written + for(size_t i = 0; i < trimmed.len; ++i) + { + if(trimmed[i] != '\n') + continue; + // write everything up to this point + csubstr since_pos = trimmed.range(pos, i+1); // include the newline + pos = i+1; // because of the newline + _rymlindent_nextline() + this->Writer::_do_write(since_pos); + this->Writer::_do_write('\n'); // write the newline twice + } + if(pos < trimmed.len) + { + _rymlindent_nextline() + this->Writer::_do_write(trimmed.sub(pos)); + } + if(numnewlines_at_end) + { + this->Writer::_do_write('\n'); + --numnewlines_at_end; + } + } + for(size_t i = 0; i < numnewlines_at_end; ++i) + { + _rymlindent_nextline() + if(i+1 < numnewlines_at_end || explicit_key) + this->Writer::_do_write('\n'); + } + if(explicit_key && !numnewlines_at_end) + this->Writer::_do_write('\n'); +} + +template +void Emitter::_write_scalar_squo(csubstr s, size_t ilevel) +{ + size_t pos = 0; // tracks the last character that was already written + this->Writer::_do_write('\''); + for(size_t i = 0; i < s.len; ++i) + { + if(s[i] == '\n') + { + csubstr sub = s.range(pos, i+1); + this->Writer::_do_write(sub); // write everything up to (including) this char + this->Writer::_do_write('\n'); // write the character again + if(i + 1 < s.len) + _rymlindent_nextline() // indent the next line + pos = i+1; + } + else if(s[i] == '\'') + { + csubstr sub = s.range(pos, i+1); + this->Writer::_do_write(sub); // write everything up to (including) this char + this->Writer::_do_write('\''); // write the character again + pos = i+1; + } + } + // write missing characters at the end of the string + if(pos < s.len) + this->Writer::_do_write(s.sub(pos)); + 
this->Writer::_do_write('\''); +} + +template +void Emitter::_write_scalar_dquo(csubstr s, size_t ilevel) +{ + size_t pos = 0; // tracks the last character that was already written + this->Writer::_do_write('"'); + for(size_t i = 0; i < s.len; ++i) + { + const char curr = s.str[i]; + if(curr == '"' || curr == '\\') + { + csubstr sub = s.range(pos, i); + this->Writer::_do_write(sub); // write everything up to (excluding) this char + this->Writer::_do_write('\\'); // write the escape + this->Writer::_do_write(curr); // write the char + pos = i+1; + } + else if(s[i] == '\n') + { + csubstr sub = s.range(pos, i+1); + this->Writer::_do_write(sub); // write everything up to (including) this newline + this->Writer::_do_write('\n'); // write the newline again + if(i + 1 < s.len) + _rymlindent_nextline() // indent the next line + pos = i+1; + if(i+1 < s.len) // escape leading whitespace after the newline + { + const char next = s.str[i+1]; + if(next == ' ' || next == '\t') + this->Writer::_do_write('\\'); + } + } + else if(curr == ' ' || curr == '\t') + { + // escape trailing whitespace before a newline + size_t next = s.first_not_of(" \t\r", i); + if(next != npos && s[next] == '\n') + { + csubstr sub = s.range(pos, i); + this->Writer::_do_write(sub); // write everything up to (excluding) this char + this->Writer::_do_write('\\'); // escape the whitespace + pos = i; + } + } + else if(C4_UNLIKELY(curr == '\r')) + { + csubstr sub = s.range(pos, i); + this->Writer::_do_write(sub); // write everything up to (excluding) this char + this->Writer::_do_write("\\r"); // write the escaped char + pos = i+1; + } + } + // write missing characters at the end of the string + if(pos < s.len) + { + csubstr sub = s.sub(pos); + this->Writer::_do_write(sub); + } + this->Writer::_do_write('"'); +} + +template +void Emitter::_write_scalar_plain(csubstr s, size_t ilevel) +{ + size_t pos = 0; // tracks the last character that was already written + for(size_t i = 0; i < s.len; ++i) + { + const char curr = s.str[i]; + if(curr == '\n') + { + csubstr sub = s.range(pos, i+1); + this->Writer::_do_write(sub); // write everything up to (including) this newline + this->Writer::_do_write('\n'); // write the newline again + if(i + 1 < s.len) + _rymlindent_nextline() // indent the next line + pos = i+1; + } + } + // write missing characters at the end of the string + if(pos < s.len) + { + csubstr sub = s.sub(pos); + this->Writer::_do_write(sub); + } +} + +#undef _rymlindent_nextline + +template +void Emitter::_write_scalar(csubstr s, bool was_quoted) +{ + // this block of code needed to be moved to before the needs_quotes + // assignment to work around a g++ optimizer bug where (s.str != nullptr) + // was evaluated as true even if s.str was actually a nullptr (!!!) + if(s.len == size_t(0)) + { + if(was_quoted || s.str != nullptr) + this->Writer::_do_write("''"); + return; + } + + const bool needs_quotes = ( + was_quoted + || + ( + ( ! s.is_number()) + && + ( + // has leading whitespace + // looks like reference or anchor + // would be treated as a directive + // see https://www.yaml.info/learn/quote.html#noplain + s.begins_with_any(" \n\t\r*&%@`") + || + s.begins_with("<<") + || + // has trailing whitespace + s.ends_with_any(" \n\t\r") + || + // has special chars + (s.first_of("#:-?,\n{}[]'\"") != npos) + ) + ) + ); + + if( ! 
needs_quotes) + { + this->Writer::_do_write(s); + } + else + { + const bool has_dquotes = s.first_of( '"') != npos; + const bool has_squotes = s.first_of('\'') != npos; + if(!has_squotes && has_dquotes) + { + this->Writer::_do_write('\''); + this->Writer::_do_write(s); + this->Writer::_do_write('\''); + } + else if(has_squotes && !has_dquotes) + { + RYML_ASSERT(s.count('\n') == 0); + this->Writer::_do_write('"'); + this->Writer::_do_write(s); + this->Writer::_do_write('"'); + } + else + { + _write_scalar_squo(s, /*FIXME FIXME FIXME*/0); + } + } +} +template +void Emitter::_write_scalar_json(csubstr s, bool as_key, bool use_quotes) +{ + if((!use_quotes) + // json keys require quotes + && (!as_key) + && ( + // do not quote special cases + (s == "true" || s == "false" || s == "null") + || ( + // do not quote numbers + (s.is_number() + && ( + // quote integral numbers if they have a leading 0 + // https://github.com/biojppm/rapidyaml/issues/291 + (!(s.len > 1 && s.begins_with('0'))) + // do not quote reals with leading 0 + // https://github.com/biojppm/rapidyaml/issues/313 + || (s.find('.') != csubstr::npos) )) + ) + ) + ) + { + this->Writer::_do_write(s); + } + else + { + size_t pos = 0; + this->Writer::_do_write('"'); + for(size_t i = 0; i < s.len; ++i) + { + switch(s.str[i]) + { + case '"': + this->Writer ::_do_write(s.range(pos, i)); + this->Writer ::_do_write("\\\""); + pos = i + 1; + break; + case '\n': + this->Writer ::_do_write(s.range(pos, i)); + this->Writer ::_do_write("\\n"); + pos = i + 1; + break; + case '\t': + this->Writer ::_do_write(s.range(pos, i)); + this->Writer ::_do_write("\\t"); + pos = i + 1; + break; + case '\\': + this->Writer ::_do_write(s.range(pos, i)); + this->Writer ::_do_write("\\\\"); + pos = i + 1; + break; + case '\r': + this->Writer ::_do_write(s.range(pos, i)); + this->Writer ::_do_write("\\r"); + pos = i + 1; + break; + case '\b': + this->Writer ::_do_write(s.range(pos, i)); + this->Writer ::_do_write("\\b"); + pos = i + 1; + break; + case '\f': + this->Writer ::_do_write(s.range(pos, i)); + this->Writer ::_do_write("\\f"); + pos = i + 1; + break; + } + } + if(pos < s.len) + { + csubstr sub = s.sub(pos); + this->Writer::_do_write(sub); + } + this->Writer::_do_write('"'); + } +} + +} // namespace yml +} // namespace c4 + +#endif /* _C4_YML_EMIT_DEF_HPP_ */ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/emit.def.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/detail/stack.hpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/detail/stack.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_YML_DETAIL_STACK_HPP_ +#define _C4_YML_DETAIL_STACK_HPP_ + +#ifndef _C4_YML_COMMON_HPP_ +//included above: +//#include "../common.hpp" +#endif + +#ifdef RYML_DBG +//included above: +//# include +#endif + +//included above: +//#include + +namespace c4 { +namespace yml { +namespace detail { + +/** A lightweight contiguous stack with SSO. This avoids a dependency on std. 
*/ +template +class stack +{ + static_assert(std::is_trivially_copyable::value, "T must be trivially copyable"); + static_assert(std::is_trivially_destructible::value, "T must be trivially destructible"); + + enum : size_t { sso_size = N }; + +public: + + T m_buf[N]; + T * m_stack; + size_t m_size; + size_t m_capacity; + Callbacks m_callbacks; + +public: + + constexpr static bool is_contiguous() { return true; } + + stack(Callbacks const& cb) + : m_buf() + , m_stack(m_buf) + , m_size(0) + , m_capacity(N) + , m_callbacks(cb) {} + stack() : stack(get_callbacks()) {} + ~stack() + { + _free(); + } + + stack(stack const& that) noexcept : stack(that.m_callbacks) + { + resize(that.m_size); + _cp(&that); + } + + stack(stack &&that) noexcept : stack(that.m_callbacks) + { + _mv(&that); + } + + stack& operator= (stack const& that) noexcept + { + _cb(that.m_callbacks); + resize(that.m_size); + _cp(&that); + return *this; + } + + stack& operator= (stack &&that) noexcept + { + _cb(that.m_callbacks); + _mv(&that); + return *this; + } + +public: + + size_t size() const { return m_size; } + size_t empty() const { return m_size == 0; } + size_t capacity() const { return m_capacity; } + + void clear() + { + m_size = 0; + } + + void resize(size_t sz) + { + reserve(sz); + m_size = sz; + } + + void reserve(size_t sz); + + void push(T const& C4_RESTRICT n) + { + RYML_ASSERT((const char*)&n + sizeof(T) < (const char*)m_stack || &n > m_stack + m_capacity); + if(m_size == m_capacity) + { + size_t cap = m_capacity == 0 ? N : 2 * m_capacity; + reserve(cap); + } + m_stack[m_size] = n; + ++m_size; + } + + void push_top() + { + RYML_ASSERT(m_size > 0); + if(m_size == m_capacity) + { + size_t cap = m_capacity == 0 ? N : 2 * m_capacity; + reserve(cap); + } + m_stack[m_size] = m_stack[m_size - 1]; + ++m_size; + } + + T const& C4_RESTRICT pop() + { + RYML_ASSERT(m_size > 0); + --m_size; + return m_stack[m_size]; + } + + C4_ALWAYS_INLINE T const& C4_RESTRICT top() const { RYML_ASSERT(m_size > 0); return m_stack[m_size - 1]; } + C4_ALWAYS_INLINE T & C4_RESTRICT top() { RYML_ASSERT(m_size > 0); return m_stack[m_size - 1]; } + + C4_ALWAYS_INLINE T const& C4_RESTRICT bottom() const { RYML_ASSERT(m_size > 0); return m_stack[0]; } + C4_ALWAYS_INLINE T & C4_RESTRICT bottom() { RYML_ASSERT(m_size > 0); return m_stack[0]; } + + C4_ALWAYS_INLINE T const& C4_RESTRICT top(size_t i) const { RYML_ASSERT(i < m_size); return m_stack[m_size - 1 - i]; } + C4_ALWAYS_INLINE T & C4_RESTRICT top(size_t i) { RYML_ASSERT(i < m_size); return m_stack[m_size - 1 - i]; } + + C4_ALWAYS_INLINE T const& C4_RESTRICT bottom(size_t i) const { RYML_ASSERT(i < m_size); return m_stack[i]; } + C4_ALWAYS_INLINE T & C4_RESTRICT bottom(size_t i) { RYML_ASSERT(i < m_size); return m_stack[i]; } + + C4_ALWAYS_INLINE T const& C4_RESTRICT operator[](size_t i) const { RYML_ASSERT(i < m_size); return m_stack[i]; } + C4_ALWAYS_INLINE T & C4_RESTRICT operator[](size_t i) { RYML_ASSERT(i < m_size); return m_stack[i]; } + +public: + + using iterator = T *; + using const_iterator = T const *; + + iterator begin() { return m_stack; } + iterator end () { return m_stack + m_size; } + + const_iterator begin() const { return (const_iterator)m_stack; } + const_iterator end () const { return (const_iterator)m_stack + m_size; } + +public: + void _free(); + void _cp(stack const* C4_RESTRICT that); + void _mv(stack * that); + void _cb(Callbacks const& cb); +}; + + +//----------------------------------------------------------------------------- 
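+// [editorial sketch] The class above is a small-size-optimized (SSO) stack:
+// up to N elements live in the in-object buffer m_buf, and only when the size
+// exceeds N does reserve() switch to memory obtained through the Callbacks
+// allocator. A hedged, illustrative-only usage sketch follows (the element
+// type must be trivially copyable and trivially destructible, as enforced by
+// the static_asserts above); the function name is hypothetical:
+#if 0 // example only, not compiled as part of the library
+inline void stack_sso_example()
+{
+    stack<int, 4> s;          // 4-element in-object buffer; no heap allocation yet
+    for(int i = 0; i < 4; ++i)
+        s.push(i);            // all four elements stored in m_buf
+    s.push(4);                // size exceeds N: reserve() moves the data to the heap
+    int last = s.pop();       // last == 4; the heap buffer is kept until destruction
+    (void)last;
+}
+#endif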
+//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +template +void stack::reserve(size_t sz) +{ + if(sz <= m_size) + return; + if(sz <= N) + { + m_stack = m_buf; + m_capacity = N; + return; + } + T *buf = (T*) m_callbacks.m_allocate(sz * sizeof(T), m_stack, m_callbacks.m_user_data); + memcpy(buf, m_stack, m_size * sizeof(T)); + if(m_stack != m_buf) + { + m_callbacks.m_free(m_stack, m_capacity * sizeof(T), m_callbacks.m_user_data); + } + m_stack = buf; + m_capacity = sz; +} + + +//----------------------------------------------------------------------------- + +template +void stack::_free() +{ + RYML_ASSERT(m_stack != nullptr); // this structure cannot be memset() to zero + if(m_stack != m_buf) + { + m_callbacks.m_free(m_stack, m_capacity * sizeof(T), m_callbacks.m_user_data); + m_stack = m_buf; + m_size = N; + m_capacity = N; + } + else + { + RYML_ASSERT(m_capacity == N); + } +} + + +//----------------------------------------------------------------------------- + +template +void stack::_cp(stack const* C4_RESTRICT that) +{ + if(that->m_stack != that->m_buf) + { + RYML_ASSERT(that->m_capacity > N); + RYML_ASSERT(that->m_size <= that->m_capacity); + } + else + { + RYML_ASSERT(that->m_capacity <= N); + RYML_ASSERT(that->m_size <= that->m_capacity); + } + memcpy(m_stack, that->m_stack, that->m_size * sizeof(T)); + m_size = that->m_size; + m_capacity = that->m_size < N ? N : that->m_size; + m_callbacks = that->m_callbacks; +} + + +//----------------------------------------------------------------------------- + +template +void stack::_mv(stack * that) +{ + if(that->m_stack != that->m_buf) + { + RYML_ASSERT(that->m_capacity > N); + RYML_ASSERT(that->m_size <= that->m_capacity); + m_stack = that->m_stack; + } + else + { + RYML_ASSERT(that->m_capacity <= N); + RYML_ASSERT(that->m_size <= that->m_capacity); + memcpy(m_buf, that->m_buf, that->m_size * sizeof(T)); + m_stack = m_buf; + } + m_size = that->m_size; + m_capacity = that->m_capacity; + m_callbacks = that->m_callbacks; + // make sure no deallocation happens on destruction + RYML_ASSERT(that->m_stack != m_buf); + that->m_stack = that->m_buf; + that->m_capacity = N; + that->m_size = 0; +} + + +//----------------------------------------------------------------------------- + +template +void stack::_cb(Callbacks const& cb) +{ + if(cb != m_callbacks) + { + _free(); + m_callbacks = cb; + } +} + +} // namespace detail +} // namespace yml +} // namespace c4 + +#endif /* _C4_YML_DETAIL_STACK_HPP_ */ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/detail/stack.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/parse.hpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/parse.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_YML_PARSE_HPP_ +#define _C4_YML_PARSE_HPP_ + +#ifndef _C4_YML_TREE_HPP_ +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/tree.hpp +//#include "c4/yml/tree.hpp" +#if !defined(C4_YML_TREE_HPP_) && !defined(_C4_YML_TREE_HPP_) +#error "amalgamate: file c4/yml/tree.hpp must have been included at this point" +#endif /* C4_YML_TREE_HPP_ */ + +#endif + +#ifndef _C4_YML_NODE_HPP_ +// amalgamate: removed 
include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/node.hpp +//#include "c4/yml/node.hpp" +#if !defined(C4_YML_NODE_HPP_) && !defined(_C4_YML_NODE_HPP_) +#error "amalgamate: file c4/yml/node.hpp must have been included at this point" +#endif /* C4_YML_NODE_HPP_ */ + +#endif + +#ifndef _C4_YML_DETAIL_STACK_HPP_ +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/detail/stack.hpp +//#include "c4/yml/detail/stack.hpp" +#if !defined(C4_YML_DETAIL_STACK_HPP_) && !defined(_C4_YML_DETAIL_STACK_HPP_) +#error "amalgamate: file c4/yml/detail/stack.hpp must have been included at this point" +#endif /* C4_YML_DETAIL_STACK_HPP_ */ + +#endif + +//included above: +//#include + +#if defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable: 4251/*needs to have dll-interface to be used by clients of struct*/) +#endif + +namespace c4 { +namespace yml { + +struct RYML_EXPORT ParserOptions +{ +private: + + typedef enum : uint32_t { + LOCATIONS = (1 << 0), + DEFAULTS = 0, + } Flags_e; + + uint32_t flags = DEFAULTS; +public: + ParserOptions() = default; + + /** @name source location tracking */ + /** @{ */ + + /** enable/disable source location tracking */ + ParserOptions& locations(bool enabled) + { + if(enabled) + flags |= LOCATIONS; + else + flags &= ~LOCATIONS; + return *this; + } + bool locations() const { return (flags & LOCATIONS) != 0u; } + + /** @} */ +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +class RYML_EXPORT Parser +{ +public: + + /** @name construction and assignment */ + /** @{ */ + + Parser(Callbacks const& cb, ParserOptions opts={}); + Parser(ParserOptions opts={}) : Parser(get_callbacks(), opts) {} + ~Parser(); + + Parser(Parser &&); + Parser(Parser const&); + Parser& operator=(Parser &&); + Parser& operator=(Parser const&); + + /** @} */ + +public: + + /** @name modifiers */ + /** @{ */ + + /** Reserve a certain capacity for the parsing stack. + * This should be larger than the expected depth of the parsed + * YAML tree. + * + * The parsing stack is the only (potential) heap memory used by + * the parser. + * + * If the requested capacity is below the default + * stack size of 16, the memory is used directly in the parser + * object; otherwise it will be allocated from the heap. + * + * @note this reserves memory only for the parser itself; all the + * allocations for the parsed tree will go through the tree's + * allocator. + * + * @note the tree and the arena can (and should) also be reserved. */ + void reserve_stack(size_t capacity) + { + m_stack.reserve(capacity); + } + + /** Reserve a certain capacity for the array used to track node + * locations in the source buffer. */ + void reserve_locations(size_t num_source_lines) + { + _resize_locations(num_source_lines); + } + + /** Reserve a certain capacity for the character arena used to + * filter scalars. */ + void reserve_filter_arena(size_t num_characters) + { + _resize_filter_arena(num_characters); + } + + /** @} */ + +public: + + /** @name getters and modifiers */ + /** @{ */ + + /** Get the current callbacks in the parser. */ + Callbacks callbacks() const { return m_stack.m_callbacks; } + + /** Get the name of the latest file parsed by this object. */ + csubstr filename() const { return m_file; } + + /** Get the latest YAML buffer parsed by this object. 
*/ + csubstr source() const { return m_buf; } + + size_t stack_capacity() const { return m_stack.capacity(); } + size_t locations_capacity() const { return m_newline_offsets_capacity; } + size_t filter_arena_capacity() const { return m_filter_arena.len; } + + ParserOptions const& options() const { return m_options; } + + /** @} */ + +public: + + /** @name parse_in_place */ + /** @{ */ + + /** Create a new tree and parse into its root. + * The tree is created with the callbacks currently in the parser. */ + Tree parse_in_place(csubstr filename, substr src) + { + Tree t(callbacks()); + t.reserve(_estimate_capacity(src)); + this->parse_in_place(filename, src, &t, t.root_id()); + return t; + } + + /** Parse into an existing tree, starting at its root node. + * The callbacks in the tree are kept, and used to allocate + * the tree members, if any allocation is required. */ + void parse_in_place(csubstr filename, substr src, Tree *t) + { + this->parse_in_place(filename, src, t, t->root_id()); + } + + /** Parse into an existing node. + * The callbacks in the tree are kept, and used to allocate + * the tree members, if any allocation is required. */ + void parse_in_place(csubstr filename, substr src, Tree *t, size_t node_id); + // ^^^^^^^^^^^^^ this is the workhorse overload; everything else is syntactic candy + + /** Parse into an existing node. + * The callbacks in the tree are kept, and used to allocate + * the tree members, if any allocation is required. */ + void parse_in_place(csubstr filename, substr src, NodeRef node) + { + this->parse_in_place(filename, src, node.tree(), node.id()); + } + + RYML_DEPRECATED("use parse_in_place() instead") Tree parse(csubstr filename, substr src) { return parse_in_place(filename, src); } + RYML_DEPRECATED("use parse_in_place() instead") void parse(csubstr filename, substr src, Tree *t) { parse_in_place(filename, src, t); } + RYML_DEPRECATED("use parse_in_place() instead") void parse(csubstr filename, substr src, Tree *t, size_t node_id) { parse_in_place(filename, src, t, node_id); } + RYML_DEPRECATED("use parse_in_place() instead") void parse(csubstr filename, substr src, NodeRef node) { parse_in_place(filename, src, node); } + + /** @} */ + +public: + + /** @name parse_in_arena: copy the YAML source buffer to the + * tree's arena, then parse the copy in situ + * + * @note overloads receiving a substr YAML buffer are intentionally + * left undefined, such that calling parse_in_arena() with a substr + * will cause a linker error. This is to prevent an accidental + * copy of the source buffer to the tree's arena, because substr + * is implicitly convertible to csubstr. If you really intend to parse + * a mutable buffer in the tree's arena, convert it first to immutable + * by assigning the substr to a csubstr prior to calling parse_in_arena(). + * This is not needed for parse_in_place() because csubstr is not + * implicitly convertible to substr. */ + /** @{ */ + + // READ THE NOTE ABOVE! + #define RYML_DONT_PARSE_SUBSTR_IN_ARENA "Do not pass a (mutable) substr to parse_in_arena(); if you have a substr, it should be parsed in place. Consider using parse_in_place() instead, or convert the buffer to csubstr prior to calling. This function is deliberately left undefined and will cause a linker error." 
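+    // [editorial sketch] A hedged usage example of this parser API, kept as a
+    // comment so the class declaration is unchanged; the YAML text, filenames
+    // and sizes below are illustrative only:
+    //
+    //     Parser parser;                          // default ParserOptions
+    //     parser.reserve_stack(64);               // larger than the expected tree depth
+    //
+    //     // read-only source: copied into the tree's arena, then parsed there
+    //     csubstr src = "{foo: 1, bar: [2, 3]}";
+    //     Tree t = parser.parse_in_arena("inline.yml", src);
+    //
+    //     // mutable source: parsed in place, no copy (the buffer must outlive the tree)
+    //     char buf[] = "{baz: 4}";
+    //     Tree t2 = parser.parse_in_place("buf.yml", substr(buf, sizeof(buf) - 1));
+    //
+    //     // to parse a mutable buffer in the arena instead, convert it to csubstr
+    //     // first (see the note above); passing a substr is a deliberate link error
+    //     char buf2[] = "{qux: 5}";
+    //     csubstr ro = substr(buf2, sizeof(buf2) - 1);
+    //     Tree t3 = parser.parse_in_arena("arena.yml", ro);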
+ RYML_DEPRECATED(RYML_DONT_PARSE_SUBSTR_IN_ARENA) Tree parse_in_arena(csubstr filename, substr csrc); + RYML_DEPRECATED(RYML_DONT_PARSE_SUBSTR_IN_ARENA) void parse_in_arena(csubstr filename, substr csrc, Tree *t); + RYML_DEPRECATED(RYML_DONT_PARSE_SUBSTR_IN_ARENA) void parse_in_arena(csubstr filename, substr csrc, Tree *t, size_t node_id); + RYML_DEPRECATED(RYML_DONT_PARSE_SUBSTR_IN_ARENA) void parse_in_arena(csubstr filename, substr csrc, NodeRef node); + + /** Create a new tree and parse into its root. + * The immutable YAML source is first copied to the tree's arena, + * and parsed from there. + * The callbacks in the tree are kept, and used to allocate + * the tree members, if any allocation is required. */ + Tree parse_in_arena(csubstr filename, csubstr csrc) + { + Tree t(callbacks()); + substr src = t.copy_to_arena(csrc); + t.reserve(_estimate_capacity(csrc)); + this->parse_in_place(filename, src, &t, t.root_id()); + return t; + } + + /** Parse into an existing tree, starting at its root node. + * The immutable YAML source is first copied to the tree's arena, + * and parsed from there. + * The callbacks in the tree are kept, and used to allocate + * the tree members, if any allocation is required. */ + void parse_in_arena(csubstr filename, csubstr csrc, Tree *t) + { + substr src = t->copy_to_arena(csrc); + this->parse_in_place(filename, src, t, t->root_id()); + } + + /** Parse into a specific node in an existing tree. + * The immutable YAML source is first copied to the tree's arena, + * and parsed from there. + * The callbacks in the tree are kept, and used to allocate + * the tree members, if any allocation is required. */ + void parse_in_arena(csubstr filename, csubstr csrc, Tree *t, size_t node_id) + { + substr src = t->copy_to_arena(csrc); + this->parse_in_place(filename, src, t, node_id); + } + + /** Parse into a specific node in an existing tree. + * The immutable YAML source is first copied to the tree's arena, + * and parsed from there. + * The callbacks in the tree are kept, and used to allocate + * the tree members, if any allocation is required. */ + void parse_in_arena(csubstr filename, csubstr csrc, NodeRef node) + { + substr src = node.tree()->copy_to_arena(csrc); + this->parse_in_place(filename, src, node.tree(), node.id()); + } + + RYML_DEPRECATED("use parse_in_arena() instead") Tree parse(csubstr filename, csubstr csrc) { return parse_in_arena(filename, csrc); } + RYML_DEPRECATED("use parse_in_arena() instead") void parse(csubstr filename, csubstr csrc, Tree *t) { parse_in_arena(filename, csrc, t); } + RYML_DEPRECATED("use parse_in_arena() instead") void parse(csubstr filename, csubstr csrc, Tree *t, size_t node_id) { parse_in_arena(filename, csrc, t, node_id); } + RYML_DEPRECATED("use parse_in_arena() instead") void parse(csubstr filename, csubstr csrc, NodeRef node) { parse_in_arena(filename, csrc, node); } + + /** @} */ + +public: + + /** @name locations */ + /** @{ */ + + /** Get the location of a node of the last tree to be parsed by this parser. */ + Location location(Tree const& tree, size_t node_id) const; + /** Get the location of a node of the last tree to be parsed by this parser. */ + Location location(ConstNodeRef node) const; + /** Get the string starting at a particular location, to the end + * of the parsed source buffer. */ + csubstr location_contents(Location const& loc) const; + /** Given a pointer to a buffer position, get the location. @p val + * must be pointing to somewhere in the source buffer that was + * last parsed by this object. 
*/ + Location val_location(const char *val) const; + + /** @} */ + +private: + + typedef enum { + BLOCK_LITERAL, //!< keep newlines (|) + BLOCK_FOLD //!< replace newline with single space (>) + } BlockStyle_e; + + typedef enum { + CHOMP_CLIP, //!< single newline at end (default) + CHOMP_STRIP, //!< no newline at end (-) + CHOMP_KEEP //!< all newlines from end (+) + } BlockChomp_e; + +private: + + using flag_t = int; + + static size_t _estimate_capacity(csubstr src) { size_t c = _count_nlines(src); c = c >= 16 ? c : 16; return c; } + + void _reset(); + + bool _finished_file() const; + bool _finished_line() const; + + csubstr _peek_next_line(size_t pos=npos) const; + bool _advance_to_peeked(); + void _scan_line(); + + csubstr _slurp_doc_scalar(); + + /** + * @param [out] quoted + * Will only be written to if this method returns true. + * Will be set to true if the scanned scalar was quoted, by '', "", > or |. + */ + bool _scan_scalar_seq_blck(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted); + bool _scan_scalar_map_blck(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted); + bool _scan_scalar_seq_flow(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted); + bool _scan_scalar_map_flow(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted); + bool _scan_scalar_unk(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted); + + csubstr _scan_comment(); + csubstr _scan_squot_scalar(); + csubstr _scan_dquot_scalar(); + csubstr _scan_block(); + substr _scan_plain_scalar_blck(csubstr currscalar, csubstr peeked_line, size_t indentation); + substr _scan_plain_scalar_flow(csubstr currscalar, csubstr peeked_line); + substr _scan_complex_key(csubstr currscalar, csubstr peeked_line); + csubstr _scan_to_next_nonempty_line(size_t indentation); + csubstr _extend_scanned_scalar(csubstr currscalar); + + csubstr _filter_squot_scalar(const substr s); + csubstr _filter_dquot_scalar(substr s); + csubstr _filter_plain_scalar(substr s, size_t indentation); + csubstr _filter_block_scalar(substr s, BlockStyle_e style, BlockChomp_e chomp, size_t indentation); + template + bool _filter_nl(substr scalar, size_t *C4_RESTRICT pos, size_t *C4_RESTRICT filter_arena_pos, size_t indentation); + template + void _filter_ws(substr scalar, size_t *C4_RESTRICT pos, size_t *C4_RESTRICT filter_arena_pos); + bool _apply_chomp(substr buf, size_t *C4_RESTRICT pos, BlockChomp_e chomp); + + void _handle_finished_file(); + void _handle_line(); + + bool _handle_indentation(); + + bool _handle_unk(); + bool _handle_map_flow(); + bool _handle_map_blck(); + bool _handle_seq_flow(); + bool _handle_seq_blck(); + bool _handle_top(); + bool _handle_types(); + bool _handle_key_anchors_and_refs(); + bool _handle_val_anchors_and_refs(); + void _move_val_tag_to_key_tag(); + void _move_key_tag_to_val_tag(); + void _move_key_tag2_to_key_tag(); + void _move_val_anchor_to_key_anchor(); + void _move_key_anchor_to_val_anchor(); + + void _push_level(bool explicit_flow_chars = false); + void _pop_level(); + + void _start_unk(bool as_child=true); + + void _start_map(bool as_child=true); + void _start_map_unk(bool as_child); + void _stop_map(); + + void _start_seq(bool as_child=true); + void _stop_seq(); + + void _start_seqimap(); + void _stop_seqimap(); + + void _start_doc(bool as_child=true); + void _stop_doc(); + void _start_new_doc(csubstr rem); + void _end_stream(); + + NodeData* _append_val(csubstr val, flag_t quoted=false); + NodeData* _append_key_val(csubstr val, flag_t val_quoted=false); + bool _rval_dash_start_or_continue_seq(); + + void 
_store_scalar(csubstr s, flag_t is_quoted); + csubstr _consume_scalar(); + void _move_scalar_from_top(); + + inline NodeData* _append_val_null(const char *str) { _RYML_CB_ASSERT(m_stack.m_callbacks, str >= m_buf.begin() && str <= m_buf.end()); return _append_val({nullptr, size_t(0)}); } + inline NodeData* _append_key_val_null(const char *str) { _RYML_CB_ASSERT(m_stack.m_callbacks, str >= m_buf.begin() && str <= m_buf.end()); return _append_key_val({nullptr, size_t(0)}); } + inline void _store_scalar_null(const char *str) { _RYML_CB_ASSERT(m_stack.m_callbacks, str >= m_buf.begin() && str <= m_buf.end()); _store_scalar({nullptr, size_t(0)}, false); } + + void _set_indentation(size_t behind); + void _save_indentation(size_t behind=0); + bool _maybe_set_indentation_from_anchor_or_tag(); + + void _write_key_anchor(size_t node_id); + void _write_val_anchor(size_t node_id); + + void _handle_directive(csubstr directive); + + void _skipchars(char c); + template + void _skipchars(const char (&chars)[N]); + +private: + + static size_t _count_nlines(csubstr src); + +private: + + typedef enum : flag_t { + RTOP = 0x01 << 0, ///< reading at top level + RUNK = 0x01 << 1, ///< reading an unknown: must determine whether scalar, map or seq + RMAP = 0x01 << 2, ///< reading a map + RSEQ = 0x01 << 3, ///< reading a seq + FLOW = 0x01 << 4, ///< reading is inside explicit flow chars: [] or {} + QMRK = 0x01 << 5, ///< reading an explicit key (`? key`) + RKEY = 0x01 << 6, ///< reading a scalar as key + RVAL = 0x01 << 7, ///< reading a scalar as val + RNXT = 0x01 << 8, ///< read next val or keyval + SSCL = 0x01 << 9, ///< there's a stored scalar + QSCL = 0x01 << 10, ///< stored scalar was quoted + RSET = 0x01 << 11, ///< the (implicit) map being read is a !!set. @see https://yaml.org/type/set.html + NDOC = 0x01 << 12, ///< no document mode. a document has ended and another has not started yet. + //! reading an implicit map nested in an explicit seq. + //! eg, {key: [key2: value2, key3: value3]} + //! 
is parsed as {key: [{key2: value2}, {key3: value3}]} + RSEQIMAP = 0x01 << 13, + } State_e; + + struct LineContents + { + csubstr full; ///< the full line, including newlines on the right + csubstr stripped; ///< the stripped line, excluding newlines on the right + csubstr rem; ///< the stripped line remainder; initially starts at the first non-space character + size_t indentation; ///< the number of spaces on the beginning of the line + + LineContents() : full(), stripped(), rem(), indentation() {} + + void reset_with_next_line(csubstr buf, size_t pos); + + void reset(csubstr full_, csubstr stripped_) + { + full = full_; + stripped = stripped_; + rem = stripped_; + // find the first column where the character is not a space + indentation = full.first_not_of(' '); + } + + size_t current_col() const + { + return current_col(rem); + } + + size_t current_col(csubstr s) const + { + RYML_ASSERT(s.str >= full.str); + RYML_ASSERT(full.is_super(s)); + size_t col = static_cast(s.str - full.str); + return col; + } + }; + + struct State + { + flag_t flags; + size_t level; + size_t node_id; // don't hold a pointer to the node as it will be relocated during tree resizes + csubstr scalar; + size_t scalar_col; // the column where the scalar (or its quotes) begin + + Location pos; + LineContents line_contents; + size_t indref; + + State() : flags(), level(), node_id(), scalar(), scalar_col(), pos(), line_contents(), indref() {} + + void reset(const char *file, size_t node_id_) + { + flags = RUNK|RTOP; + level = 0; + pos.name = to_csubstr(file); + pos.offset = 0; + pos.line = 1; + pos.col = 1; + node_id = node_id_; + scalar_col = 0; + scalar.clear(); + indref = 0; + } + }; + + void _line_progressed(size_t ahead); + void _line_ended(); + void _line_ended_undo(); + + void _prepare_pop() + { + RYML_ASSERT(m_stack.size() > 1); + State const& curr = m_stack.top(); + State & next = m_stack.top(1); + next.pos = curr.pos; + next.line_contents = curr.line_contents; + next.scalar = curr.scalar; + } + + inline bool _at_line_begin() const + { + return m_state->line_contents.rem.begin() == m_state->line_contents.full.begin(); + } + inline bool _at_line_end() const + { + csubstr r = m_state->line_contents.rem; + return r.empty() || r.begins_with(' ', r.len); + } + inline bool _token_is_from_this_line(csubstr token) const + { + return token.is_sub(m_state->line_contents.full); + } + + inline NodeData * node(State const* s) const { return m_tree->get(s->node_id); } + inline NodeData * node(State const& s) const { return m_tree->get(s .node_id); } + inline NodeData * node(size_t node_id) const { return m_tree->get( node_id); } + + inline bool has_all(flag_t f) const { return (m_state->flags & f) == f; } + inline bool has_any(flag_t f) const { return (m_state->flags & f) != 0; } + inline bool has_none(flag_t f) const { return (m_state->flags & f) == 0; } + + static inline bool has_all(flag_t f, State const* s) { return (s->flags & f) == f; } + static inline bool has_any(flag_t f, State const* s) { return (s->flags & f) != 0; } + static inline bool has_none(flag_t f, State const* s) { return (s->flags & f) == 0; } + + inline void set_flags(flag_t f) { set_flags(f, m_state); } + inline void add_flags(flag_t on) { add_flags(on, m_state); } + inline void addrem_flags(flag_t on, flag_t off) { addrem_flags(on, off, m_state); } + inline void rem_flags(flag_t off) { rem_flags(off, m_state); } + + void set_flags(flag_t f, State * s); + void add_flags(flag_t on, State * s); + void addrem_flags(flag_t on, flag_t off, State * s); + void 
rem_flags(flag_t off, State * s); + + void _resize_filter_arena(size_t num_characters); + void _grow_filter_arena(size_t num_characters); + substr _finish_filter_arena(substr dst, size_t pos); + + void _prepare_locations(); + void _resize_locations(size_t sz); + bool _locations_dirty() const; + + bool _location_from_cont(Tree const& tree, size_t node, Location *C4_RESTRICT loc) const; + bool _location_from_node(Tree const& tree, size_t node, Location *C4_RESTRICT loc, size_t level) const; + +private: + + void _free(); + void _clr(); + void _cp(Parser const* that); + void _mv(Parser *that); + +#ifdef RYML_DBG + template void _dbg(csubstr fmt, Args const& C4_RESTRICT ...args) const; +#endif + template void _err(csubstr fmt, Args const& C4_RESTRICT ...args) const; + template void _fmt_msg(DumpFn &&dumpfn) const; + static csubstr _prfl(substr buf, flag_t v); + +private: + + ParserOptions m_options; + + csubstr m_file; + substr m_buf; + + size_t m_root_id; + Tree * m_tree; + + detail::stack m_stack; + State * m_state; + + size_t m_key_tag_indentation; + size_t m_key_tag2_indentation; + csubstr m_key_tag; + csubstr m_key_tag2; + size_t m_val_tag_indentation; + csubstr m_val_tag; + + bool m_key_anchor_was_before; + size_t m_key_anchor_indentation; + csubstr m_key_anchor; + size_t m_val_anchor_indentation; + csubstr m_val_anchor; + + substr m_filter_arena; + + size_t *m_newline_offsets; + size_t m_newline_offsets_size; + size_t m_newline_offsets_capacity; + csubstr m_newline_offsets_buf; +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** @name parse_in_place + * + * @desc parse a mutable YAML source buffer. + * + * @note These freestanding functions use a temporary parser object, + * and are convenience functions to easily parse YAML without the need + * to instantiate a separate parser. Note that some properties + * (notably node locations in the original source code) are only + * available through the parser object after it has parsed the + * code. If you need access to any of these properties, use + * Parser::parse_in_place() */ +/** @{ */ + +inline Tree parse_in_place( substr yaml ) { Parser np; return np.parse_in_place({} , yaml); } //!< parse in-situ a modifiable YAML source buffer. +inline Tree parse_in_place(csubstr filename, substr yaml ) { Parser np; return np.parse_in_place(filename, yaml); } //!< parse in-situ a modifiable YAML source buffer, providing a filename for error messages. +inline void parse_in_place( substr yaml, Tree *t ) { Parser np; np.parse_in_place({} , yaml, t); } //!< reusing the YAML tree, parse in-situ a modifiable YAML source buffer +inline void parse_in_place(csubstr filename, substr yaml, Tree *t ) { Parser np; np.parse_in_place(filename, yaml, t); } //!< reusing the YAML tree, parse in-situ a modifiable YAML source buffer, providing a filename for error messages. +inline void parse_in_place( substr yaml, Tree *t, size_t node_id) { Parser np; np.parse_in_place({} , yaml, t, node_id); } //!< reusing the YAML tree, parse in-situ a modifiable YAML source buffer +inline void parse_in_place(csubstr filename, substr yaml, Tree *t, size_t node_id) { Parser np; np.parse_in_place(filename, yaml, t, node_id); } //!< reusing the YAML tree, parse in-situ a modifiable YAML source buffer, providing a filename for error messages. 
+inline void parse_in_place( substr yaml, NodeRef node ) { Parser np; np.parse_in_place({} , yaml, node); } //!< reusing the YAML tree, parse in-situ a modifiable YAML source buffer +inline void parse_in_place(csubstr filename, substr yaml, NodeRef node ) { Parser np; np.parse_in_place(filename, yaml, node); } //!< reusing the YAML tree, parse in-situ a modifiable YAML source buffer, providing a filename for error messages. + +RYML_DEPRECATED("use parse_in_place() instead") inline Tree parse( substr yaml ) { Parser np; return np.parse_in_place({} , yaml); } +RYML_DEPRECATED("use parse_in_place() instead") inline Tree parse(csubstr filename, substr yaml ) { Parser np; return np.parse_in_place(filename, yaml); } +RYML_DEPRECATED("use parse_in_place() instead") inline void parse( substr yaml, Tree *t ) { Parser np; np.parse_in_place({} , yaml, t); } +RYML_DEPRECATED("use parse_in_place() instead") inline void parse(csubstr filename, substr yaml, Tree *t ) { Parser np; np.parse_in_place(filename, yaml, t); } +RYML_DEPRECATED("use parse_in_place() instead") inline void parse( substr yaml, Tree *t, size_t node_id) { Parser np; np.parse_in_place({} , yaml, t, node_id); } +RYML_DEPRECATED("use parse_in_place() instead") inline void parse(csubstr filename, substr yaml, Tree *t, size_t node_id) { Parser np; np.parse_in_place(filename, yaml, t, node_id); } +RYML_DEPRECATED("use parse_in_place() instead") inline void parse( substr yaml, NodeRef node ) { Parser np; np.parse_in_place({} , yaml, node); } +RYML_DEPRECATED("use parse_in_place() instead") inline void parse(csubstr filename, substr yaml, NodeRef node ) { Parser np; np.parse_in_place(filename, yaml, node); } + +/** @} */ + + +//----------------------------------------------------------------------------- + +/** @name parse_in_arena + * @desc parse a read-only YAML source buffer, copying it first to the tree's arena. + * + * @note These freestanding functions use a temporary parser object, + * and are convenience functions to easily parse YAML without the need + * to instantiate a separate parser. Note that some properties + * (notably node locations in the original source code) are only + * available through the parser object after it has parsed the + * code. If you need access to any of these properties, use + * Parser::parse_in_arena(). + * + * @note overloads receiving a substr YAML buffer are intentionally + * left undefined, such that calling parse_in_arena() with a substr + * will cause a linker error. This is to prevent an accidental + * copy of the source buffer to the tree's arena, because substr + * is implicitly convertible to csubstr. If you really intend to parse + * a mutable buffer in the tree's arena, convert it first to immutable + * by assigning the substr to a csubstr prior to calling parse_in_arena(). + * This is not needed for parse_in_place() because csubstr is not + * implicitly convertible to substr. */ +/** @{ */ + +/* READ THE NOTE ABOVE! 
*/ +RYML_DEPRECATED(RYML_DONT_PARSE_SUBSTR_IN_ARENA) Tree parse_in_arena( substr yaml ); +RYML_DEPRECATED(RYML_DONT_PARSE_SUBSTR_IN_ARENA) Tree parse_in_arena(csubstr filename, substr yaml ); +RYML_DEPRECATED(RYML_DONT_PARSE_SUBSTR_IN_ARENA) void parse_in_arena( substr yaml, Tree *t ); +RYML_DEPRECATED(RYML_DONT_PARSE_SUBSTR_IN_ARENA) void parse_in_arena(csubstr filename, substr yaml, Tree *t ); +RYML_DEPRECATED(RYML_DONT_PARSE_SUBSTR_IN_ARENA) void parse_in_arena( substr yaml, Tree *t, size_t node_id); +RYML_DEPRECATED(RYML_DONT_PARSE_SUBSTR_IN_ARENA) void parse_in_arena(csubstr filename, substr yaml, Tree *t, size_t node_id); +RYML_DEPRECATED(RYML_DONT_PARSE_SUBSTR_IN_ARENA) void parse_in_arena( substr yaml, NodeRef node ); +RYML_DEPRECATED(RYML_DONT_PARSE_SUBSTR_IN_ARENA) void parse_in_arena(csubstr filename, substr yaml, NodeRef node ); + +inline Tree parse_in_arena( csubstr yaml ) { Parser np; return np.parse_in_arena({} , yaml); } //!< parse a read-only YAML source buffer, copying it first to the tree's source arena. +inline Tree parse_in_arena(csubstr filename, csubstr yaml ) { Parser np; return np.parse_in_arena(filename, yaml); } //!< parse a read-only YAML source buffer, copying it first to the tree's source arena, providing a filename for error messages. +inline void parse_in_arena( csubstr yaml, Tree *t ) { Parser np; np.parse_in_arena({} , yaml, t); } //!< reusing the YAML tree, parse a read-only YAML source buffer, copying it first to the tree's source arena. +inline void parse_in_arena(csubstr filename, csubstr yaml, Tree *t ) { Parser np; np.parse_in_arena(filename, yaml, t); } //!< reusing the YAML tree, parse a read-only YAML source buffer, copying it first to the tree's source arena, providing a filename for error messages. +inline void parse_in_arena( csubstr yaml, Tree *t, size_t node_id) { Parser np; np.parse_in_arena({} , yaml, t, node_id); } //!< reusing the YAML tree, parse a read-only YAML source buffer, copying it first to the tree's source arena. +inline void parse_in_arena(csubstr filename, csubstr yaml, Tree *t, size_t node_id) { Parser np; np.parse_in_arena(filename, yaml, t, node_id); } //!< reusing the YAML tree, parse a read-only YAML source buffer, copying it first to the tree's source arena, providing a filename for error messages. +inline void parse_in_arena( csubstr yaml, NodeRef node ) { Parser np; np.parse_in_arena({} , yaml, node); } //!< reusing the YAML tree, parse a read-only YAML source buffer, copying it first to the tree's source arena. +inline void parse_in_arena(csubstr filename, csubstr yaml, NodeRef node ) { Parser np; np.parse_in_arena(filename, yaml, node); } //!< reusing the YAML tree, parse a read-only YAML source buffer, copying it first to the tree's source arena, providing a filename for error messages. + +RYML_DEPRECATED("use parse_in_arena() instead") inline Tree parse( csubstr yaml ) { Parser np; return np.parse_in_arena({} , yaml); } //!< parse a read-only YAML source buffer, copying it first to the tree's source arena. +RYML_DEPRECATED("use parse_in_arena() instead") inline Tree parse(csubstr filename, csubstr yaml ) { Parser np; return np.parse_in_arena(filename, yaml); } //!< parse a read-only YAML source buffer, copying it first to the tree's source arena, providing a filename for error messages. 
+RYML_DEPRECATED("use parse_in_arena() instead") inline void parse( csubstr yaml, Tree *t ) { Parser np; np.parse_in_arena({} , yaml, t); } //!< reusing the YAML tree, parse a read-only YAML source buffer, copying it first to the tree's source arena. +RYML_DEPRECATED("use parse_in_arena() instead") inline void parse(csubstr filename, csubstr yaml, Tree *t ) { Parser np; np.parse_in_arena(filename, yaml, t); } //!< reusing the YAML tree, parse a read-only YAML source buffer, copying it first to the tree's source arena, providing a filename for error messages. +RYML_DEPRECATED("use parse_in_arena() instead") inline void parse( csubstr yaml, Tree *t, size_t node_id) { Parser np; np.parse_in_arena({} , yaml, t, node_id); } //!< reusing the YAML tree, parse a read-only YAML source buffer, copying it first to the tree's source arena. +RYML_DEPRECATED("use parse_in_arena() instead") inline void parse(csubstr filename, csubstr yaml, Tree *t, size_t node_id) { Parser np; np.parse_in_arena(filename, yaml, t, node_id); } //!< reusing the YAML tree, parse a read-only YAML source buffer, copying it first to the tree's source arena, providing a filename for error messages. +RYML_DEPRECATED("use parse_in_arena() instead") inline void parse( csubstr yaml, NodeRef node ) { Parser np; np.parse_in_arena({} , yaml, node); } //!< reusing the YAML tree, parse a read-only YAML source buffer, copying it first to the tree's source arena. +RYML_DEPRECATED("use parse_in_arena() instead") inline void parse(csubstr filename, csubstr yaml, NodeRef node ) { Parser np; np.parse_in_arena(filename, yaml, node); } //!< reusing the YAML tree, parse a read-only YAML source buffer, copying it first to the tree's source arena, providing a filename for error messages. + +/** @} */ + +} // namespace yml +} // namespace c4 + +#if defined(_MSC_VER) +# pragma warning(pop) +#endif + +#endif /* _C4_YML_PARSE_HPP_ */ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/parse.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/std/map.hpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/std/map.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_YML_STD_MAP_HPP_ +#define _C4_YML_STD_MAP_HPP_ + +/** @file map.hpp write/read std::map to/from a YAML tree. */ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/node.hpp +//#include "c4/yml/node.hpp" +#if !defined(C4_YML_NODE_HPP_) && !defined(_C4_YML_NODE_HPP_) +#error "amalgamate: file c4/yml/node.hpp must have been included at this point" +#endif /* C4_YML_NODE_HPP_ */ + +#include + +namespace c4 { +namespace yml { + +// std::map requires child nodes in the data +// tree hierarchy (a MAP node in ryml parlance). +// So it should be serialized via write()/read(). 
+ +template +void write(c4::yml::NodeRef *n, std::map const& m) +{ + *n |= c4::yml::MAP; + for(auto const& C4_RESTRICT p : m) + { + auto ch = n->append_child(); + ch << c4::yml::key(p.first); + ch << p.second; + } +} + +template +bool read(c4::yml::ConstNodeRef const& n, std::map * m) +{ + K k{}; + V v{}; + for(auto const& C4_RESTRICT ch : n) + { + ch >> c4::yml::key(k); + ch >> v; + m->emplace(std::make_pair(std::move(k), std::move(v))); + } + return true; +} + +} // namespace yml +} // namespace c4 + +#endif // _C4_YML_STD_MAP_HPP_ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/std/map.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/std/string.hpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/std/string.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef C4_YML_STD_STRING_HPP_ +#define C4_YML_STD_STRING_HPP_ + +/** @file string.hpp substring conversions for/from std::string */ + +// everything we need is implemented here: +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/std/string.hpp +//#include +#if !defined(C4_STD_STRING_HPP_) && !defined(_C4_STD_STRING_HPP_) +#error "amalgamate: file c4/std/string.hpp must have been included at this point" +#endif /* C4_STD_STRING_HPP_ */ + + +#endif // C4_YML_STD_STRING_HPP_ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/std/string.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/std/vector.hpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/std/vector.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_YML_STD_VECTOR_HPP_ +#define _C4_YML_STD_VECTOR_HPP_ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/node.hpp +//#include "c4/yml/node.hpp" +#if !defined(C4_YML_NODE_HPP_) && !defined(_C4_YML_NODE_HPP_) +#error "amalgamate: file c4/yml/node.hpp must have been included at this point" +#endif /* C4_YML_NODE_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/std/vector.hpp +//#include +#if !defined(C4_STD_VECTOR_HPP_) && !defined(_C4_STD_VECTOR_HPP_) +#error "amalgamate: file c4/std/vector.hpp must have been included at this point" +#endif /* C4_STD_VECTOR_HPP_ */ + +//included above: +//#include + +namespace c4 { +namespace yml { + +// vector is a sequence-like type, and it requires child nodes +// in the data tree hierarchy (a SEQ node in ryml parlance). +// So it should be serialized via write()/read(). + + +template +void write(c4::yml::NodeRef *n, std::vector const& vec) +{ + *n |= c4::yml::SEQ; + for(auto const& v : vec) + n->append_child() << v; +} + +template +bool read(c4::yml::ConstNodeRef const& n, std::vector *vec) +{ + vec->resize(n.num_children()); + size_t pos = 0; + for(auto const ch : n) + ch >> (*vec)[pos++]; + return true; +} + +/** specialization: std::vector uses std::vector::reference as + * the return value of its operator[]. 
*/ +template +bool read(c4::yml::ConstNodeRef const& n, std::vector *vec) +{ + vec->resize(n.num_children()); + size_t pos = 0; + bool tmp; + for(auto const ch : n) + { + ch >> tmp; + (*vec)[pos++] = tmp; + } + return true; +} + +} // namespace yml +} // namespace c4 + +#endif // _C4_YML_STD_VECTOR_HPP_ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/std/vector.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/std/std.hpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/std/std.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_YML_STD_STD_HPP_ +#define _C4_YML_STD_STD_HPP_ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/std/string.hpp +//#include "c4/yml/std/string.hpp" +#if !defined(C4_YML_STD_STRING_HPP_) && !defined(_C4_YML_STD_STRING_HPP_) +#error "amalgamate: file c4/yml/std/string.hpp must have been included at this point" +#endif /* C4_YML_STD_STRING_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/std/vector.hpp +//#include "c4/yml/std/vector.hpp" +#if !defined(C4_YML_STD_VECTOR_HPP_) && !defined(_C4_YML_STD_VECTOR_HPP_) +#error "amalgamate: file c4/yml/std/vector.hpp must have been included at this point" +#endif /* C4_YML_STD_VECTOR_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/std/map.hpp +//#include "c4/yml/std/map.hpp" +#if !defined(C4_YML_STD_MAP_HPP_) && !defined(_C4_YML_STD_MAP_HPP_) +#error "amalgamate: file c4/yml/std/map.hpp must have been included at this point" +#endif /* C4_YML_STD_MAP_HPP_ */ + + +#endif // _C4_YML_STD_STD_HPP_ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/std/std.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/common.cpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/common.cpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifdef RYML_SINGLE_HDR_DEFINE_NOW +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/common.hpp +//#include "c4/yml/common.hpp" +#if !defined(C4_YML_COMMON_HPP_) && !defined(_C4_YML_COMMON_HPP_) +#error "amalgamate: file c4/yml/common.hpp must have been included at this point" +#endif /* C4_YML_COMMON_HPP_ */ + + +#ifndef RYML_NO_DEFAULT_CALLBACKS +//included above: +//# include +//included above: +//# include +#endif // RYML_NO_DEFAULT_CALLBACKS + +namespace c4 { +namespace yml { + +namespace { +Callbacks s_default_callbacks; +} // anon namespace + +#ifndef RYML_NO_DEFAULT_CALLBACKS +void report_error_impl(const char* msg, size_t length, Location loc, FILE *f) +{ + if(!f) + f = stderr; + if(loc) + { + if(!loc.name.empty()) + { + fwrite(loc.name.str, 1, loc.name.len, f); + fputc(':', f); + } + fprintf(f, "%zu:", loc.line); + if(loc.col) + fprintf(f, "%zu:", loc.col); + if(loc.offset) + fprintf(f, " (%zuB):", loc.offset); + } + fprintf(f, "%.*s\n", (int)length, msg); + fflush(f); +} + +void error_impl(const char* msg, size_t length, Location loc, void * /*user_data*/) +{ + 
report_error_impl(msg, length, loc, nullptr); + ::abort(); +} + +void* allocate_impl(size_t length, void * /*hint*/, void * /*user_data*/) +{ + void *mem = ::malloc(length); + if(mem == nullptr) + { + const char msg[] = "could not allocate memory"; + error_impl(msg, sizeof(msg)-1, {}, nullptr); + } + return mem; +} + +void free_impl(void *mem, size_t /*length*/, void * /*user_data*/) +{ + ::free(mem); +} +#endif // RYML_NO_DEFAULT_CALLBACKS + + + +Callbacks::Callbacks() + : + m_user_data(nullptr), + #ifndef RYML_NO_DEFAULT_CALLBACKS + m_allocate(allocate_impl), + m_free(free_impl), + m_error(error_impl) + #else + m_allocate(nullptr), + m_free(nullptr), + m_error(nullptr) + #endif +{ +} + +Callbacks::Callbacks(void *user_data, pfn_allocate alloc_, pfn_free free_, pfn_error error_) + : + m_user_data(user_data), + #ifndef RYML_NO_DEFAULT_CALLBACKS + m_allocate(alloc_ ? alloc_ : allocate_impl), + m_free(free_ ? free_ : free_impl), + m_error(error_ ? error_ : error_impl) + #else + m_allocate(alloc_), + m_free(free_), + m_error(error_) + #endif +{ + C4_CHECK(m_allocate); + C4_CHECK(m_free); + C4_CHECK(m_error); +} + + +void set_callbacks(Callbacks const& c) +{ + s_default_callbacks = c; +} + +Callbacks const& get_callbacks() +{ + return s_default_callbacks; +} + +void reset_callbacks() +{ + set_callbacks(Callbacks()); +} + +void error(const char *msg, size_t msg_len, Location loc) +{ + s_default_callbacks.m_error(msg, msg_len, loc, s_default_callbacks.m_user_data); +} + +} // namespace yml +} // namespace c4 + +#endif /* RYML_SINGLE_HDR_DEFINE_NOW */ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/common.cpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/tree.cpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/tree.cpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifdef RYML_SINGLE_HDR_DEFINE_NOW +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/tree.hpp +//#include "c4/yml/tree.hpp" +#if !defined(C4_YML_TREE_HPP_) && !defined(_C4_YML_TREE_HPP_) +#error "amalgamate: file c4/yml/tree.hpp must have been included at this point" +#endif /* C4_YML_TREE_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/detail/parser_dbg.hpp +//#include "c4/yml/detail/parser_dbg.hpp" +#if !defined(C4_YML_DETAIL_PARSER_DBG_HPP_) && !defined(_C4_YML_DETAIL_PARSER_DBG_HPP_) +#error "amalgamate: file c4/yml/detail/parser_dbg.hpp must have been included at this point" +#endif /* C4_YML_DETAIL_PARSER_DBG_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/node.hpp +//#include "c4/yml/node.hpp" +#if !defined(C4_YML_NODE_HPP_) && !defined(_C4_YML_NODE_HPP_) +#error "amalgamate: file c4/yml/node.hpp must have been included at this point" +#endif /* C4_YML_NODE_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/detail/stack.hpp +//#include "c4/yml/detail/stack.hpp" +#if !defined(C4_YML_DETAIL_STACK_HPP_) && !defined(_C4_YML_DETAIL_STACK_HPP_) +#error "amalgamate: file c4/yml/detail/stack.hpp must have been included at this point" +#endif /* C4_YML_DETAIL_STACK_HPP_ */ + + + +C4_SUPPRESS_WARNING_GCC_WITH_PUSH("-Wtype-limits") +C4_SUPPRESS_WARNING_MSVC_WITH_PUSH(4296/*expression 
is always 'boolean_value'*/) + +namespace c4 { +namespace yml { + + +csubstr normalize_tag(csubstr tag) +{ + YamlTag_e t = to_tag(tag); + if(t != TAG_NONE) + return from_tag(t); + if(tag.begins_with("!<")) + tag = tag.sub(1); + if(tag.begins_with(""}; + case TAG_OMAP: + return {""}; + case TAG_PAIRS: + return {""}; + case TAG_SET: + return {""}; + case TAG_SEQ: + return {""}; + case TAG_BINARY: + return {""}; + case TAG_BOOL: + return {""}; + case TAG_FLOAT: + return {""}; + case TAG_INT: + return {""}; + case TAG_MERGE: + return {""}; + case TAG_NULL: + return {""}; + case TAG_STR: + return {""}; + case TAG_TIMESTAMP: + return {""}; + case TAG_VALUE: + return {""}; + case TAG_YAML: + return {""}; + case TAG_NONE: + return {""}; + } + return {""}; +} + +csubstr from_tag(YamlTag_e tag) +{ + switch(tag) + { + case TAG_MAP: + return {"!!map"}; + case TAG_OMAP: + return {"!!omap"}; + case TAG_PAIRS: + return {"!!pairs"}; + case TAG_SET: + return {"!!set"}; + case TAG_SEQ: + return {"!!seq"}; + case TAG_BINARY: + return {"!!binary"}; + case TAG_BOOL: + return {"!!bool"}; + case TAG_FLOAT: + return {"!!float"}; + case TAG_INT: + return {"!!int"}; + case TAG_MERGE: + return {"!!merge"}; + case TAG_NULL: + return {"!!null"}; + case TAG_STR: + return {"!!str"}; + case TAG_TIMESTAMP: + return {"!!timestamp"}; + case TAG_VALUE: + return {"!!value"}; + case TAG_YAML: + return {"!!yaml"}; + case TAG_NONE: + return {""}; + } + return {""}; +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +const char* NodeType::type_str(NodeType_e ty) +{ + switch(ty & _TYMASK) + { + case KEYVAL: + return "KEYVAL"; + case KEY: + return "KEY"; + case VAL: + return "VAL"; + case MAP: + return "MAP"; + case SEQ: + return "SEQ"; + case KEYMAP: + return "KEYMAP"; + case KEYSEQ: + return "KEYSEQ"; + case DOCSEQ: + return "DOCSEQ"; + case DOCMAP: + return "DOCMAP"; + case DOCVAL: + return "DOCVAL"; + case DOC: + return "DOC"; + case STREAM: + return "STREAM"; + case NOTYPE: + return "NOTYPE"; + default: + if((ty & KEYVAL) == KEYVAL) + return "KEYVAL***"; + if((ty & KEYMAP) == KEYMAP) + return "KEYMAP***"; + if((ty & KEYSEQ) == KEYSEQ) + return "KEYSEQ***"; + if((ty & DOCSEQ) == DOCSEQ) + return "DOCSEQ***"; + if((ty & DOCMAP) == DOCMAP) + return "DOCMAP***"; + if((ty & DOCVAL) == DOCVAL) + return "DOCVAL***"; + if(ty & KEY) + return "KEY***"; + if(ty & VAL) + return "VAL***"; + if(ty & MAP) + return "MAP***"; + if(ty & SEQ) + return "SEQ***"; + if(ty & DOC) + return "DOC***"; + return "(unk)"; + } +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +NodeRef Tree::rootref() +{ + return NodeRef(this, root_id()); +} +ConstNodeRef Tree::rootref() const +{ + return ConstNodeRef(this, root_id()); +} + +ConstNodeRef Tree::crootref() +{ + return ConstNodeRef(this, root_id()); +} +ConstNodeRef Tree::crootref() const +{ + return ConstNodeRef(this, root_id()); +} + +NodeRef Tree::ref(size_t id) +{ + _RYML_CB_ASSERT(m_callbacks, id != NONE && id >= 0 && id < m_size); + return NodeRef(this, id); +} +ConstNodeRef Tree::ref(size_t id) const +{ + _RYML_CB_ASSERT(m_callbacks, id != NONE && id >= 0 && id < m_size); + return ConstNodeRef(this, id); +} 
+ +ConstNodeRef Tree::cref(size_t id) +{ + _RYML_CB_ASSERT(m_callbacks, id != NONE && id >= 0 && id < m_size); + return ConstNodeRef(this, id); +} +ConstNodeRef Tree::cref(size_t id) const +{ + _RYML_CB_ASSERT(m_callbacks, id != NONE && id >= 0 && id < m_size); + return ConstNodeRef(this, id); +} + +NodeRef Tree::operator[] (csubstr key) +{ + return rootref()[key]; +} +ConstNodeRef Tree::operator[] (csubstr key) const +{ + return rootref()[key]; +} + +NodeRef Tree::operator[] (size_t i) +{ + return rootref()[i]; +} +ConstNodeRef Tree::operator[] (size_t i) const +{ + return rootref()[i]; +} + +NodeRef Tree::docref(size_t i) +{ + return ref(doc(i)); +} +ConstNodeRef Tree::docref(size_t i) const +{ + return cref(doc(i)); +} + + +//----------------------------------------------------------------------------- +Tree::Tree(Callbacks const& cb) + : m_buf(nullptr) + , m_cap(0) + , m_size(0) + , m_free_head(NONE) + , m_free_tail(NONE) + , m_arena() + , m_arena_pos(0) + , m_callbacks(cb) +{ +} + +Tree::Tree(size_t node_capacity, size_t arena_capacity, Callbacks const& cb) + : Tree(cb) +{ + reserve(node_capacity); + reserve_arena(arena_capacity); +} + +Tree::~Tree() +{ + _free(); +} + + +Tree::Tree(Tree const& that) noexcept : Tree(that.m_callbacks) +{ + _copy(that); +} + +Tree& Tree::operator= (Tree const& that) noexcept +{ + _free(); + m_callbacks = that.m_callbacks; + _copy(that); + return *this; +} + +Tree::Tree(Tree && that) noexcept : Tree(that.m_callbacks) +{ + _move(that); +} + +Tree& Tree::operator= (Tree && that) noexcept +{ + _free(); + m_callbacks = that.m_callbacks; + _move(that); + return *this; +} + +void Tree::_free() +{ + if(m_buf) + { + _RYML_CB_ASSERT(m_callbacks, m_cap > 0); + _RYML_CB_FREE(m_callbacks, m_buf, NodeData, m_cap); + } + if(m_arena.str) + { + _RYML_CB_ASSERT(m_callbacks, m_arena.len > 0); + _RYML_CB_FREE(m_callbacks, m_arena.str, char, m_arena.len); + } + _clear(); +} + + +C4_SUPPRESS_WARNING_GCC_PUSH +#if defined(__GNUC__) && __GNUC__>= 8 + C4_SUPPRESS_WARNING_GCC_WITH_PUSH("-Wclass-memaccess") // error: ‘void* memset(void*, int, size_t)’ clearing an object of type ‘class c4::yml::Tree’ with no trivial copy-assignment; use assignment or value-initialization instead +#endif + +void Tree::_clear() +{ + m_buf = nullptr; + m_cap = 0; + m_size = 0; + m_free_head = 0; + m_free_tail = 0; + m_arena = {}; + m_arena_pos = 0; + for(size_t i = 0; i < RYML_MAX_TAG_DIRECTIVES; ++i) + m_tag_directives[i] = {}; +} + +void Tree::_copy(Tree const& that) +{ + _RYML_CB_ASSERT(m_callbacks, m_buf == nullptr); + _RYML_CB_ASSERT(m_callbacks, m_arena.str == nullptr); + _RYML_CB_ASSERT(m_callbacks, m_arena.len == 0); + m_buf = _RYML_CB_ALLOC_HINT(m_callbacks, NodeData, that.m_cap, that.m_buf); + memcpy(m_buf, that.m_buf, that.m_cap * sizeof(NodeData)); + m_cap = that.m_cap; + m_size = that.m_size; + m_free_head = that.m_free_head; + m_free_tail = that.m_free_tail; + m_arena_pos = that.m_arena_pos; + m_arena = that.m_arena; + if(that.m_arena.str) + { + _RYML_CB_ASSERT(m_callbacks, that.m_arena.len > 0); + substr arena; + arena.str = _RYML_CB_ALLOC_HINT(m_callbacks, char, that.m_arena.len, that.m_arena.str); + arena.len = that.m_arena.len; + _relocate(arena); // does a memcpy of the arena and updates nodes using the old arena + m_arena = arena; + } + for(size_t i = 0; i < RYML_MAX_TAG_DIRECTIVES; ++i) + m_tag_directives[i] = that.m_tag_directives[i]; +} + +void Tree::_move(Tree & that) +{ + _RYML_CB_ASSERT(m_callbacks, m_buf == nullptr); + _RYML_CB_ASSERT(m_callbacks, m_arena.str == nullptr); + 
_RYML_CB_ASSERT(m_callbacks, m_arena.len == 0); + m_buf = that.m_buf; + m_cap = that.m_cap; + m_size = that.m_size; + m_free_head = that.m_free_head; + m_free_tail = that.m_free_tail; + m_arena = that.m_arena; + m_arena_pos = that.m_arena_pos; + for(size_t i = 0; i < RYML_MAX_TAG_DIRECTIVES; ++i) + m_tag_directives[i] = that.m_tag_directives[i]; + that._clear(); +} + +void Tree::_relocate(substr next_arena) +{ + _RYML_CB_ASSERT(m_callbacks, next_arena.not_empty()); + _RYML_CB_ASSERT(m_callbacks, next_arena.len >= m_arena.len); + memcpy(next_arena.str, m_arena.str, m_arena_pos); + for(NodeData *C4_RESTRICT n = m_buf, *e = m_buf + m_cap; n != e; ++n) + { + if(in_arena(n->m_key.scalar)) + n->m_key.scalar = _relocated(n->m_key.scalar, next_arena); + if(in_arena(n->m_key.tag)) + n->m_key.tag = _relocated(n->m_key.tag, next_arena); + if(in_arena(n->m_key.anchor)) + n->m_key.anchor = _relocated(n->m_key.anchor, next_arena); + if(in_arena(n->m_val.scalar)) + n->m_val.scalar = _relocated(n->m_val.scalar, next_arena); + if(in_arena(n->m_val.tag)) + n->m_val.tag = _relocated(n->m_val.tag, next_arena); + if(in_arena(n->m_val.anchor)) + n->m_val.anchor = _relocated(n->m_val.anchor, next_arena); + } + for(TagDirective &C4_RESTRICT td : m_tag_directives) + { + if(in_arena(td.prefix)) + td.prefix = _relocated(td.prefix, next_arena); + if(in_arena(td.handle)) + td.handle = _relocated(td.handle, next_arena); + } +} + + +//----------------------------------------------------------------------------- +void Tree::reserve(size_t cap) +{ + if(cap > m_cap) + { + NodeData *buf = _RYML_CB_ALLOC_HINT(m_callbacks, NodeData, cap, m_buf); + if(m_buf) + { + memcpy(buf, m_buf, m_cap * sizeof(NodeData)); + _RYML_CB_FREE(m_callbacks, m_buf, NodeData, m_cap); + } + size_t first = m_cap, del = cap - m_cap; + m_cap = cap; + m_buf = buf; + _clear_range(first, del); + if(m_free_head != NONE) + { + _RYML_CB_ASSERT(m_callbacks, m_buf != nullptr); + _RYML_CB_ASSERT(m_callbacks, m_free_tail != NONE); + m_buf[m_free_tail].m_next_sibling = first; + m_buf[first].m_prev_sibling = m_free_tail; + m_free_tail = cap-1; + } + else + { + _RYML_CB_ASSERT(m_callbacks, m_free_tail == NONE); + m_free_head = first; + m_free_tail = cap-1; + } + _RYML_CB_ASSERT(m_callbacks, m_free_head == NONE || (m_free_head >= 0 && m_free_head < cap)); + _RYML_CB_ASSERT(m_callbacks, m_free_tail == NONE || (m_free_tail >= 0 && m_free_tail < cap)); + + if( ! 
m_size) + _claim_root(); + } +} + + +//----------------------------------------------------------------------------- +void Tree::clear() +{ + _clear_range(0, m_cap); + m_size = 0; + if(m_buf) + { + _RYML_CB_ASSERT(m_callbacks, m_cap >= 0); + m_free_head = 0; + m_free_tail = m_cap-1; + _claim_root(); + } + else + { + m_free_head = NONE; + m_free_tail = NONE; + } + for(size_t i = 0; i < RYML_MAX_TAG_DIRECTIVES; ++i) + m_tag_directives[i] = {}; +} + +void Tree::_claim_root() +{ + size_t r = _claim(); + _RYML_CB_ASSERT(m_callbacks, r == 0); + _set_hierarchy(r, NONE, NONE); +} + + +//----------------------------------------------------------------------------- +void Tree::_clear_range(size_t first, size_t num) +{ + if(num == 0) + return; // prevent overflow when subtracting + _RYML_CB_ASSERT(m_callbacks, first >= 0 && first + num <= m_cap); + memset(m_buf + first, 0, num * sizeof(NodeData)); // TODO we should not need this + for(size_t i = first, e = first + num; i < e; ++i) + { + _clear(i); + NodeData *n = m_buf + i; + n->m_prev_sibling = i - 1; + n->m_next_sibling = i + 1; + } + m_buf[first + num - 1].m_next_sibling = NONE; +} + +C4_SUPPRESS_WARNING_GCC_POP + + +//----------------------------------------------------------------------------- +void Tree::_release(size_t i) +{ + _RYML_CB_ASSERT(m_callbacks, i >= 0 && i < m_cap); + + _rem_hierarchy(i); + _free_list_add(i); + _clear(i); + + --m_size; +} + +//----------------------------------------------------------------------------- +// add to the front of the free list +void Tree::_free_list_add(size_t i) +{ + _RYML_CB_ASSERT(m_callbacks, i >= 0 && i < m_cap); + NodeData &C4_RESTRICT w = m_buf[i]; + + w.m_parent = NONE; + w.m_next_sibling = m_free_head; + w.m_prev_sibling = NONE; + if(m_free_head != NONE) + m_buf[m_free_head].m_prev_sibling = i; + m_free_head = i; + if(m_free_tail == NONE) + m_free_tail = m_free_head; +} + +void Tree::_free_list_rem(size_t i) +{ + if(m_free_head == i) + m_free_head = _p(i)->m_next_sibling; + _rem_hierarchy(i); +} + +//----------------------------------------------------------------------------- +size_t Tree::_claim() +{ + if(m_free_head == NONE || m_buf == nullptr) + { + size_t sz = 2 * m_cap; + sz = sz ? 
sz : 16; + reserve(sz); + _RYML_CB_ASSERT(m_callbacks, m_free_head != NONE); + } + + _RYML_CB_ASSERT(m_callbacks, m_size < m_cap); + _RYML_CB_ASSERT(m_callbacks, m_free_head >= 0 && m_free_head < m_cap); + + size_t ichild = m_free_head; + NodeData *child = m_buf + ichild; + + ++m_size; + m_free_head = child->m_next_sibling; + if(m_free_head == NONE) + { + m_free_tail = NONE; + _RYML_CB_ASSERT(m_callbacks, m_size == m_cap); + } + + _clear(ichild); + + return ichild; +} + +//----------------------------------------------------------------------------- + +C4_SUPPRESS_WARNING_GCC_PUSH +C4_SUPPRESS_WARNING_CLANG_PUSH +C4_SUPPRESS_WARNING_CLANG("-Wnull-dereference") +#if defined(__GNUC__) && (__GNUC__ >= 6) +C4_SUPPRESS_WARNING_GCC("-Wnull-dereference") +#endif + +void Tree::_set_hierarchy(size_t ichild, size_t iparent, size_t iprev_sibling) +{ + _RYML_CB_ASSERT(m_callbacks, iparent == NONE || (iparent >= 0 && iparent < m_cap)); + _RYML_CB_ASSERT(m_callbacks, iprev_sibling == NONE || (iprev_sibling >= 0 && iprev_sibling < m_cap)); + + NodeData *C4_RESTRICT child = get(ichild); + + child->m_parent = iparent; + child->m_prev_sibling = NONE; + child->m_next_sibling = NONE; + + if(iparent == NONE) + { + _RYML_CB_ASSERT(m_callbacks, ichild == 0); + _RYML_CB_ASSERT(m_callbacks, iprev_sibling == NONE); + } + + if(iparent == NONE) + return; + + size_t inext_sibling = iprev_sibling != NONE ? next_sibling(iprev_sibling) : first_child(iparent); + NodeData *C4_RESTRICT parent = get(iparent); + NodeData *C4_RESTRICT psib = get(iprev_sibling); + NodeData *C4_RESTRICT nsib = get(inext_sibling); + + if(psib) + { + _RYML_CB_ASSERT(m_callbacks, next_sibling(iprev_sibling) == id(nsib)); + child->m_prev_sibling = id(psib); + psib->m_next_sibling = id(child); + _RYML_CB_ASSERT(m_callbacks, psib->m_prev_sibling != psib->m_next_sibling || psib->m_prev_sibling == NONE); + } + + if(nsib) + { + _RYML_CB_ASSERT(m_callbacks, prev_sibling(inext_sibling) == id(psib)); + child->m_next_sibling = id(nsib); + nsib->m_prev_sibling = id(child); + _RYML_CB_ASSERT(m_callbacks, nsib->m_prev_sibling != nsib->m_next_sibling || nsib->m_prev_sibling == NONE); + } + + if(parent->m_first_child == NONE) + { + _RYML_CB_ASSERT(m_callbacks, parent->m_last_child == NONE); + parent->m_first_child = id(child); + parent->m_last_child = id(child); + } + else + { + if(child->m_next_sibling == parent->m_first_child) + parent->m_first_child = id(child); + + if(child->m_prev_sibling == parent->m_last_child) + parent->m_last_child = id(child); + } +} + +C4_SUPPRESS_WARNING_GCC_POP +C4_SUPPRESS_WARNING_CLANG_POP + + +//----------------------------------------------------------------------------- +void Tree::_rem_hierarchy(size_t i) +{ + _RYML_CB_ASSERT(m_callbacks, i >= 0 && i < m_cap); + + NodeData &C4_RESTRICT w = m_buf[i]; + + // remove from the parent + if(w.m_parent != NONE) + { + NodeData &C4_RESTRICT p = m_buf[w.m_parent]; + if(p.m_first_child == i) + { + p.m_first_child = w.m_next_sibling; + } + if(p.m_last_child == i) + { + p.m_last_child = w.m_prev_sibling; + } + } + + // remove from the used list + if(w.m_prev_sibling != NONE) + { + NodeData *C4_RESTRICT prev = get(w.m_prev_sibling); + prev->m_next_sibling = w.m_next_sibling; + } + if(w.m_next_sibling != NONE) + { + NodeData *C4_RESTRICT next = get(w.m_next_sibling); + next->m_prev_sibling = w.m_prev_sibling; + } +} + +//----------------------------------------------------------------------------- +void Tree::reorder() +{ + size_t r = root_id(); + _do_reorder(&r, 0); +} + 
+//----------------------------------------------------------------------------- +size_t Tree::_do_reorder(size_t *node, size_t count) +{ + // swap this node if it's not in place + if(*node != count) + { + _swap(*node, count); + *node = count; + } + ++count; // bump the count from this node + + // now descend in the hierarchy + for(size_t i = first_child(*node); i != NONE; i = next_sibling(i)) + { + // this child may have been relocated to a different index, + // so get an updated version + count = _do_reorder(&i, count); + } + return count; +} + +//----------------------------------------------------------------------------- +void Tree::_swap(size_t n_, size_t m_) +{ + _RYML_CB_ASSERT(m_callbacks, (parent(n_) != NONE) || type(n_) == NOTYPE); + _RYML_CB_ASSERT(m_callbacks, (parent(m_) != NONE) || type(m_) == NOTYPE); + NodeType tn = type(n_); + NodeType tm = type(m_); + if(tn != NOTYPE && tm != NOTYPE) + { + _swap_props(n_, m_); + _swap_hierarchy(n_, m_); + } + else if(tn == NOTYPE && tm != NOTYPE) + { + _copy_props(n_, m_); + _free_list_rem(n_); + _copy_hierarchy(n_, m_); + _clear(m_); + _free_list_add(m_); + } + else if(tn != NOTYPE && tm == NOTYPE) + { + _copy_props(m_, n_); + _free_list_rem(m_); + _copy_hierarchy(m_, n_); + _clear(n_); + _free_list_add(n_); + } + else + { + C4_NEVER_REACH(); + } +} + +//----------------------------------------------------------------------------- +void Tree::_swap_hierarchy(size_t ia, size_t ib) +{ + if(ia == ib) return; + + for(size_t i = first_child(ia); i != NONE; i = next_sibling(i)) + { + if(i == ib || i == ia) + continue; + _p(i)->m_parent = ib; + } + + for(size_t i = first_child(ib); i != NONE; i = next_sibling(i)) + { + if(i == ib || i == ia) + continue; + _p(i)->m_parent = ia; + } + + auto & C4_RESTRICT a = *_p(ia); + auto & C4_RESTRICT b = *_p(ib); + auto & C4_RESTRICT pa = *_p(a.m_parent); + auto & C4_RESTRICT pb = *_p(b.m_parent); + + if(&pa == &pb) + { + if((pa.m_first_child == ib && pa.m_last_child == ia) + || + (pa.m_first_child == ia && pa.m_last_child == ib)) + { + std::swap(pa.m_first_child, pa.m_last_child); + } + else + { + bool changed = false; + if(pa.m_first_child == ia) + { + pa.m_first_child = ib; + changed = true; + } + if(pa.m_last_child == ia) + { + pa.m_last_child = ib; + changed = true; + } + if(pb.m_first_child == ib && !changed) + { + pb.m_first_child = ia; + } + if(pb.m_last_child == ib && !changed) + { + pb.m_last_child = ia; + } + } + } + else + { + if(pa.m_first_child == ia) + pa.m_first_child = ib; + if(pa.m_last_child == ia) + pa.m_last_child = ib; + if(pb.m_first_child == ib) + pb.m_first_child = ia; + if(pb.m_last_child == ib) + pb.m_last_child = ia; + } + std::swap(a.m_first_child , b.m_first_child); + std::swap(a.m_last_child , b.m_last_child); + + if(a.m_prev_sibling != ib && b.m_prev_sibling != ia && + a.m_next_sibling != ib && b.m_next_sibling != ia) + { + if(a.m_prev_sibling != NONE && a.m_prev_sibling != ib) + _p(a.m_prev_sibling)->m_next_sibling = ib; + if(a.m_next_sibling != NONE && a.m_next_sibling != ib) + _p(a.m_next_sibling)->m_prev_sibling = ib; + if(b.m_prev_sibling != NONE && b.m_prev_sibling != ia) + _p(b.m_prev_sibling)->m_next_sibling = ia; + if(b.m_next_sibling != NONE && b.m_next_sibling != ia) + _p(b.m_next_sibling)->m_prev_sibling = ia; + std::swap(a.m_prev_sibling, b.m_prev_sibling); + std::swap(a.m_next_sibling, b.m_next_sibling); + } + else + { + if(a.m_next_sibling == ib) // n will go after m + { + _RYML_CB_ASSERT(m_callbacks, b.m_prev_sibling == ia); + if(a.m_prev_sibling != NONE) + { 
+ _RYML_CB_ASSERT(m_callbacks, a.m_prev_sibling != ib); + _p(a.m_prev_sibling)->m_next_sibling = ib; + } + if(b.m_next_sibling != NONE) + { + _RYML_CB_ASSERT(m_callbacks, b.m_next_sibling != ia); + _p(b.m_next_sibling)->m_prev_sibling = ia; + } + size_t ns = b.m_next_sibling; + b.m_prev_sibling = a.m_prev_sibling; + b.m_next_sibling = ia; + a.m_prev_sibling = ib; + a.m_next_sibling = ns; + } + else if(a.m_prev_sibling == ib) // m will go after n + { + _RYML_CB_ASSERT(m_callbacks, b.m_next_sibling == ia); + if(b.m_prev_sibling != NONE) + { + _RYML_CB_ASSERT(m_callbacks, b.m_prev_sibling != ia); + _p(b.m_prev_sibling)->m_next_sibling = ia; + } + if(a.m_next_sibling != NONE) + { + _RYML_CB_ASSERT(m_callbacks, a.m_next_sibling != ib); + _p(a.m_next_sibling)->m_prev_sibling = ib; + } + size_t ns = b.m_prev_sibling; + a.m_prev_sibling = b.m_prev_sibling; + a.m_next_sibling = ib; + b.m_prev_sibling = ia; + b.m_next_sibling = ns; + } + else + { + C4_NEVER_REACH(); + } + } + _RYML_CB_ASSERT(m_callbacks, a.m_next_sibling != ia); + _RYML_CB_ASSERT(m_callbacks, a.m_prev_sibling != ia); + _RYML_CB_ASSERT(m_callbacks, b.m_next_sibling != ib); + _RYML_CB_ASSERT(m_callbacks, b.m_prev_sibling != ib); + + if(a.m_parent != ib && b.m_parent != ia) + { + std::swap(a.m_parent, b.m_parent); + } + else + { + if(a.m_parent == ib && b.m_parent != ia) + { + a.m_parent = b.m_parent; + b.m_parent = ia; + } + else if(a.m_parent != ib && b.m_parent == ia) + { + b.m_parent = a.m_parent; + a.m_parent = ib; + } + else + { + C4_NEVER_REACH(); + } + } +} + +//----------------------------------------------------------------------------- +void Tree::_copy_hierarchy(size_t dst_, size_t src_) +{ + auto const& C4_RESTRICT src = *_p(src_); + auto & C4_RESTRICT dst = *_p(dst_); + auto & C4_RESTRICT prt = *_p(src.m_parent); + for(size_t i = src.m_first_child; i != NONE; i = next_sibling(i)) + { + _p(i)->m_parent = dst_; + } + if(src.m_prev_sibling != NONE) + { + _p(src.m_prev_sibling)->m_next_sibling = dst_; + } + if(src.m_next_sibling != NONE) + { + _p(src.m_next_sibling)->m_prev_sibling = dst_; + } + if(prt.m_first_child == src_) + { + prt.m_first_child = dst_; + } + if(prt.m_last_child == src_) + { + prt.m_last_child = dst_; + } + dst.m_parent = src.m_parent; + dst.m_first_child = src.m_first_child; + dst.m_last_child = src.m_last_child; + dst.m_prev_sibling = src.m_prev_sibling; + dst.m_next_sibling = src.m_next_sibling; +} + +//----------------------------------------------------------------------------- +void Tree::_swap_props(size_t n_, size_t m_) +{ + NodeData &C4_RESTRICT n = *_p(n_); + NodeData &C4_RESTRICT m = *_p(m_); + std::swap(n.m_type, m.m_type); + std::swap(n.m_key, m.m_key); + std::swap(n.m_val, m.m_val); +} + +//----------------------------------------------------------------------------- +void Tree::move(size_t node, size_t after) +{ + _RYML_CB_ASSERT(m_callbacks, node != NONE); + _RYML_CB_ASSERT(m_callbacks, node != after); + _RYML_CB_ASSERT(m_callbacks, ! 
is_root(node)); + _RYML_CB_ASSERT(m_callbacks, (after == NONE) || (has_sibling(node, after) && has_sibling(after, node))); + + _rem_hierarchy(node); + _set_hierarchy(node, parent(node), after); +} + +//----------------------------------------------------------------------------- + +void Tree::move(size_t node, size_t new_parent, size_t after) +{ + _RYML_CB_ASSERT(m_callbacks, node != NONE); + _RYML_CB_ASSERT(m_callbacks, node != after); + _RYML_CB_ASSERT(m_callbacks, new_parent != NONE); + _RYML_CB_ASSERT(m_callbacks, new_parent != node); + _RYML_CB_ASSERT(m_callbacks, new_parent != after); + _RYML_CB_ASSERT(m_callbacks, ! is_root(node)); + + _rem_hierarchy(node); + _set_hierarchy(node, new_parent, after); +} + +size_t Tree::move(Tree *src, size_t node, size_t new_parent, size_t after) +{ + _RYML_CB_ASSERT(m_callbacks, src != nullptr); + _RYML_CB_ASSERT(m_callbacks, node != NONE); + _RYML_CB_ASSERT(m_callbacks, new_parent != NONE); + _RYML_CB_ASSERT(m_callbacks, new_parent != after); + + size_t dup = duplicate(src, node, new_parent, after); + src->remove(node); + return dup; +} + +void Tree::set_root_as_stream() +{ + size_t root = root_id(); + if(is_stream(root)) + return; + // don't use _add_flags() because it's checked and will fail + if(!has_children(root)) + { + if(is_val(root)) + { + _p(root)->m_type.add(SEQ); + size_t next_doc = append_child(root); + _copy_props_wo_key(next_doc, root); + _p(next_doc)->m_type.add(DOC); + _p(next_doc)->m_type.rem(SEQ); + } + _p(root)->m_type = STREAM; + return; + } + _RYML_CB_ASSERT(m_callbacks, !has_key(root)); + size_t next_doc = append_child(root); + _copy_props_wo_key(next_doc, root); + _add_flags(next_doc, DOC); + for(size_t prev = NONE, ch = first_child(root), next = next_sibling(ch); ch != NONE; ) + { + if(ch == next_doc) + break; + move(ch, next_doc, prev); + prev = ch; + ch = next; + next = next_sibling(next); + } + _p(root)->m_type = STREAM; +} + + +//----------------------------------------------------------------------------- +void Tree::remove_children(size_t node) +{ + _RYML_CB_ASSERT(m_callbacks, get(node) != nullptr); + size_t ich = get(node)->m_first_child; + while(ich != NONE) + { + remove_children(ich); + _RYML_CB_ASSERT(m_callbacks, get(ich) != nullptr); + size_t next = get(ich)->m_next_sibling; + _release(ich); + if(ich == get(node)->m_last_child) + break; + ich = next; + } +} + +bool Tree::change_type(size_t node, NodeType type) +{ + _RYML_CB_ASSERT(m_callbacks, type.is_val() || type.is_map() || type.is_seq()); + _RYML_CB_ASSERT(m_callbacks, type.is_val() + type.is_map() + type.is_seq() == 1); + _RYML_CB_ASSERT(m_callbacks, type.has_key() == has_key(node) || (has_key(node) && !type.has_key())); + NodeData *d = _p(node); + if(type.is_map() && is_map(node)) + return false; + else if(type.is_seq() && is_seq(node)) + return false; + else if(type.is_val() && is_val(node)) + return false; + d->m_type = (d->m_type & (~(MAP|SEQ|VAL))) | type; + remove_children(node); + return true; +} + + +//----------------------------------------------------------------------------- +size_t Tree::duplicate(size_t node, size_t parent, size_t after) +{ + return duplicate(this, node, parent, after); +} + +size_t Tree::duplicate(Tree const* src, size_t node, size_t parent, size_t after) +{ + _RYML_CB_ASSERT(m_callbacks, src != nullptr); + _RYML_CB_ASSERT(m_callbacks, node != NONE); + _RYML_CB_ASSERT(m_callbacks, parent != NONE); + _RYML_CB_ASSERT(m_callbacks, ! 
src->is_root(node)); + + size_t copy = _claim(); + + _copy_props(copy, src, node); + _set_hierarchy(copy, parent, after); + duplicate_children(src, node, copy, NONE); + + return copy; +} + +//----------------------------------------------------------------------------- +size_t Tree::duplicate_children(size_t node, size_t parent, size_t after) +{ + return duplicate_children(this, node, parent, after); +} + +size_t Tree::duplicate_children(Tree const* src, size_t node, size_t parent, size_t after) +{ + _RYML_CB_ASSERT(m_callbacks, src != nullptr); + _RYML_CB_ASSERT(m_callbacks, node != NONE); + _RYML_CB_ASSERT(m_callbacks, parent != NONE); + _RYML_CB_ASSERT(m_callbacks, after == NONE || has_child(parent, after)); + + size_t prev = after; + for(size_t i = src->first_child(node); i != NONE; i = src->next_sibling(i)) + { + prev = duplicate(src, i, parent, prev); + } + + return prev; +} + +//----------------------------------------------------------------------------- +void Tree::duplicate_contents(size_t node, size_t where) +{ + duplicate_contents(this, node, where); +} + +void Tree::duplicate_contents(Tree const *src, size_t node, size_t where) +{ + _RYML_CB_ASSERT(m_callbacks, src != nullptr); + _RYML_CB_ASSERT(m_callbacks, node != NONE); + _RYML_CB_ASSERT(m_callbacks, where != NONE); + _copy_props_wo_key(where, src, node); + duplicate_children(src, node, where, last_child(where)); +} + +//----------------------------------------------------------------------------- +size_t Tree::duplicate_children_no_rep(size_t node, size_t parent, size_t after) +{ + return duplicate_children_no_rep(this, node, parent, after); +} + +size_t Tree::duplicate_children_no_rep(Tree const *src, size_t node, size_t parent, size_t after) +{ + _RYML_CB_ASSERT(m_callbacks, node != NONE); + _RYML_CB_ASSERT(m_callbacks, parent != NONE); + _RYML_CB_ASSERT(m_callbacks, after == NONE || has_child(parent, after)); + + // don't loop using pointers as there may be a relocation + + // find the position where "after" is + size_t after_pos = NONE; + if(after != NONE) + { + for(size_t i = first_child(parent), icount = 0; i != NONE; ++icount, i = next_sibling(i)) + { + if(i == after) + { + after_pos = icount; + break; + } + } + _RYML_CB_ASSERT(m_callbacks, after_pos != NONE); + } + + // for each child to be duplicated... + size_t prev = after; + for(size_t i = src->first_child(node), icount = 0; i != NONE; ++icount, i = src->next_sibling(i)) + { + if(is_seq(parent)) + { + prev = duplicate(i, parent, prev); + } + else + { + _RYML_CB_ASSERT(m_callbacks, is_map(parent)); + // does the parent already have a node with key equal to that of the current duplicate? + size_t rep = NONE, rep_pos = NONE; + for(size_t j = first_child(parent), jcount = 0; j != NONE; ++jcount, j = next_sibling(j)) + { + if(key(j) == key(i)) + { + rep = j; + rep_pos = jcount; + break; + } + } + if(rep == NONE) // there is no repetition; just duplicate + { + prev = duplicate(src, i, parent, prev); + } + else // yes, there is a repetition + { + if(after_pos != NONE && rep_pos < after_pos) + { + // rep is located before the node which will be inserted, + // and will be overridden by the duplicate. So replace it. + remove(rep); + prev = duplicate(src, i, parent, prev); + } + else if(prev == NONE) + { + // first iteration with prev = after = NONE and repetition + prev = rep; + } + else if(rep != prev) + { + // rep is located after the node which will be inserted + // and overrides it. So move the rep into this node's place. 
+ move(rep, prev); + prev = rep; + } + } // there's a repetition + } + } + + return prev; +} + + +//----------------------------------------------------------------------------- + +void Tree::merge_with(Tree const *src, size_t src_node, size_t dst_node) +{ + _RYML_CB_ASSERT(m_callbacks, src != nullptr); + if(src_node == NONE) + src_node = src->root_id(); + if(dst_node == NONE) + dst_node = root_id(); + _RYML_CB_ASSERT(m_callbacks, src->has_val(src_node) || src->is_seq(src_node) || src->is_map(src_node)); + + if(src->has_val(src_node)) + { + if( ! has_val(dst_node)) + { + if(has_children(dst_node)) + remove_children(dst_node); + } + if(src->is_keyval(src_node)) + _copy_props(dst_node, src, src_node); + else if(src->is_val(src_node)) + _copy_props_wo_key(dst_node, src, src_node); + else + C4_NEVER_REACH(); + } + else if(src->is_seq(src_node)) + { + if( ! is_seq(dst_node)) + { + if(has_children(dst_node)) + remove_children(dst_node); + _clear_type(dst_node); + if(src->has_key(src_node)) + to_seq(dst_node, src->key(src_node)); + else + to_seq(dst_node); + } + for(size_t sch = src->first_child(src_node); sch != NONE; sch = src->next_sibling(sch)) + { + size_t dch = append_child(dst_node); + _copy_props_wo_key(dch, src, sch); + merge_with(src, sch, dch); + } + } + else if(src->is_map(src_node)) + { + if( ! is_map(dst_node)) + { + if(has_children(dst_node)) + remove_children(dst_node); + _clear_type(dst_node); + if(src->has_key(src_node)) + to_map(dst_node, src->key(src_node)); + else + to_map(dst_node); + } + for(size_t sch = src->first_child(src_node); sch != NONE; sch = src->next_sibling(sch)) + { + size_t dch = find_child(dst_node, src->key(sch)); + if(dch == NONE) + { + dch = append_child(dst_node); + _copy_props(dch, src, sch); + } + merge_with(src, sch, dch); + } + } + else + { + C4_NEVER_REACH(); + } +} + + +//----------------------------------------------------------------------------- + +namespace detail { +/** @todo make this part of the public API, refactoring as appropriate + * to be able to use the same resolver to handle multiple trees (one + * at a time) */ +struct ReferenceResolver +{ + struct refdata + { + NodeType type; + size_t node; + size_t prev_anchor; + size_t target; + size_t parent_ref; + size_t parent_ref_sibling; + }; + + Tree *t; + /** from the specs: "an alias node refers to the most recent + * node in the serialization having the specified anchor". So + * we need to start looking upward from ref nodes. 
+     *
+     * @see http://yaml.org/spec/1.2/spec.html#id2765878 */
+    stack<refdata> refs;
+
+    ReferenceResolver(Tree *t_) : t(t_), refs(t_->callbacks())
+    {
+        resolve();
+    }
+
+    void store_anchors_and_refs()
+    {
+        // minimize (re-)allocations by counting first
+        size_t num_anchors_and_refs = count_anchors_and_refs(t->root_id());
+        if(!num_anchors_and_refs)
+            return;
+        refs.reserve(num_anchors_and_refs);
+
+        // now descend through the hierarchy
+        _store_anchors_and_refs(t->root_id());
+
+        // finally connect the reference list
+        size_t prev_anchor = npos;
+        size_t count = 0;
+        for(auto &rd : refs)
+        {
+            rd.prev_anchor = prev_anchor;
+            if(rd.type.is_anchor())
+                prev_anchor = count;
+            ++count;
+        }
+    }
+
+    size_t count_anchors_and_refs(size_t n)
+    {
+        size_t c = 0;
+        c += t->has_key_anchor(n);
+        c += t->has_val_anchor(n);
+        c += t->is_key_ref(n);
+        c += t->is_val_ref(n);
+        for(size_t ch = t->first_child(n); ch != NONE; ch = t->next_sibling(ch))
+            c += count_anchors_and_refs(ch);
+        return c;
+    }
+
+    void _store_anchors_and_refs(size_t n)
+    {
+        if(t->is_key_ref(n) || t->is_val_ref(n) || (t->has_key(n) && t->key(n) == "<<"))
+        {
+            if(t->is_seq(n))
+            {
+                // for merging multiple inheritance targets
+                // <<: [ *CENTER, *BIG ]
+                for(size_t ich = t->first_child(n); ich != NONE; ich = t->next_sibling(ich))
+                {
+                    RYML_ASSERT(t->num_children(ich) == 0);
+                    refs.push({VALREF, ich, npos, npos, n, t->next_sibling(n)});
+                }
+                return;
+            }
+            if(t->is_key_ref(n) && t->key(n) != "<<") // insert key refs BEFORE inserting val refs
+            {
+                RYML_CHECK((!t->has_key(n)) || t->key(n).ends_with(t->key_ref(n)));
+                refs.push({KEYREF, n, npos, npos, NONE, NONE});
+            }
+            if(t->is_val_ref(n))
+            {
+                RYML_CHECK((!t->has_val(n)) || t->val(n).ends_with(t->val_ref(n)));
+                refs.push({VALREF, n, npos, npos, NONE, NONE});
+            }
+        }
+        if(t->has_key_anchor(n))
+        {
+            RYML_CHECK(t->has_key(n));
+            refs.push({KEYANCH, n, npos, npos, NONE, NONE});
+        }
+        if(t->has_val_anchor(n))
+        {
+            RYML_CHECK(t->has_val(n) || t->is_container(n));
+            refs.push({VALANCH, n, npos, npos, NONE, NONE});
+        }
+        for(size_t ch = t->first_child(n); ch != NONE; ch = t->next_sibling(ch))
+        {
+            _store_anchors_and_refs(ch);
+        }
+    }
+
+    size_t lookup_(refdata *C4_RESTRICT ra)
+    {
+        RYML_ASSERT(ra->type.is_key_ref() || ra->type.is_val_ref());
+        RYML_ASSERT(ra->type.is_key_ref() != ra->type.is_val_ref());
+        csubstr refname;
+        if(ra->type.is_val_ref())
+        {
+            refname = t->val_ref(ra->node);
+        }
+        else
+        {
+            RYML_ASSERT(ra->type.is_key_ref());
+            refname = t->key_ref(ra->node);
+        }
+        while(ra->prev_anchor != npos)
+        {
+            ra = &refs[ra->prev_anchor];
+            if(t->has_anchor(ra->node, refname))
+                return ra->node;
+        }
+
+        #ifndef RYML_ERRMSG_SIZE
+        #define RYML_ERRMSG_SIZE 1024
+        #endif
+
+        char errmsg[RYML_ERRMSG_SIZE];
+        snprintf(errmsg, RYML_ERRMSG_SIZE, "anchor does not exist: '%.*s'",
+                 static_cast<int>(refname.size()), refname.data());
+        c4::yml::error(errmsg);
+        return NONE;
+    }
+
+    void resolve()
+    {
+        store_anchors_and_refs();
+        if(refs.empty())
+            return;
+
+        /* from the specs: "an alias node refers to the most recent
+         * node in the serialization having the specified anchor". So
+         * we need to start looking upward from ref nodes.
+         *
+         * @see http://yaml.org/spec/1.2/spec.html#id2765878 */
+        for(size_t i = 0, e = refs.size(); i < e; ++i)
+        {
+            auto &C4_RESTRICT rd = refs.top(i);
+            if( ! 
rd.type.is_ref()) + continue; + rd.target = lookup_(&rd); + } + } + +}; // ReferenceResolver +} // namespace detail + +void Tree::resolve() +{ + if(m_size == 0) + return; + + detail::ReferenceResolver rr(this); + + // insert the resolved references + size_t prev_parent_ref = NONE; + size_t prev_parent_ref_after = NONE; + for(auto const& C4_RESTRICT rd : rr.refs) + { + if( ! rd.type.is_ref()) + continue; + if(rd.parent_ref != NONE) + { + _RYML_CB_ASSERT(m_callbacks, is_seq(rd.parent_ref)); + size_t after, p = parent(rd.parent_ref); + if(prev_parent_ref != rd.parent_ref) + { + after = rd.parent_ref;//prev_sibling(rd.parent_ref_sibling); + prev_parent_ref_after = after; + } + else + { + after = prev_parent_ref_after; + } + prev_parent_ref = rd.parent_ref; + prev_parent_ref_after = duplicate_children_no_rep(rd.target, p, after); + remove(rd.node); + } + else + { + if(has_key(rd.node) && is_key_ref(rd.node) && key(rd.node) == "<<") + { + _RYML_CB_ASSERT(m_callbacks, is_keyval(rd.node)); + size_t p = parent(rd.node); + size_t after = prev_sibling(rd.node); + duplicate_children_no_rep(rd.target, p, after); + remove(rd.node); + } + else if(rd.type.is_key_ref()) + { + _RYML_CB_ASSERT(m_callbacks, is_key_ref(rd.node)); + _RYML_CB_ASSERT(m_callbacks, has_key_anchor(rd.target) || has_val_anchor(rd.target)); + if(has_val_anchor(rd.target) && val_anchor(rd.target) == key_ref(rd.node)) + { + _RYML_CB_CHECK(m_callbacks, !is_container(rd.target)); + _RYML_CB_CHECK(m_callbacks, has_val(rd.target)); + _p(rd.node)->m_key.scalar = val(rd.target); + _add_flags(rd.node, KEY); + } + else + { + _RYML_CB_CHECK(m_callbacks, key_anchor(rd.target) == key_ref(rd.node)); + _p(rd.node)->m_key.scalar = key(rd.target); + _add_flags(rd.node, VAL); + } + } + else + { + _RYML_CB_ASSERT(m_callbacks, rd.type.is_val_ref()); + if(has_key_anchor(rd.target) && key_anchor(rd.target) == val_ref(rd.node)) + { + _RYML_CB_CHECK(m_callbacks, !is_container(rd.target)); + _RYML_CB_CHECK(m_callbacks, has_val(rd.target)); + _p(rd.node)->m_val.scalar = key(rd.target); + _add_flags(rd.node, VAL); + } + else + { + duplicate_contents(rd.target, rd.node); + } + } + } + } + + // clear anchors and refs + for(auto const& C4_RESTRICT ar : rr.refs) + { + rem_anchor_ref(ar.node); + if(ar.parent_ref != NONE) + if(type(ar.parent_ref) != NOTYPE) + remove(ar.parent_ref); + } + +} + +//----------------------------------------------------------------------------- + +size_t Tree::num_children(size_t node) const +{ + size_t count = 0; + for(size_t i = first_child(node); i != NONE; i = next_sibling(i)) + ++count; + return count; +} + +size_t Tree::child(size_t node, size_t pos) const +{ + _RYML_CB_ASSERT(m_callbacks, node != NONE); + size_t count = 0; + for(size_t i = first_child(node); i != NONE; i = next_sibling(i)) + { + if(count++ == pos) + return i; + } + return NONE; +} + +size_t Tree::child_pos(size_t node, size_t ch) const +{ + size_t count = 0; + for(size_t i = first_child(node); i != NONE; i = next_sibling(i)) + { + if(i == ch) + return count; + ++count; + } + return npos; +} + +#if defined(__clang__) +# pragma clang diagnostic push +# pragma GCC diagnostic ignored "-Wnull-dereference" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# if __GNUC__ >= 6 +# pragma GCC diagnostic ignored "-Wnull-dereference" +# endif +#endif + +size_t Tree::find_child(size_t node, csubstr const& name) const +{ + _RYML_CB_ASSERT(m_callbacks, node != NONE); + _RYML_CB_ASSERT(m_callbacks, is_map(node)); + if(get(node)->m_first_child == NONE) + { + 
_RYML_CB_ASSERT(m_callbacks, _p(node)->m_last_child == NONE); + return NONE; + } + else + { + _RYML_CB_ASSERT(m_callbacks, _p(node)->m_last_child != NONE); + } + for(size_t i = first_child(node); i != NONE; i = next_sibling(i)) + { + if(_p(i)->m_key.scalar == name) + { + return i; + } + } + return NONE; +} + +#if defined(__clang__) +# pragma clang diagnostic pop +#elif defined(__GNUC__) +# pragma GCC diagnostic pop +#endif + + +//----------------------------------------------------------------------------- + +void Tree::to_val(size_t node, csubstr val, type_bits more_flags) +{ + _RYML_CB_ASSERT(m_callbacks, ! has_children(node)); + _RYML_CB_ASSERT(m_callbacks, parent(node) == NONE || ! parent_is_map(node)); + _set_flags(node, VAL|more_flags); + _p(node)->m_key.clear(); + _p(node)->m_val = val; +} + +void Tree::to_keyval(size_t node, csubstr key, csubstr val, type_bits more_flags) +{ + _RYML_CB_ASSERT(m_callbacks, ! has_children(node)); + _RYML_CB_ASSERT(m_callbacks, parent(node) == NONE || parent_is_map(node)); + _set_flags(node, KEYVAL|more_flags); + _p(node)->m_key = key; + _p(node)->m_val = val; +} + +void Tree::to_map(size_t node, type_bits more_flags) +{ + _RYML_CB_ASSERT(m_callbacks, ! has_children(node)); + _RYML_CB_ASSERT(m_callbacks, parent(node) == NONE || ! parent_is_map(node)); // parent must not have children with keys + _set_flags(node, MAP|more_flags); + _p(node)->m_key.clear(); + _p(node)->m_val.clear(); +} + +void Tree::to_map(size_t node, csubstr key, type_bits more_flags) +{ + _RYML_CB_ASSERT(m_callbacks, ! has_children(node)); + _RYML_CB_ASSERT(m_callbacks, parent(node) == NONE || parent_is_map(node)); + _set_flags(node, KEY|MAP|more_flags); + _p(node)->m_key = key; + _p(node)->m_val.clear(); +} + +void Tree::to_seq(size_t node, type_bits more_flags) +{ + _RYML_CB_ASSERT(m_callbacks, ! has_children(node)); + _RYML_CB_ASSERT(m_callbacks, parent(node) == NONE || parent_is_seq(node)); + _set_flags(node, SEQ|more_flags); + _p(node)->m_key.clear(); + _p(node)->m_val.clear(); +} + +void Tree::to_seq(size_t node, csubstr key, type_bits more_flags) +{ + _RYML_CB_ASSERT(m_callbacks, ! has_children(node)); + _RYML_CB_ASSERT(m_callbacks, parent(node) == NONE || parent_is_map(node)); + _set_flags(node, KEY|SEQ|more_flags); + _p(node)->m_key = key; + _p(node)->m_val.clear(); +} + +void Tree::to_doc(size_t node, type_bits more_flags) +{ + _RYML_CB_ASSERT(m_callbacks, ! has_children(node)); + _set_flags(node, DOC|more_flags); + _p(node)->m_key.clear(); + _p(node)->m_val.clear(); +} + +void Tree::to_stream(size_t node, type_bits more_flags) +{ + _RYML_CB_ASSERT(m_callbacks, ! 
has_children(node)); + _set_flags(node, STREAM|more_flags); + _p(node)->m_key.clear(); + _p(node)->m_val.clear(); +} + + +//----------------------------------------------------------------------------- +size_t Tree::num_tag_directives() const +{ + // this assumes we have a very small number of tag directives + for(size_t i = 0; i < RYML_MAX_TAG_DIRECTIVES; ++i) + if(m_tag_directives[i].handle.empty()) + return i; + return RYML_MAX_TAG_DIRECTIVES; +} + +void Tree::clear_tag_directives() +{ + for(TagDirective &td : m_tag_directives) + td = {}; +} + +size_t Tree::add_tag_directive(TagDirective const& td) +{ + _RYML_CB_CHECK(m_callbacks, !td.handle.empty()); + _RYML_CB_CHECK(m_callbacks, !td.prefix.empty()); + _RYML_CB_ASSERT(m_callbacks, td.handle.begins_with('!')); + _RYML_CB_ASSERT(m_callbacks, td.handle.ends_with('!')); + // https://yaml.org/spec/1.2.2/#rule-ns-word-char + _RYML_CB_ASSERT(m_callbacks, td.handle == '!' || td.handle == "!!" || td.handle.trim('!').first_not_of("01234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-") == npos); + size_t pos = num_tag_directives(); + _RYML_CB_CHECK(m_callbacks, pos < RYML_MAX_TAG_DIRECTIVES); + m_tag_directives[pos] = td; + return pos; +} + +size_t Tree::resolve_tag(substr output, csubstr tag, size_t node_id) const +{ + // lookup from the end. We want to find the first directive that + // matches the tag and has a target node id leq than the given + // node_id. + for(size_t i = RYML_MAX_TAG_DIRECTIVES-1; i != (size_t)-1; --i) + { + auto const& td = m_tag_directives[i]; + if(td.handle.empty()) + continue; + if(tag.begins_with(td.handle) && td.next_node_id <= node_id) + { + _RYML_CB_ASSERT(m_callbacks, tag.len >= td.handle.len); + csubstr rest = tag.sub(td.handle.len); + size_t len = 1u + td.prefix.len + rest.len + 1u; + size_t numpc = rest.count('%'); + if(numpc == 0) + { + if(len <= output.len) + { + output.str[0] = '<'; + memcpy(1u + output.str, td.prefix.str, td.prefix.len); + memcpy(1u + output.str + td.prefix.len, rest.str, rest.len); + output.str[1u + td.prefix.len + rest.len] = '>'; + } + } + else + { + // need to decode URI % sequences + size_t pos = rest.find('%'); + _RYML_CB_ASSERT(m_callbacks, pos != npos); + do { + size_t next = rest.first_not_of("0123456789abcdefABCDEF", pos+1); + if(next == npos) + next = rest.len; + _RYML_CB_CHECK(m_callbacks, pos+1 < next); + _RYML_CB_CHECK(m_callbacks, pos+1 + 2 <= next); + size_t delta = next - (pos+1); + len -= delta; + pos = rest.find('%', pos+1); + } while(pos != npos); + if(len <= output.len) + { + size_t prev = 0, wpos = 0; + auto appendstr = [&](csubstr s) { memcpy(output.str + wpos, s.str, s.len); wpos += s.len; }; + auto appendchar = [&](char c) { output.str[wpos++] = c; }; + appendchar('<'); + appendstr(td.prefix); + pos = rest.find('%'); + _RYML_CB_ASSERT(m_callbacks, pos != npos); + do { + size_t next = rest.first_not_of("0123456789abcdefABCDEF", pos+1); + if(next == npos) + next = rest.len; + _RYML_CB_CHECK(m_callbacks, pos+1 < next); + _RYML_CB_CHECK(m_callbacks, pos+1 + 2 <= next); + uint8_t val; + if(C4_UNLIKELY(!read_hex(rest.range(pos+1, next), &val) || val > 127)) + _RYML_CB_ERR(m_callbacks, "invalid URI character"); + appendstr(rest.range(prev, pos)); + appendchar((char)val); + prev = next; + pos = rest.find('%', pos+1); + } while(pos != npos); + _RYML_CB_ASSERT(m_callbacks, pos == npos); + _RYML_CB_ASSERT(m_callbacks, prev > 0); + _RYML_CB_ASSERT(m_callbacks, rest.len >= prev); + appendstr(rest.sub(prev)); + appendchar('>'); + _RYML_CB_ASSERT(m_callbacks, wpos == 
len); + } + } + return len; + } + } + return 0; // return 0 to signal that the tag is local and cannot be resolved +} + +namespace { +csubstr _transform_tag(Tree *t, csubstr tag, size_t node) +{ + size_t required_size = t->resolve_tag(substr{}, tag, node); + if(!required_size) + return tag; + const char *prev_arena = t->arena().str; + substr buf = t->alloc_arena(required_size); + _RYML_CB_ASSERT(t->m_callbacks, t->arena().str == prev_arena); + size_t actual_size = t->resolve_tag(buf, tag, node); + _RYML_CB_ASSERT(t->m_callbacks, actual_size <= required_size); + return buf.first(actual_size); +} +void _resolve_tags(Tree *t, size_t node) +{ + for(size_t child = t->first_child(node); child != NONE; child = t->next_sibling(child)) + { + if(t->has_key(child) && t->has_key_tag(child)) + t->set_key_tag(child, _transform_tag(t, t->key_tag(child), child)); + if(t->has_val(child) && t->has_val_tag(child)) + t->set_val_tag(child, _transform_tag(t, t->val_tag(child), child)); + _resolve_tags(t, child); + } +} +size_t _count_resolved_tags_size(Tree const* t, size_t node) +{ + size_t sz = 0; + for(size_t child = t->first_child(node); child != NONE; child = t->next_sibling(child)) + { + if(t->has_key(child) && t->has_key_tag(child)) + sz += t->resolve_tag(substr{}, t->key_tag(child), child); + if(t->has_val(child) && t->has_val_tag(child)) + sz += t->resolve_tag(substr{}, t->val_tag(child), child); + sz += _count_resolved_tags_size(t, child); + } + return sz; +} +} // namespace + +void Tree::resolve_tags() +{ + if(empty()) + return; + if(num_tag_directives() == 0) + return; + size_t needed_size = _count_resolved_tags_size(this, root_id()); + if(needed_size) + reserve_arena(arena_size() + needed_size); + _resolve_tags(this, root_id()); +} + + +//----------------------------------------------------------------------------- + +csubstr Tree::lookup_result::resolved() const +{ + csubstr p = path.first(path_pos); + if(p.ends_with('.')) + p = p.first(p.len-1); + return p; +} + +csubstr Tree::lookup_result::unresolved() const +{ + return path.sub(path_pos); +} + +void Tree::_advance(lookup_result *r, size_t more) const +{ + r->path_pos += more; + if(r->path.sub(r->path_pos).begins_with('.')) + ++r->path_pos; +} + +Tree::lookup_result Tree::lookup_path(csubstr path, size_t start) const +{ + if(start == NONE) + start = root_id(); + lookup_result r(path, start); + if(path.empty()) + return r; + _lookup_path(&r); + if(r.target == NONE && r.closest == start) + r.closest = NONE; + return r; +} + +size_t Tree::lookup_path_or_modify(csubstr default_value, csubstr path, size_t start) +{ + size_t target = _lookup_path_or_create(path, start); + if(parent_is_map(target)) + to_keyval(target, key(target), default_value); + else + to_val(target, default_value); + return target; +} + +size_t Tree::lookup_path_or_modify(Tree const *src, size_t src_node, csubstr path, size_t start) +{ + size_t target = _lookup_path_or_create(path, start); + merge_with(src, src_node, target); + return target; +} + +size_t Tree::_lookup_path_or_create(csubstr path, size_t start) +{ + if(start == NONE) + start = root_id(); + lookup_result r(path, start); + _lookup_path(&r); + if(r.target != NONE) + { + C4_ASSERT(r.unresolved().empty()); + return r.target; + } + _lookup_path_modify(&r); + return r.target; +} + +void Tree::_lookup_path(lookup_result *r) const +{ + C4_ASSERT( ! 
r->unresolved().empty()); + _lookup_path_token parent{"", type(r->closest)}; + size_t node; + do + { + node = _next_node(r, &parent); + if(node != NONE) + r->closest = node; + if(r->unresolved().empty()) + { + r->target = node; + return; + } + } while(node != NONE); +} + +void Tree::_lookup_path_modify(lookup_result *r) +{ + C4_ASSERT( ! r->unresolved().empty()); + _lookup_path_token parent{"", type(r->closest)}; + size_t node; + do + { + node = _next_node_modify(r, &parent); + if(node != NONE) + r->closest = node; + if(r->unresolved().empty()) + { + r->target = node; + return; + } + } while(node != NONE); +} + +size_t Tree::_next_node(lookup_result * r, _lookup_path_token *parent) const +{ + _lookup_path_token token = _next_token(r, *parent); + if( ! token) + return NONE; + + size_t node = NONE; + csubstr prev = token.value; + if(token.type == MAP || token.type == SEQ) + { + _RYML_CB_ASSERT(m_callbacks, !token.value.begins_with('[')); + //_RYML_CB_ASSERT(m_callbacks, is_container(r->closest) || r->closest == NONE); + _RYML_CB_ASSERT(m_callbacks, is_map(r->closest)); + node = find_child(r->closest, token.value); + } + else if(token.type == KEYVAL) + { + _RYML_CB_ASSERT(m_callbacks, r->unresolved().empty()); + if(is_map(r->closest)) + node = find_child(r->closest, token.value); + } + else if(token.type == KEY) + { + _RYML_CB_ASSERT(m_callbacks, token.value.begins_with('[') && token.value.ends_with(']')); + token.value = token.value.offs(1, 1).trim(' '); + size_t idx = 0; + _RYML_CB_CHECK(m_callbacks, from_chars(token.value, &idx)); + node = child(r->closest, idx); + } + else + { + C4_NEVER_REACH(); + } + + if(node != NONE) + { + *parent = token; + } + else + { + csubstr p = r->path.sub(r->path_pos > 0 ? r->path_pos - 1 : r->path_pos); + r->path_pos -= prev.len; + if(p.begins_with('.')) + r->path_pos -= 1u; + } + + return node; +} + +size_t Tree::_next_node_modify(lookup_result * r, _lookup_path_token *parent) +{ + _lookup_path_token token = _next_token(r, *parent); + if( ! token) + return NONE; + + size_t node = NONE; + if(token.type == MAP || token.type == SEQ) + { + _RYML_CB_ASSERT(m_callbacks, !token.value.begins_with('[')); + //_RYML_CB_ASSERT(m_callbacks, is_container(r->closest) || r->closest == NONE); + if( ! is_container(r->closest)) + { + if(has_key(r->closest)) + to_map(r->closest, key(r->closest)); + else + to_map(r->closest); + } + else + { + if(is_map(r->closest)) + node = find_child(r->closest, token.value); + else + { + size_t pos = NONE; + _RYML_CB_CHECK(m_callbacks, c4::atox(token.value, &pos)); + _RYML_CB_ASSERT(m_callbacks, pos != NONE); + node = child(r->closest, pos); + } + } + if(node == NONE) + { + _RYML_CB_ASSERT(m_callbacks, is_map(r->closest)); + node = append_child(r->closest); + NodeData *n = _p(node); + n->m_key.scalar = token.value; + n->m_type.add(KEY); + } + } + else if(token.type == KEYVAL) + { + _RYML_CB_ASSERT(m_callbacks, r->unresolved().empty()); + if(is_map(r->closest)) + { + node = find_child(r->closest, token.value); + if(node == NONE) + node = append_child(r->closest); + } + else + { + _RYML_CB_ASSERT(m_callbacks, !is_seq(r->closest)); + _add_flags(r->closest, MAP); + node = append_child(r->closest); + } + NodeData *n = _p(node); + n->m_key.scalar = token.value; + n->m_val.scalar = ""; + n->m_type.add(KEYVAL); + } + else if(token.type == KEY) + { + _RYML_CB_ASSERT(m_callbacks, token.value.begins_with('[') && token.value.ends_with(']')); + token.value = token.value.offs(1, 1).trim(' '); + size_t idx; + if( ! 
from_chars(token.value, &idx)) + return NONE; + if( ! is_container(r->closest)) + { + if(has_key(r->closest)) + { + csubstr k = key(r->closest); + _clear_type(r->closest); + to_seq(r->closest, k); + } + else + { + _clear_type(r->closest); + to_seq(r->closest); + } + } + _RYML_CB_ASSERT(m_callbacks, is_container(r->closest)); + node = child(r->closest, idx); + if(node == NONE) + { + _RYML_CB_ASSERT(m_callbacks, num_children(r->closest) <= idx); + for(size_t i = num_children(r->closest); i <= idx; ++i) + { + node = append_child(r->closest); + if(i < idx) + { + if(is_map(r->closest)) + to_keyval(node, /*"~"*/{}, /*"~"*/{}); + else if(is_seq(r->closest)) + to_val(node, /*"~"*/{}); + } + } + } + } + else + { + C4_NEVER_REACH(); + } + + _RYML_CB_ASSERT(m_callbacks, node != NONE); + *parent = token; + return node; +} + +/** types of tokens: + * - seeing "map." ---> "map"/MAP + * - finishing "scalar" ---> "scalar"/KEYVAL + * - seeing "seq[n]" ---> "seq"/SEQ (--> "[n]"/KEY) + * - seeing "[n]" ---> "[n]"/KEY + */ +Tree::_lookup_path_token Tree::_next_token(lookup_result *r, _lookup_path_token const& parent) const +{ + csubstr unres = r->unresolved(); + if(unres.empty()) + return {}; + + // is it an indexation like [0], [1], etc? + if(unres.begins_with('[')) + { + size_t pos = unres.find(']'); + if(pos == csubstr::npos) + return {}; + csubstr idx = unres.first(pos + 1); + _advance(r, pos + 1); + return {idx, KEY}; + } + + // no. so it must be a name + size_t pos = unres.first_of(".["); + if(pos == csubstr::npos) + { + _advance(r, unres.len); + NodeType t; + if(( ! parent) || parent.type.is_seq()) + return {unres, VAL}; + return {unres, KEYVAL}; + } + + // it's either a map or a seq + _RYML_CB_ASSERT(m_callbacks, unres[pos] == '.' || unres[pos] == '['); + if(unres[pos] == '.') + { + _RYML_CB_ASSERT(m_callbacks, pos != 0); + _advance(r, pos + 1); + return {unres.first(pos), MAP}; + } + + _RYML_CB_ASSERT(m_callbacks, unres[pos] == '['); + _advance(r, pos); + return {unres.first(pos), SEQ}; +} + + +} // namespace ryml +} // namespace c4 + + +C4_SUPPRESS_WARNING_GCC_POP +C4_SUPPRESS_WARNING_MSVC_POP + +#endif /* RYML_SINGLE_HDR_DEFINE_NOW */ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/tree.cpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/parse.cpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/parse.cpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifdef RYML_SINGLE_HDR_DEFINE_NOW +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/parse.hpp +//#include "c4/yml/parse.hpp" +#if !defined(C4_YML_PARSE_HPP_) && !defined(_C4_YML_PARSE_HPP_) +#error "amalgamate: file c4/yml/parse.hpp must have been included at this point" +#endif /* C4_YML_PARSE_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/error.hpp +//#include "c4/error.hpp" +#if !defined(C4_ERROR_HPP_) && !defined(_C4_ERROR_HPP_) +#error "amalgamate: file c4/error.hpp must have been included at this point" +#endif /* C4_ERROR_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/utf.hpp +//#include "c4/utf.hpp" +#if !defined(C4_UTF_HPP_) && !defined(_C4_UTF_HPP_) +#error "amalgamate: file c4/utf.hpp must have been included at this point" +#endif /* 
C4_UTF_HPP_ */
+
+// amalgamate: removed include of
+// https://github.com/biojppm/rapidyaml/src/c4/dump.hpp
+//#include <c4/dump.hpp>
+#if !defined(C4_DUMP_HPP_) && !defined(_C4_DUMP_HPP_)
+#error "amalgamate: file c4/dump.hpp must have been included at this point"
+#endif /* C4_DUMP_HPP_ */
+
+
+//included above:
+//#include
+//included above:
+//#include
+//included above:
+//#include
+
+// amalgamate: removed include of
+// https://github.com/biojppm/rapidyaml/src/c4/yml/detail/parser_dbg.hpp
+//#include "c4/yml/detail/parser_dbg.hpp"
+#if !defined(C4_YML_DETAIL_PARSER_DBG_HPP_) && !defined(_C4_YML_DETAIL_PARSER_DBG_HPP_)
+#error "amalgamate: file c4/yml/detail/parser_dbg.hpp must have been included at this point"
+#endif /* C4_YML_DETAIL_PARSER_DBG_HPP_ */
+
+#ifdef RYML_DBG
+// amalgamate: removed include of
+// https://github.com/biojppm/rapidyaml/src/c4/yml/detail/print.hpp
+//#include "c4/yml/detail/print.hpp"
+#if !defined(C4_YML_DETAIL_PRINT_HPP_) && !defined(_C4_YML_DETAIL_PRINT_HPP_)
+#error "amalgamate: file c4/yml/detail/print.hpp must have been included at this point"
+#endif /* C4_YML_DETAIL_PRINT_HPP_ */
+
+#endif
+
+#ifndef RYML_ERRMSG_SIZE
+ #define RYML_ERRMSG_SIZE 1024
+#endif
+
+//#define RYML_WITH_TAB_TOKENS
+#ifdef RYML_WITH_TAB_TOKENS
+#define _RYML_WITH_TAB_TOKENS(...) __VA_ARGS__
+#define _RYML_WITH_OR_WITHOUT_TAB_TOKENS(with, without) with
+#else
+#define _RYML_WITH_TAB_TOKENS(...)
+#define _RYML_WITH_OR_WITHOUT_TAB_TOKENS(with, without) without
+#endif
+
+
+#if defined(_MSC_VER)
+# pragma warning(push)
+# pragma warning(disable: 4296/*expression is always 'boolean_value'*/)
+#elif defined(__clang__)
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wtype-limits" // to remove a warning on an assertion that a size_t >= 0. Later on, this size_t will turn into a template argument, and then it can become < 0.
+# pragma clang diagnostic ignored "-Wformat-nonliteral"
+#elif defined(__GNUC__)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wtype-limits" // to remove a warning on an assertion that a size_t >= 0. Later on, this size_t will turn into a template argument, and then it can become < 0.
+# pragma GCC diagnostic ignored "-Wformat-nonliteral"
+# if __GNUC__ >= 7
+# pragma GCC diagnostic ignored "-Wduplicated-branches"
+# endif
+#endif
+
+namespace c4 {
+namespace yml {
+
+namespace {
+
+template<class DumpFn, class ...Args>
+void _parse_dump(DumpFn dumpfn, c4::csubstr fmt, Args&& ...args)
+{
+    char writebuf[256];
+    auto results = c4::format_dump_resume(dumpfn, writebuf, fmt, std::forward<Args>(args)...);
+    // resume writing if the results failed to fit the buffer
+    if(C4_UNLIKELY(results.bufsize > sizeof(writebuf))) // bufsize will be that of the largest element serialized. Eg int(1), will require 1 byte.
+    {
+        results = format_dump_resume(dumpfn, results, writebuf, fmt, std::forward<Args>(args)...);
+        if(C4_UNLIKELY(results.bufsize > sizeof(writebuf)))
+        {
+            results = format_dump_resume(dumpfn, results, writebuf, fmt, std::forward<Args>(args)...);
+        }
+    }
+}
+
+bool _is_scalar_next__runk(csubstr s)
+{
+    return !(s.begins_with(": ") || s.begins_with_any("#,{}[]%&") || s.begins_with("? ") || s == "-" || s.begins_with("- ") || s.begins_with(":\"") || s.begins_with(":'"));
+}
+
+bool _is_scalar_next__rseq_rval(csubstr s)
+{
+    return !(s.begins_with_any("[{!&") || s.begins_with("? ") || s.begins_with("- ") || s == "-");
+}
+
+bool _is_scalar_next__rmap(csubstr s)
+{
+    return !(s.begins_with(": ") || s.begins_with_any("#,!&") || s.begins_with("? 
") _RYML_WITH_TAB_TOKENS(|| s.begins_with(":\t"))); +} + +bool _is_scalar_next__rmap_val(csubstr s) +{ + return !(s.begins_with("- ") || s.begins_with_any("{[") || s == "-"); +} + +bool _is_doc_sep(csubstr s) +{ + constexpr const csubstr dashes = "---"; + constexpr const csubstr ellipsis = "..."; + constexpr const csubstr whitesp = " \t"; + if(s.begins_with(dashes)) + return s == dashes || s.sub(3).begins_with_any(whitesp); + else if(s.begins_with(ellipsis)) + return s == ellipsis || s.sub(3).begins_with_any(whitesp); + return false; +} + +/** @p i is set to the first non whitespace character after the line + * @return the number of empty lines after the initial position */ +size_t count_following_newlines(csubstr r, size_t *C4_RESTRICT i, size_t indentation) +{ + RYML_ASSERT(r[*i] == '\n'); + size_t numnl_following = 0; + ++(*i); + for( ; *i < r.len; ++(*i)) + { + if(r.str[*i] == '\n') + { + ++numnl_following; + if(indentation) // skip the indentation after the newline + { + size_t stop = *i + indentation; + for( ; *i < r.len; ++(*i)) + { + if(r.str[*i] != ' ' && r.str[*i] != '\r') + break; + RYML_ASSERT(*i < stop); + } + C4_UNUSED(stop); + } + } + else if(r.str[*i] == ' ' || r.str[*i] == '\t' || r.str[*i] == '\r') // skip leading whitespace + ; + else + break; + } + return numnl_following; +} + +} // anon namespace + + +//----------------------------------------------------------------------------- + +Parser::~Parser() +{ + _free(); + _clr(); +} + +Parser::Parser(Callbacks const& cb, ParserOptions opts) + : m_options(opts) + , m_file() + , m_buf() + , m_root_id(NONE) + , m_tree() + , m_stack(cb) + , m_state() + , m_key_tag_indentation(0) + , m_key_tag2_indentation(0) + , m_key_tag() + , m_key_tag2() + , m_val_tag_indentation(0) + , m_val_tag() + , m_key_anchor_was_before(false) + , m_key_anchor_indentation(0) + , m_key_anchor() + , m_val_anchor_indentation(0) + , m_val_anchor() + , m_filter_arena() + , m_newline_offsets() + , m_newline_offsets_size(0) + , m_newline_offsets_capacity(0) + , m_newline_offsets_buf() +{ + m_stack.push(State{}); + m_state = &m_stack.top(); +} + +Parser::Parser(Parser &&that) + : m_options(that.m_options) + , m_file(that.m_file) + , m_buf(that.m_buf) + , m_root_id(that.m_root_id) + , m_tree(that.m_tree) + , m_stack(std::move(that.m_stack)) + , m_state(&m_stack.top()) + , m_key_tag_indentation(that.m_key_tag_indentation) + , m_key_tag2_indentation(that.m_key_tag2_indentation) + , m_key_tag(that.m_key_tag) + , m_key_tag2(that.m_key_tag2) + , m_val_tag_indentation(that.m_val_tag_indentation) + , m_val_tag(that.m_val_tag) + , m_key_anchor_was_before(that.m_key_anchor_was_before) + , m_key_anchor_indentation(that.m_key_anchor_indentation) + , m_key_anchor(that.m_key_anchor) + , m_val_anchor_indentation(that.m_val_anchor_indentation) + , m_val_anchor(that.m_val_anchor) + , m_filter_arena(that.m_filter_arena) + , m_newline_offsets(that.m_newline_offsets) + , m_newline_offsets_size(that.m_newline_offsets_size) + , m_newline_offsets_capacity(that.m_newline_offsets_capacity) + , m_newline_offsets_buf(that.m_newline_offsets_buf) +{ + that._clr(); +} + +Parser::Parser(Parser const& that) + : m_options(that.m_options) + , m_file(that.m_file) + , m_buf(that.m_buf) + , m_root_id(that.m_root_id) + , m_tree(that.m_tree) + , m_stack(that.m_stack) + , m_state(&m_stack.top()) + , m_key_tag_indentation(that.m_key_tag_indentation) + , m_key_tag2_indentation(that.m_key_tag2_indentation) + , m_key_tag(that.m_key_tag) + , m_key_tag2(that.m_key_tag2) + , 
m_val_tag_indentation(that.m_val_tag_indentation) + , m_val_tag(that.m_val_tag) + , m_key_anchor_was_before(that.m_key_anchor_was_before) + , m_key_anchor_indentation(that.m_key_anchor_indentation) + , m_key_anchor(that.m_key_anchor) + , m_val_anchor_indentation(that.m_val_anchor_indentation) + , m_val_anchor(that.m_val_anchor) + , m_filter_arena() + , m_newline_offsets() + , m_newline_offsets_size() + , m_newline_offsets_capacity() + , m_newline_offsets_buf() +{ + if(that.m_newline_offsets_capacity) + { + _resize_locations(that.m_newline_offsets_capacity); + _RYML_CB_CHECK(m_stack.m_callbacks, m_newline_offsets_capacity == that.m_newline_offsets_capacity); + memcpy(m_newline_offsets, that.m_newline_offsets, that.m_newline_offsets_size * sizeof(size_t)); + m_newline_offsets_size = that.m_newline_offsets_size; + } + if(that.m_filter_arena.len) + { + _resize_filter_arena(that.m_filter_arena.len); + } +} + +Parser& Parser::operator=(Parser &&that) +{ + _free(); + m_options = (that.m_options); + m_file = (that.m_file); + m_buf = (that.m_buf); + m_root_id = (that.m_root_id); + m_tree = (that.m_tree); + m_stack = std::move(that.m_stack); + m_state = (&m_stack.top()); + m_key_tag_indentation = (that.m_key_tag_indentation); + m_key_tag2_indentation = (that.m_key_tag2_indentation); + m_key_tag = (that.m_key_tag); + m_key_tag2 = (that.m_key_tag2); + m_val_tag_indentation = (that.m_val_tag_indentation); + m_val_tag = (that.m_val_tag); + m_key_anchor_was_before = (that.m_key_anchor_was_before); + m_key_anchor_indentation = (that.m_key_anchor_indentation); + m_key_anchor = (that.m_key_anchor); + m_val_anchor_indentation = (that.m_val_anchor_indentation); + m_val_anchor = (that.m_val_anchor); + m_filter_arena = that.m_filter_arena; + m_newline_offsets = (that.m_newline_offsets); + m_newline_offsets_size = (that.m_newline_offsets_size); + m_newline_offsets_capacity = (that.m_newline_offsets_capacity); + m_newline_offsets_buf = (that.m_newline_offsets_buf); + that._clr(); + return *this; +} + +Parser& Parser::operator=(Parser const& that) +{ + _free(); + m_options = (that.m_options); + m_file = (that.m_file); + m_buf = (that.m_buf); + m_root_id = (that.m_root_id); + m_tree = (that.m_tree); + m_stack = that.m_stack; + m_state = &m_stack.top(); + m_key_tag_indentation = (that.m_key_tag_indentation); + m_key_tag2_indentation = (that.m_key_tag2_indentation); + m_key_tag = (that.m_key_tag); + m_key_tag2 = (that.m_key_tag2); + m_val_tag_indentation = (that.m_val_tag_indentation); + m_val_tag = (that.m_val_tag); + m_key_anchor_was_before = (that.m_key_anchor_was_before); + m_key_anchor_indentation = (that.m_key_anchor_indentation); + m_key_anchor = (that.m_key_anchor); + m_val_anchor_indentation = (that.m_val_anchor_indentation); + m_val_anchor = (that.m_val_anchor); + if(that.m_filter_arena.len > 0) + _resize_filter_arena(that.m_filter_arena.len); + if(that.m_newline_offsets_capacity > m_newline_offsets_capacity) + _resize_locations(that.m_newline_offsets_capacity); + _RYML_CB_CHECK(m_stack.m_callbacks, m_newline_offsets_capacity >= that.m_newline_offsets_capacity); + _RYML_CB_CHECK(m_stack.m_callbacks, m_newline_offsets_capacity >= that.m_newline_offsets_size); + memcpy(m_newline_offsets, that.m_newline_offsets, that.m_newline_offsets_size * sizeof(size_t)); + m_newline_offsets_size = that.m_newline_offsets_size; + m_newline_offsets_buf = that.m_newline_offsets_buf; + return *this; +} + +void Parser::_clr() +{ + m_options = {}; + m_file = {}; + m_buf = {}; + m_root_id = {}; + m_tree = {}; + m_stack.clear(); + 
m_state = {}; + m_key_tag_indentation = {}; + m_key_tag2_indentation = {}; + m_key_tag = {}; + m_key_tag2 = {}; + m_val_tag_indentation = {}; + m_val_tag = {}; + m_key_anchor_was_before = {}; + m_key_anchor_indentation = {}; + m_key_anchor = {}; + m_val_anchor_indentation = {}; + m_val_anchor = {}; + m_filter_arena = {}; + m_newline_offsets = {}; + m_newline_offsets_size = {}; + m_newline_offsets_capacity = {}; + m_newline_offsets_buf = {}; +} + +void Parser::_free() +{ + if(m_newline_offsets) + { + _RYML_CB_FREE(m_stack.m_callbacks, m_newline_offsets, size_t, m_newline_offsets_capacity); + m_newline_offsets = nullptr; + m_newline_offsets_size = 0u; + m_newline_offsets_capacity = 0u; + m_newline_offsets_buf = 0u; + } + if(m_filter_arena.len) + { + _RYML_CB_FREE(m_stack.m_callbacks, m_filter_arena.str, char, m_filter_arena.len); + m_filter_arena = {}; + } + m_stack._free(); +} + + +//----------------------------------------------------------------------------- +void Parser::_reset() +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, m_stack.size() == 1); + m_stack.clear(); + m_stack.push({}); + m_state = &m_stack.top(); + m_state->reset(m_file.str, m_root_id); + + m_key_tag_indentation = 0; + m_key_tag2_indentation = 0; + m_key_tag.clear(); + m_key_tag2.clear(); + m_val_tag_indentation = 0; + m_val_tag.clear(); + m_key_anchor_was_before = false; + m_key_anchor_indentation = 0; + m_key_anchor.clear(); + m_val_anchor_indentation = 0; + m_val_anchor.clear(); + + if(m_options.locations()) + { + _prepare_locations(); + } +} + +//----------------------------------------------------------------------------- +template +void Parser::_fmt_msg(DumpFn &&dumpfn) const +{ + auto const& lc = m_state->line_contents; + csubstr contents = lc.stripped; + if(contents.len) + { + // print the yaml src line + size_t offs = 3u + to_chars(substr{}, m_state->pos.line) + to_chars(substr{}, m_state->pos.col); + if(m_file.len) + { + _parse_dump(dumpfn, "{}:", m_file); + offs += m_file.len + 1; + } + _parse_dump(dumpfn, "{}:{}: ", m_state->pos.line, m_state->pos.col); + csubstr maybe_full_content = (contents.len < 80u ? contents : contents.first(80u)); + csubstr maybe_ellipsis = (contents.len < 80u ? csubstr{} : csubstr("...")); + _parse_dump(dumpfn, "{}{} (size={})\n", maybe_full_content, maybe_ellipsis, contents.len); + // highlight the remaining portion of the previous line + size_t firstcol = (size_t)(lc.rem.begin() - lc.full.begin()); + size_t lastcol = firstcol + lc.rem.len; + for(size_t i = 0; i < offs + firstcol; ++i) + dumpfn(" "); + dumpfn("^"); + for(size_t i = 1, e = (lc.rem.len < 80u ? lc.rem.len : 80u); i < e; ++i) + dumpfn("~"); + _parse_dump(dumpfn, "{} (cols {}-{})\n", maybe_ellipsis, firstcol+1, lastcol+1); + } + else + { + dumpfn("\n"); + } + +#ifdef RYML_DBG + // next line: print the state flags + { + char flagbuf_[64]; + _parse_dump(dumpfn, "top state: {}\n", _prfl(flagbuf_, m_state->flags)); + } +#endif +} + + +//----------------------------------------------------------------------------- +template +void Parser::_err(csubstr fmt, Args const& C4_RESTRICT ...args) const +{ + char errmsg[RYML_ERRMSG_SIZE]; + detail::_SubstrWriter writer(errmsg); + auto dumpfn = [&writer](csubstr s){ writer.append(s); }; + _parse_dump(dumpfn, fmt, args...); + writer.append('\n'); + _fmt_msg(dumpfn); + size_t len = writer.pos < RYML_ERRMSG_SIZE ? 
writer.pos : RYML_ERRMSG_SIZE; + m_tree->m_callbacks.m_error(errmsg, len, m_state->pos, m_tree->m_callbacks.m_user_data); +} + +//----------------------------------------------------------------------------- +#ifdef RYML_DBG +template +void Parser::_dbg(csubstr fmt, Args const& C4_RESTRICT ...args) const +{ + auto dumpfn = [](csubstr s){ fwrite(s.str, 1, s.len, stdout); }; + _parse_dump(dumpfn, fmt, args...); + dumpfn("\n"); + _fmt_msg(dumpfn); +} +#endif + +//----------------------------------------------------------------------------- +bool Parser::_finished_file() const +{ + bool ret = m_state->pos.offset >= m_buf.len; + if(ret) + { + _c4dbgp("finished file!!!"); + } + return ret; +} + +//----------------------------------------------------------------------------- +bool Parser::_finished_line() const +{ + return m_state->line_contents.rem.empty(); +} + +//----------------------------------------------------------------------------- +void Parser::parse_in_place(csubstr file, substr buf, Tree *t, size_t node_id) +{ + m_file = file; + m_buf = buf; + m_root_id = node_id; + m_tree = t; + _reset(); + while( ! _finished_file()) + { + _scan_line(); + while( ! _finished_line()) + _handle_line(); + if(_finished_file()) + break; // it may have finished because of multiline blocks + _line_ended(); + } + _handle_finished_file(); +} + +//----------------------------------------------------------------------------- +void Parser::_handle_finished_file() +{ + _end_stream(); +} + +//----------------------------------------------------------------------------- +void Parser::_handle_line() +{ + _c4dbgq("\n-----------"); + _c4dbgt("handling line={}, offset={}B", m_state->pos.line, m_state->pos.offset); + _RYML_CB_ASSERT(m_stack.m_callbacks, ! m_state->line_contents.rem.empty()); + if(has_any(RSEQ)) + { + if(has_any(FLOW)) + { + if(_handle_seq_flow()) + return; + } + else + { + if(_handle_seq_blck()) + return; + } + } + else if(has_any(RMAP)) + { + if(has_any(FLOW)) + { + if(_handle_map_flow()) + return; + } + else + { + if(_handle_map_blck()) + return; + } + } + else if(has_any(RUNK)) + { + if(_handle_unk()) + return; + } + + if(_handle_top()) + return; +} + + +//----------------------------------------------------------------------------- +bool Parser::_handle_unk() +{ + _c4dbgp("handle_unk"); + + csubstr rem = m_state->line_contents.rem; + const bool start_as_child = (node(m_state) == nullptr); + + if(C4_UNLIKELY(has_any(NDOC))) + { + if(rem == "---" || rem.begins_with("--- ")) + { + _start_new_doc(rem); + return true; + } + auto trimmed = rem.triml(' '); + if(trimmed == "---" || trimmed.begins_with("--- ")) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, rem.len >= trimmed.len); + _line_progressed(rem.len - trimmed.len); + _start_new_doc(trimmed); + _save_indentation(); + return true; + } + else if(trimmed.begins_with("...")) + { + _end_stream(); + } + else if(trimmed.first_of("#%") == csubstr::npos) // neither a doc nor a tag + { + _c4dbgpf("starting implicit doc to accomodate unexpected tokens: '{}'", rem); + size_t indref = m_state->indref; + _push_level(); + _start_doc(); + _set_indentation(indref); + } + _RYML_CB_ASSERT(m_stack.m_callbacks, !trimmed.empty()); + } + + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT|RSEQ|RMAP)); + if(m_state->indref > 0) + { + csubstr ws = rem.left_of(rem.first_not_of(' ')); + if(m_state->indref <= ws.len) + { + _c4dbgpf("skipping base indentation of {}", m_state->indref); + _line_progressed(m_state->indref); + rem = rem.sub(m_state->indref); + } + } + + 
if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t"))) + { + _c4dbgpf("it's a seq (as_child={})", start_as_child); + _move_key_anchor_to_val_anchor(); + _move_key_tag_to_val_tag(); + _push_level(); + _start_seq(start_as_child); + _save_indentation(); + _line_progressed(2); + return true; + } + else if(rem == '-') + { + _c4dbgpf("it's a seq (as_child={})", start_as_child); + _move_key_anchor_to_val_anchor(); + _move_key_tag_to_val_tag(); + _push_level(); + _start_seq(start_as_child); + _save_indentation(); + _line_progressed(1); + return true; + } + else if(rem.begins_with('[')) + { + _c4dbgpf("it's a seq, flow (as_child={})", start_as_child); + _move_key_anchor_to_val_anchor(); + _move_key_tag_to_val_tag(); + _push_level(/*explicit flow*/true); + _start_seq(start_as_child); + add_flags(FLOW); + _line_progressed(1); + return true; + } + else if(rem.begins_with('{')) + { + _c4dbgpf("it's a map, flow (as_child={})", start_as_child); + _move_key_anchor_to_val_anchor(); + _move_key_tag_to_val_tag(); + _push_level(/*explicit flow*/true); + _start_map(start_as_child); + addrem_flags(FLOW|RKEY, RVAL); + _line_progressed(1); + return true; + } + else if(rem.begins_with("? ")) + { + _c4dbgpf("it's a map (as_child={}) + this key is complex", start_as_child); + _move_key_anchor_to_val_anchor(); + _move_key_tag_to_val_tag(); + _push_level(); + _start_map(start_as_child); + addrem_flags(RKEY|QMRK, RVAL); + _save_indentation(); + _line_progressed(2); + return true; + } + else if(rem.begins_with(": ") && !has_all(SSCL)) + { + _c4dbgp("it's a map with an empty key"); + _move_key_anchor_to_val_anchor(); + _move_key_tag_to_val_tag(); + _push_level(); + _start_map(start_as_child); + _store_scalar_null(rem.str); + addrem_flags(RVAL, RKEY); + _save_indentation(); + _line_progressed(2); + return true; + } + else if(rem == ':' && !has_all(SSCL)) + { + _c4dbgp("it's a map with an empty key"); + _move_key_anchor_to_val_anchor(); + _move_key_tag_to_val_tag(); + _push_level(); + _start_map(start_as_child); + _store_scalar_null(rem.str); + addrem_flags(RVAL, RKEY); + _save_indentation(); + _line_progressed(1); + return true; + } + else if(_handle_types()) + { + return true; + } + else if(!rem.begins_with('*') && _handle_key_anchors_and_refs()) + { + return true; + } + else if(has_all(SSCL)) + { + _c4dbgpf("there's a stored scalar: '{}'", m_state->scalar); + + csubstr saved_scalar; + bool is_quoted; + if(_scan_scalar_unk(&saved_scalar, &is_quoted)) + { + rem = m_state->line_contents.rem; + _c4dbgpf("... and there's also a scalar next! 
'{}'", saved_scalar); + if(rem.begins_with_any(" \t")) + { + size_t n = rem.first_not_of(" \t"); + _c4dbgpf("skipping {} spaces/tabs", n); + rem = rem.sub(n); + _line_progressed(n); + } + } + + _c4dbgpf("rem='{}'", rem); + + if(rem.begins_with(", ")) + { + _c4dbgpf("got a ',' -- it's a seq (as_child={})", start_as_child); + _start_seq(start_as_child); + add_flags(FLOW); + _append_val(_consume_scalar()); + _line_progressed(2); + } + else if(rem.begins_with(',')) + { + _c4dbgpf("got a ',' -- it's a seq (as_child={})", start_as_child); + _start_seq(start_as_child); + add_flags(FLOW); + _append_val(_consume_scalar()); + _line_progressed(1); + } + else if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))) + { + _c4dbgpf("got a ': ' -- it's a map (as_child={})", start_as_child); + _start_map_unk(start_as_child); // wait for the val scalar to append the key-val pair + _line_progressed(2); + } + else if(rem == ":" || rem.begins_with(":\"") || rem.begins_with(":'")) + { + if(rem == ":") { _c4dbgpf("got a ':' -- it's a map (as_child={})", start_as_child); } + else { _c4dbgpf("got a '{}' -- it's a map (as_child={})", rem.first(2), start_as_child); } + _start_map_unk(start_as_child); // wait for the val scalar to append the key-val pair + _line_progressed(1); // advance only 1 + } + else if(rem.begins_with('}')) + { + if(!has_all(RMAP|FLOW)) + { + _c4err("invalid token: not reading a map"); + } + if(!has_all(SSCL)) + { + _c4err("no scalar stored"); + } + _append_key_val(saved_scalar); + _stop_map(); + _line_progressed(1); + } + else if(rem.begins_with("...")) + { + _c4dbgp("got stream end '...'"); + _end_stream(); + _line_progressed(3); + } + else if(rem.begins_with('#')) + { + _c4dbgpf("it's a comment: '{}'", rem); + _scan_comment(); + return true; + } + else if(_handle_key_anchors_and_refs()) + { + return true; + } + else if(rem.begins_with(" ") || rem.begins_with("\t")) + { + size_t n = rem.first_not_of(" \t"); + if(n == npos) + n = rem.len; + _c4dbgpf("has {} spaces/tabs, skip...", n); + _line_progressed(n); + return true; + } + else if(rem.empty()) + { + // nothing to do + } + else if(rem == "---" || rem.begins_with("--- ")) + { + _c4dbgp("caught ---: starting doc"); + _start_new_doc(rem); + return true; + } + else if(rem.begins_with('%')) + { + _c4dbgp("caught a directive: ignoring..."); + _line_progressed(rem.len); + return true; + } + else + { + _c4err("parse error"); + } + + if( ! saved_scalar.empty()) + { + _store_scalar(saved_scalar, is_quoted); + } + + return true; + } + else + { + _RYML_CB_ASSERT(m_stack.m_callbacks, ! has_any(SSCL)); + csubstr scalar; + size_t indentation = m_state->line_contents.indentation; // save + bool is_quoted; + if(_scan_scalar_unk(&scalar, &is_quoted)) + { + _c4dbgpf("got a {} scalar", is_quoted ? 
"quoted" : ""); + rem = m_state->line_contents.rem; + { + size_t first = rem.first_not_of(" \t"); + if(first && first != npos) + { + _c4dbgpf("skip {} whitespace characters", first); + _line_progressed(first); + rem = rem.sub(first); + } + } + _store_scalar(scalar, is_quoted); + if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))) + { + _c4dbgpf("got a ': ' next -- it's a map (as_child={})", start_as_child); + _push_level(); + _start_map(start_as_child); // wait for the val scalar to append the key-val pair + _set_indentation(indentation); + _line_progressed(2); // call this AFTER saving the indentation + } + else if(rem == ":") + { + _c4dbgpf("got a ':' next -- it's a map (as_child={})", start_as_child); + _push_level(); + _start_map(start_as_child); // wait for the val scalar to append the key-val pair + _set_indentation(indentation); + _line_progressed(1); // call this AFTER saving the indentation + } + else + { + // we still don't know whether it's a seq or a map + // so just store the scalar + } + return true; + } + else if(rem.begins_with_any(" \t")) + { + csubstr ws = rem.left_of(rem.first_not_of(" \t")); + rem = rem.right_of(ws); + if(has_all(RTOP) && rem.begins_with("---")) + { + _c4dbgp("there's a doc starting, and it's indented"); + _set_indentation(ws.len); + } + _c4dbgpf("skipping {} spaces/tabs", ws.len); + _line_progressed(ws.len); + return true; + } + } + + return false; +} + + +//----------------------------------------------------------------------------- +C4_ALWAYS_INLINE void Parser::_skipchars(char c) +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.begins_with(c)); + size_t pos = m_state->line_contents.rem.first_not_of(c); + if(pos == npos) + pos = m_state->line_contents.rem.len; // maybe the line is just whitespace + _c4dbgpf("skip {} '{}'", pos, c); + _line_progressed(pos); +} + +template +C4_ALWAYS_INLINE void Parser::_skipchars(const char (&chars)[N]) +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.begins_with_any(chars)); + size_t pos = m_state->line_contents.rem.first_not_of(chars); + if(pos == npos) + pos = m_state->line_contents.rem.len; // maybe the line is just whitespace + _c4dbgpf("skip {} characters", pos); + _line_progressed(pos); +} + + +//----------------------------------------------------------------------------- +bool Parser::_handle_seq_flow() +{ + _c4dbgpf("handle_seq_flow: node_id={} level={}", m_state->node_id, m_state->level); + csubstr rem = m_state->line_contents.rem; + + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ|FLOW)); + + if(rem.begins_with(' ')) + { + // with explicit flow, indentation does not matter + _c4dbgp("starts with spaces"); + _skipchars(' '); + return true; + } + _RYML_WITH_TAB_TOKENS(else if(rem.begins_with('\t')) + { + _c4dbgp("starts with tabs"); + _skipchars('\t'); + return true; + }) + else if(rem.begins_with('#')) + { + _c4dbgp("it's a comment"); + rem = _scan_comment(); // also progresses the line + return true; + } + else if(rem.begins_with(']')) + { + _c4dbgp("end the sequence"); + _pop_level(); + _line_progressed(1); + if(has_all(RSEQIMAP)) + { + _stop_seqimap(); + _pop_level(); + } + return true; + } + + if(has_any(RVAL)) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT)); + bool is_quoted; + if(_scan_scalar_seq_flow(&rem, &is_quoted)) + { + _c4dbgp("it's a scalar"); + addrem_flags(RNXT, RVAL); + _append_val(rem, is_quoted); + return true; + } + else if(rem.begins_with('[')) + { + 
_c4dbgp("val is a child seq"); + addrem_flags(RNXT, RVAL); // before _push_level! + _push_level(/*explicit flow*/true); + _start_seq(); + add_flags(FLOW); + _line_progressed(1); + return true; + } + else if(rem.begins_with('{')) + { + _c4dbgp("val is a child map"); + addrem_flags(RNXT, RVAL); // before _push_level! + _push_level(/*explicit flow*/true); + _start_map(); + addrem_flags(FLOW|RKEY, RVAL); + _line_progressed(1); + return true; + } + else if(rem == ':') + { + _c4dbgpf("found ':' -- there's an implicit map in the seq node[{}]", m_state->node_id); + _start_seqimap(); + _line_progressed(1); + return true; + } + else if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))) + { + _c4dbgpf("found ': ' -- there's an implicit map in the seq node[{}]", m_state->node_id); + _start_seqimap(); + _line_progressed(2); + return true; + } + else if(rem.begins_with("? ")) + { + _c4dbgpf("found '? ' -- there's an implicit map in the seq node[{}]", m_state->node_id); + _start_seqimap(); + _line_progressed(2); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(SSCL) && m_state->scalar == ""); + addrem_flags(QMRK|RKEY, RVAL|SSCL); + return true; + } + else if(_handle_types()) + { + return true; + } + else if(_handle_val_anchors_and_refs()) + { + return true; + } + else if(rem.begins_with(", ")) + { + _c4dbgp("found ',' -- the value was null"); + _append_val_null(rem.str - 1); + _line_progressed(2); + return true; + } + else if(rem.begins_with(',')) + { + _c4dbgp("found ',' -- the value was null"); + _append_val_null(rem.str - 1); + _line_progressed(1); + return true; + } + else if(rem.begins_with('\t')) + { + _skipchars('\t'); + return true; + } + else + { + _c4err("parse error"); + } + } + else if(has_any(RNXT)) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL)); + if(rem.begins_with(", ")) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(FLOW)); + _c4dbgp("seq: expect next val"); + addrem_flags(RVAL, RNXT); + _line_progressed(2); + return true; + } + else if(rem.begins_with(',')) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(FLOW)); + _c4dbgp("seq: expect next val"); + addrem_flags(RVAL, RNXT); + _line_progressed(1); + return true; + } + else if(rem == ':') + { + _c4dbgpf("found ':' -- there's an implicit map in the seq node[{}]", m_state->node_id); + _start_seqimap(); + _line_progressed(1); + return true; + } + else if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))) + { + _c4dbgpf("found ': ' -- there's an implicit map in the seq node[{}]", m_state->node_id); + _start_seqimap(); + _line_progressed(2); + return true; + } + else + { + _c4err("was expecting a comma"); + } + } + else + { + _c4err("internal error"); + } + + return true; +} + +//----------------------------------------------------------------------------- +bool Parser::_handle_seq_blck() +{ + _c4dbgpf("handle_seq_impl: node_id={} level={}", m_state->node_id, m_state->level); + csubstr rem = m_state->line_contents.rem; + + _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(FLOW)); + + if(rem.begins_with('#')) + { + _c4dbgp("it's a comment"); + rem = _scan_comment(); + return true; + } + if(has_any(RNXT)) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL)); + + if(_handle_indentation()) + return true; + + if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t"))) + { + _c4dbgp("expect another val"); + addrem_flags(RVAL, RNXT); + _line_progressed(2); + 
return true; + } + else if(rem == '-') + { + _c4dbgp("expect another val"); + addrem_flags(RVAL, RNXT); + _line_progressed(1); + return true; + } + else if(rem.begins_with_any(" \t")) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, ! _at_line_begin()); + _skipchars(" \t"); + return true; + } + else if(rem.begins_with("...")) + { + _c4dbgp("got stream end '...'"); + _end_stream(); + _line_progressed(3); + return true; + } + else if(rem.begins_with("---")) + { + _c4dbgp("got document start '---'"); + _start_new_doc(rem); + return true; + } + else + { + _c4err("parse error"); + } + } + else if(has_any(RVAL)) + { + // there can be empty values + if(_handle_indentation()) + return true; + + csubstr s; + bool is_quoted; + if(_scan_scalar_seq_blck(&s, &is_quoted)) // this also progresses the line + { + _c4dbgpf("it's a{} scalar", is_quoted ? " quoted" : ""); + + rem = m_state->line_contents.rem; + if(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(rem.begins_with_any(" \t"), rem.begins_with(' '))) + { + _c4dbgp("skipping whitespace..."); + size_t skip = rem.first_not_of(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' ')); + if(skip == csubstr::npos) + skip = rem.len; // maybe the line is just whitespace + _line_progressed(skip); + rem = rem.sub(skip); + } + + _c4dbgpf("rem=[{}]~~~{}~~~", rem.len, rem); + if(!rem.begins_with('#') && (rem.ends_with(':') || rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))) + { + _c4dbgp("actually, the scalar is the first key of a map, and it opens a new scope"); + if(m_key_anchor.empty()) + _move_val_anchor_to_key_anchor(); + if(m_key_tag.empty()) + _move_val_tag_to_key_tag(); + addrem_flags(RNXT, RVAL); // before _push_level! This prepares the current level for popping by setting it to RNXT + _push_level(); + _start_map(); + _store_scalar(s, is_quoted); + if( ! _maybe_set_indentation_from_anchor_or_tag()) + { + _c4dbgpf("set indentation from scalar: {}", m_state->scalar_col); + _set_indentation(m_state->scalar_col); // this is the column where the scalar starts + } + _move_key_tag2_to_key_tag(); + addrem_flags(RVAL, RKEY); + _line_progressed(1); + } + else + { + _c4dbgp("appending val to current seq"); + _append_val(s, is_quoted); + addrem_flags(RNXT, RVAL); + } + return true; + } + else if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t"))) + { + if(_rval_dash_start_or_continue_seq()) + _line_progressed(2); + return true; + } + else if(rem == '-') + { + if(_rval_dash_start_or_continue_seq()) + _line_progressed(1); + return true; + } + else if(rem.begins_with('[')) + { + _c4dbgp("val is a child seq, flow"); + addrem_flags(RNXT, RVAL); // before _push_level! + _push_level(/*explicit flow*/true); + _start_seq(); + add_flags(FLOW); + _line_progressed(1); + return true; + } + else if(rem.begins_with('{')) + { + _c4dbgp("val is a child map, flow"); + addrem_flags(RNXT, RVAL); // before _push_level! + _push_level(/*explicit flow*/true); + _start_map(); + addrem_flags(FLOW|RKEY, RVAL); + _line_progressed(1); + return true; + } + else if(rem.begins_with("? ")) + { + _c4dbgp("val is a child map + this key is complex"); + addrem_flags(RNXT, RVAL); // before _push_level! 
+ _push_level(); + _start_map(); + addrem_flags(QMRK|RKEY, RVAL); + _save_indentation(); + _line_progressed(2); + return true; + } + else if(rem.begins_with(' ')) + { + csubstr spc = rem.left_of(rem.first_not_of(' ')); + if(_at_line_begin()) + { + _c4dbgpf("skipping value indentation: {} spaces", spc.len); + _line_progressed(spc.len); + return true; + } + else + { + _c4dbgpf("skipping {} spaces", spc.len); + _line_progressed(spc.len); + return true; + } + } + else if(_handle_types()) + { + return true; + } + else if(_handle_val_anchors_and_refs()) + { + return true; + } + /* pathological case: + * - &key : val + * - &key : + * - : val + */ + else if((!has_all(SSCL)) && + (rem.begins_with(": ") || rem.left_of(rem.find("#")).trimr("\t") == ":")) + { + if(!m_val_anchor.empty() || !m_val_tag.empty()) + { + _c4dbgp("val is a child map + this key is empty, with anchors or tags"); + addrem_flags(RNXT, RVAL); // before _push_level! + _move_val_tag_to_key_tag(); + _move_val_anchor_to_key_anchor(); + _push_level(); + _start_map(); + _store_scalar_null(rem.str); + addrem_flags(RVAL, RKEY); + RYML_CHECK(_maybe_set_indentation_from_anchor_or_tag()); // one of them must exist + _line_progressed(rem.begins_with(": ") ? 2u : 1u); + return true; + } + else + { + _c4dbgp("val is a child map + this key is empty, no anchors or tags"); + addrem_flags(RNXT, RVAL); // before _push_level! + size_t ind = m_state->indref; + _push_level(); + _start_map(); + _store_scalar_null(rem.str); + addrem_flags(RVAL, RKEY); + _c4dbgpf("set indentation from map anchor: {}", ind + 2); + _set_indentation(ind + 2); // this is the column where the map starts + _line_progressed(rem.begins_with(": ") ? 2u : 1u); + return true; + } + } + else + { + _c4err("parse error"); + } + } + + return false; +} + +//----------------------------------------------------------------------------- + +bool Parser::_rval_dash_start_or_continue_seq() +{ + size_t ind = m_state->line_contents.current_col(); + _RYML_CB_ASSERT(m_stack.m_callbacks, ind >= m_state->indref); + size_t delta_ind = ind - m_state->indref; + if( ! delta_ind) + { + _c4dbgp("prev val was empty"); + addrem_flags(RNXT, RVAL); + _append_val_null(&m_state->line_contents.full[ind]); + return false; + } + _c4dbgp("val is a nested seq, indented"); + addrem_flags(RNXT, RVAL); // before _push_level! 
+ _push_level(); + _start_seq(); + _save_indentation(); + return true; +} + +//----------------------------------------------------------------------------- +bool Parser::_handle_map_flow() +{ + // explicit flow, ie, inside {}, separated by commas + _c4dbgpf("handle_map_flow: node_id={} level={}", m_state->node_id, m_state->level); + csubstr rem = m_state->line_contents.rem; + + _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RMAP|FLOW)); + + if(rem.begins_with(' ')) + { + // with explicit flow, indentation does not matter + _c4dbgp("starts with spaces"); + _skipchars(' '); + return true; + } + _RYML_WITH_TAB_TOKENS(else if(rem.begins_with('\t')) + { + // with explicit flow, indentation does not matter + _c4dbgp("starts with tabs"); + _skipchars('\t'); + return true; + }) + else if(rem.begins_with('#')) + { + _c4dbgp("it's a comment"); + rem = _scan_comment(); // also progresses the line + return true; + } + else if(rem.begins_with('}')) + { + _c4dbgp("end the map"); + if(has_all(SSCL)) + { + _c4dbgp("the last val was null"); + _append_key_val_null(rem.str - 1); + rem_flags(RVAL); + } + _pop_level(); + _line_progressed(1); + if(has_all(RSEQIMAP)) + { + _c4dbgp("stopping implicitly nested 1x map"); + _stop_seqimap(); + _pop_level(); + } + return true; + } + + if(has_any(RNXT)) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RSEQIMAP)); + + if(rem.begins_with(", ")) + { + _c4dbgp("seq: expect next keyval"); + addrem_flags(RKEY, RNXT); + _line_progressed(2); + return true; + } + else if(rem.begins_with(',')) + { + _c4dbgp("seq: expect next keyval"); + addrem_flags(RKEY, RNXT); + _line_progressed(1); + return true; + } + else + { + _c4err("parse error"); + } + } + else if(has_any(RKEY)) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL)); + + bool is_quoted; + if(has_none(SSCL) && _scan_scalar_map_flow(&rem, &is_quoted)) + { + _c4dbgp("it's a scalar"); + _store_scalar(rem, is_quoted); + rem = m_state->line_contents.rem; + csubstr trimmed = rem.triml(" \t"); + if(trimmed.len && (trimmed.begins_with(": ") || trimmed.begins_with_any(":,}") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, trimmed.str >= rem.str); + size_t num = static_cast(trimmed.str - rem.str); + _c4dbgpf("trimming {} whitespace after the scalar: '{}' --> '{}'", num, rem, rem.sub(num)); + rem = rem.sub(num); + _line_progressed(num); + } + } + + if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))) + { + _c4dbgp("wait for val"); + addrem_flags(RVAL, RKEY|QMRK); + _line_progressed(2); + if(!has_all(SSCL)) + { + _c4dbgp("no key was found, defaulting to empty key ''"); + _store_scalar_null(rem.str); + } + return true; + } + else if(rem == ':') + { + _c4dbgp("wait for val"); + addrem_flags(RVAL, RKEY|QMRK); + _line_progressed(1); + if(!has_all(SSCL)) + { + _c4dbgp("no key was found, defaulting to empty key ''"); + _store_scalar_null(rem.str); + } + return true; + } + else if(rem.begins_with('?')) + { + _c4dbgp("complex key"); + add_flags(QMRK); + _line_progressed(1); + return true; + } + else if(rem.begins_with(',')) + { + _c4dbgp("prev scalar was a key with null value"); + _append_key_val_null(rem.str - 1); + _line_progressed(1); + return true; + } + else if(rem.begins_with('}')) + { + _c4dbgp("map terminates after a key..."); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(SSCL)); + 
_c4dbgp("the last val was null"); + _append_key_val_null(rem.str - 1); + rem_flags(RVAL); + if(has_all(RSEQIMAP)) + { + _c4dbgp("stopping implicitly nested 1x map"); + _stop_seqimap(); + _pop_level(); + } + _pop_level(); + _line_progressed(1); + return true; + } + else if(_handle_types()) + { + return true; + } + else if(_handle_key_anchors_and_refs()) + { + return true; + } + else if(rem == "") + { + return true; + } + else + { + size_t pos = rem.first_not_of(" \t"); + if(pos == csubstr::npos) + pos = 0; + rem = rem.sub(pos); + if(rem.begins_with(':')) + { + _c4dbgp("wait for val"); + addrem_flags(RVAL, RKEY|QMRK); + _line_progressed(pos + 1); + if(!has_all(SSCL)) + { + _c4dbgp("no key was found, defaulting to empty key ''"); + _store_scalar_null(rem.str); + } + return true; + } + else if(rem.begins_with('#')) + { + _c4dbgp("it's a comment"); + _line_progressed(pos); + rem = _scan_comment(); // also progresses the line + return true; + } + else + { + _c4err("parse error"); + } + } + } + else if(has_any(RVAL)) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(SSCL)); + bool is_quoted; + if(_scan_scalar_map_flow(&rem, &is_quoted)) + { + _c4dbgp("it's a scalar"); + addrem_flags(RNXT, RVAL|RKEY); + _append_key_val(rem, is_quoted); + if(has_all(RSEQIMAP)) + { + _c4dbgp("stopping implicitly nested 1x map"); + _stop_seqimap(); + _pop_level(); + } + return true; + } + else if(rem.begins_with('[')) + { + _c4dbgp("val is a child seq"); + addrem_flags(RNXT, RVAL|RKEY); // before _push_level! + _push_level(/*explicit flow*/true); + _move_scalar_from_top(); + _start_seq(); + add_flags(FLOW); + _line_progressed(1); + return true; + } + else if(rem.begins_with('{')) + { + _c4dbgp("val is a child map"); + addrem_flags(RNXT, RVAL|RKEY); // before _push_level! + _push_level(/*explicit flow*/true); + _move_scalar_from_top(); + _start_map(); + addrem_flags(FLOW|RKEY, RNXT|RVAL); + _line_progressed(1); + return true; + } + else if(_handle_types()) + { + return true; + } + else if(_handle_val_anchors_and_refs()) + { + return true; + } + else if(rem.begins_with(',')) + { + _c4dbgp("appending empty val"); + _append_key_val_null(rem.str - 1); + addrem_flags(RKEY, RVAL); + _line_progressed(1); + if(has_any(RSEQIMAP)) + { + _c4dbgp("stopping implicitly nested 1x map"); + _stop_seqimap(); + _pop_level(); + } + return true; + } + else if(has_any(RSEQIMAP) && rem.begins_with(']')) + { + _c4dbgp("stopping implicitly nested 1x map"); + if(has_any(SSCL)) + { + _append_key_val_null(rem.str - 1); + } + _stop_seqimap(); + _pop_level(); + return true; + } + else + { + _c4err("parse error"); + } + } + else + { + _c4err("internal error"); + } + + return false; +} + +//----------------------------------------------------------------------------- +bool Parser::_handle_map_blck() +{ + _c4dbgpf("handle_map_blck: node_id={} level={}", m_state->node_id, m_state->level); + csubstr rem = m_state->line_contents.rem; + + _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RMAP)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(FLOW)); + + if(rem.begins_with('#')) + { + _c4dbgp("it's a comment"); + rem = _scan_comment(); + return true; + } + + if(has_any(RNXT)) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL)); + // actually, we don't need RNXT in indent-based maps. 
+ addrem_flags(RKEY, RNXT); + } + + if(_handle_indentation()) + { + _c4dbgp("indentation token"); + return true; + } + + if(has_any(RKEY)) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL)); + + _c4dbgp("RMAP|RKEY read scalar?"); + bool is_quoted; + if(_scan_scalar_map_blck(&rem, &is_quoted)) // this also progresses the line + { + _c4dbgpf("it's a{} scalar", is_quoted ? " quoted" : ""); + if(has_all(QMRK|SSCL)) + { + _c4dbgpf("current key is QMRK; SSCL is set. so take store scalar='{}' as key and add an empty val", m_state->scalar); + _append_key_val_null(rem.str - 1); + } + _store_scalar(rem, is_quoted); + if(has_all(QMRK|RSET)) + { + _c4dbgp("it's a complex key, so use null value '~'"); + _append_key_val_null(rem.str); + } + rem = m_state->line_contents.rem; + + if(rem.begins_with(':')) + { + _c4dbgp("wait for val"); + addrem_flags(RVAL, RKEY|QMRK); + _line_progressed(1); + rem = m_state->line_contents.rem; + if(rem.begins_with_any(" \t")) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, ! _at_line_begin()); + rem = rem.left_of(rem.first_not_of(" \t")); + _c4dbgpf("skip {} spaces/tabs", rem.len); + _line_progressed(rem.len); + } + } + return true; + } + else if(rem.begins_with_any(" \t")) + { + size_t pos = rem.first_not_of(" \t"); + if(pos == npos) + pos = rem.len; + _c4dbgpf("skip {} spaces/tabs", pos); + _line_progressed(pos); + return true; + } + else if(rem == '?' || rem.begins_with("? ")) + { + _c4dbgp("it's a complex key"); + _line_progressed(rem.begins_with("? ") ? 2u : 1u); + if(has_any(SSCL)) + _append_key_val_null(rem.str - 1); + add_flags(QMRK); + return true; + } + else if(has_all(QMRK) && rem.begins_with(':')) + { + _c4dbgp("complex key finished"); + if(!has_any(SSCL)) + _store_scalar_null(rem.str); + addrem_flags(RVAL, RKEY|QMRK); + _line_progressed(1); + rem = m_state->line_contents.rem; + if(rem.begins_with(' ')) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, ! _at_line_begin()); + _skipchars(' '); + } + return true; + } + else if(rem == ':' || rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))) + { + _c4dbgp("key finished"); + if(!has_all(SSCL)) + { + _c4dbgp("key was empty..."); + _store_scalar_null(rem.str); + rem_flags(QMRK); + } + addrem_flags(RVAL, RKEY); + _line_progressed(rem == ':' ? 1 : 2); + return true; + } + else if(rem.begins_with("...")) + { + _c4dbgp("end current document"); + _end_stream(); + _line_progressed(3); + return true; + } + else if(rem.begins_with("---")) + { + _c4dbgp("start new document '---'"); + _start_new_doc(rem); + return true; + } + else if(_handle_types()) + { + return true; + } + else if(_handle_key_anchors_and_refs()) + { + return true; + } + else + { + _c4err("parse error"); + } + } + else if(has_any(RVAL)) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY)); + + _c4dbgp("RMAP|RVAL read scalar?"); + csubstr s; + bool is_quoted; + if(_scan_scalar_map_blck(&s, &is_quoted)) // this also progresses the line + { + _c4dbgpf("it's a{} scalar", is_quoted ? " quoted" : ""); + + rem = m_state->line_contents.rem; + + if(rem.begins_with(": ")) + { + _c4dbgp("actually, the scalar is the first key of a map"); + addrem_flags(RKEY, RVAL); // before _push_level! 
This prepares the current level for popping by setting it to RNXT + _push_level(); + _move_scalar_from_top(); + _move_val_anchor_to_key_anchor(); + _start_map(); + _save_indentation(m_state->scalar_col); + addrem_flags(RVAL, RKEY); + _line_progressed(2); + } + else if(rem.begins_with(':')) + { + _c4dbgp("actually, the scalar is the first key of a map, and it opens a new scope"); + addrem_flags(RKEY, RVAL); // before _push_level! This prepares the current level for popping by setting it to RNXT + _push_level(); + _move_scalar_from_top(); + _move_val_anchor_to_key_anchor(); + _start_map(); + _save_indentation(/*behind*/s.len); + addrem_flags(RVAL, RKEY); + _line_progressed(1); + } + else + { + _c4dbgp("appending keyval to current map"); + _append_key_val(s, is_quoted); + addrem_flags(RKEY, RVAL); + } + return true; + } + else if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t"))) + { + _c4dbgp("val is a nested seq, indented"); + addrem_flags(RKEY, RVAL); // before _push_level! + _push_level(); + _move_scalar_from_top(); + _start_seq(); + _save_indentation(); + _line_progressed(2); + return true; + } + else if(rem == '-') + { + _c4dbgp("maybe a seq. start unknown, indented"); + _start_unk(); + _save_indentation(); + _line_progressed(1); + return true; + } + else if(rem.begins_with('[')) + { + _c4dbgp("val is a child seq, flow"); + addrem_flags(RKEY, RVAL); // before _push_level! + _push_level(/*explicit flow*/true); + _move_scalar_from_top(); + _start_seq(); + add_flags(FLOW); + _line_progressed(1); + return true; + } + else if(rem.begins_with('{')) + { + _c4dbgp("val is a child map, flow"); + addrem_flags(RKEY, RVAL); // before _push_level! + _push_level(/*explicit flow*/true); + _move_scalar_from_top(); + _start_map(); + addrem_flags(FLOW|RKEY, RVAL); + _line_progressed(1); + return true; + } + else if(rem.begins_with(' ')) + { + csubstr spc = rem.left_of(rem.first_not_of(' ')); + if(_at_line_begin()) + { + _c4dbgpf("skipping value indentation: {} spaces", spc.len); + _line_progressed(spc.len); + return true; + } + else + { + _c4dbgpf("skipping {} spaces", spc.len); + _line_progressed(spc.len); + return true; + } + } + else if(_handle_types()) + { + return true; + } + else if(_handle_val_anchors_and_refs()) + { + return true; + } + else if(rem.begins_with("--- ") || rem == "---" || rem.begins_with("---\t")) + { + _start_new_doc(rem); + return true; + } + else if(rem.begins_with("...")) + { + _c4dbgp("end current document"); + _end_stream(); + _line_progressed(3); + return true; + } + else + { + _c4err("parse error"); + } + } + else + { + _c4err("internal error"); + } + + return false; +} + + +//----------------------------------------------------------------------------- +bool Parser::_handle_top() +{ + _c4dbgp("handle_top"); + csubstr rem = m_state->line_contents.rem; + + if(rem.begins_with('#')) + { + _c4dbgp("a comment line"); + _scan_comment(); + return true; + } + + csubstr trimmed = rem.triml(' '); + + if(trimmed.begins_with('%')) + { + _handle_directive(trimmed); + _line_progressed(rem.len); + return true; + } + else if(trimmed.begins_with("--- ") || trimmed == "---" || trimmed.begins_with("---\t")) + { + _start_new_doc(rem); + if(trimmed.len < rem.len) + { + _line_progressed(rem.len - trimmed.len); + _save_indentation(); + } + return true; + } + else if(trimmed.begins_with("...")) + { + _c4dbgp("end current document"); + _end_stream(); + if(trimmed.len < rem.len) + { + _line_progressed(rem.len - trimmed.len); + } + _line_progressed(3); + return true; + } + else 
+ { + _c4err("parse error"); + } + + return false; +} + + +//----------------------------------------------------------------------------- + +bool Parser::_handle_key_anchors_and_refs() +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, !has_any(RVAL)); + const csubstr rem = m_state->line_contents.rem; + if(rem.begins_with('&')) + { + _c4dbgp("found a key anchor!!!"); + if(has_all(QMRK|SSCL)) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RKEY)); + _c4dbgp("there is a stored key, so this anchor is for the next element"); + _append_key_val_null(rem.str - 1); + rem_flags(QMRK); + return true; + } + csubstr anchor = rem.left_of(rem.first_of(' ')); + _line_progressed(anchor.len); + anchor = anchor.sub(1); // skip the first character + _move_key_anchor_to_val_anchor(); + _c4dbgpf("key anchor value: '{}'", anchor); + m_key_anchor = anchor; + m_key_anchor_indentation = m_state->line_contents.current_col(rem); + return true; + } + else if(C4_UNLIKELY(rem.begins_with('*'))) + { + _c4err("not implemented - this should have been catched elsewhere"); + C4_NEVER_REACH(); + return false; + } + return false; +} + +bool Parser::_handle_val_anchors_and_refs() +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, !has_any(RKEY)); + const csubstr rem = m_state->line_contents.rem; + if(rem.begins_with('&')) + { + csubstr anchor = rem.left_of(rem.first_of(' ')); + _line_progressed(anchor.len); + anchor = anchor.sub(1); // skip the first character + _c4dbgpf("val: found an anchor: '{}', indentation={}!!!", anchor, m_state->line_contents.current_col(rem)); + if(m_val_anchor.empty()) + { + _c4dbgpf("save val anchor: '{}'", anchor); + m_val_anchor = anchor; + m_val_anchor_indentation = m_state->line_contents.current_col(rem); + } + else + { + _c4dbgpf("there is a pending val anchor '{}'", m_val_anchor); + if(m_tree->is_seq(m_state->node_id)) + { + if(m_tree->has_children(m_state->node_id)) + { + _c4dbgpf("current node={} is a seq, has {} children", m_state->node_id, m_tree->num_children(m_state->node_id)); + _c4dbgpf("... so take the new one as a key anchor '{}'", anchor); + m_key_anchor = anchor; + m_key_anchor_indentation = m_state->line_contents.current_col(rem); + } + else + { + _c4dbgpf("current node={} is a seq, has no children", m_state->node_id); + if(m_tree->has_val_anchor(m_state->node_id)) + { + _c4dbgpf("... node={} already has val anchor: '{}'", m_state->node_id, m_tree->val_anchor(m_state->node_id)); + _c4dbgpf("... so take the new one as a key anchor '{}'", anchor); + m_key_anchor = anchor; + m_key_anchor_indentation = m_state->line_contents.current_col(rem); + } + else + { + _c4dbgpf("... 
so set pending val anchor: '{}' on current node {}", m_val_anchor, m_state->node_id); + m_tree->set_val_anchor(m_state->node_id, m_val_anchor); + m_val_anchor = anchor; + m_val_anchor_indentation = m_state->line_contents.current_col(rem); + } + } + } + } + return true; + } + else if(C4_UNLIKELY(rem.begins_with('*'))) + { + _c4err("not implemented - this should have been catched elsewhere"); + C4_NEVER_REACH(); + return false; + } + return false; +} + +void Parser::_move_key_anchor_to_val_anchor() +{ + if(m_key_anchor.empty()) + return; + _c4dbgpf("move current key anchor to val slot: key='{}' -> val='{}'", m_key_anchor, m_val_anchor); + if(!m_val_anchor.empty()) + _c4err("triple-pending anchor"); + m_val_anchor = m_key_anchor; + m_val_anchor_indentation = m_key_anchor_indentation; + m_key_anchor = {}; + m_key_anchor_indentation = {}; +} + +void Parser::_move_val_anchor_to_key_anchor() +{ + if(m_val_anchor.empty()) + return; + if(!_token_is_from_this_line(m_val_anchor)) + return; + _c4dbgpf("move current val anchor to key slot: key='{}' <- val='{}'", m_key_anchor, m_val_anchor); + if(!m_key_anchor.empty()) + _c4err("triple-pending anchor"); + m_key_anchor = m_val_anchor; + m_key_anchor_indentation = m_val_anchor_indentation; + m_val_anchor = {}; + m_val_anchor_indentation = {}; +} + +void Parser::_move_key_tag_to_val_tag() +{ + if(m_key_tag.empty()) + return; + _c4dbgpf("move key tag to val tag: key='{}' -> val='{}'", m_key_tag, m_val_tag); + m_val_tag = m_key_tag; + m_val_tag_indentation = m_key_tag_indentation; + m_key_tag.clear(); + m_key_tag_indentation = 0; +} + +void Parser::_move_val_tag_to_key_tag() +{ + if(m_val_tag.empty()) + return; + if(!_token_is_from_this_line(m_val_tag)) + return; + _c4dbgpf("move val tag to key tag: key='{}' <- val='{}'", m_key_tag, m_val_tag); + m_key_tag = m_val_tag; + m_key_tag_indentation = m_val_tag_indentation; + m_val_tag.clear(); + m_val_tag_indentation = 0; +} + +void Parser::_move_key_tag2_to_key_tag() +{ + if(m_key_tag2.empty()) + return; + _c4dbgpf("move key tag2 to key tag: key='{}' <- key2='{}'", m_key_tag, m_key_tag2); + m_key_tag = m_key_tag2; + m_key_tag_indentation = m_key_tag2_indentation; + m_key_tag2.clear(); + m_key_tag2_indentation = 0; +} + + +//----------------------------------------------------------------------------- + +bool Parser::_handle_types() +{ + csubstr rem = m_state->line_contents.rem.triml(' '); + csubstr t; + + if(rem.begins_with("!!")) + { + _c4dbgp("begins with '!!'"); + t = rem.left_of(rem.first_of(" ,")); + _RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 2); + //t = t.sub(2); + if(t == "!!set") + add_flags(RSET); + } + else if(rem.begins_with("!<")) + { + _c4dbgp("begins with '!<'"); + t = rem.left_of(rem.first_of('>'), true); + _RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 2); + //t = t.sub(2, t.len-1); + } + else if(rem.begins_with("!h!")) + { + _c4dbgp("begins with '!h!'"); + t = rem.left_of(rem.first_of(' ')); + _RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 3); + //t = t.sub(3); + } + else if(rem.begins_with('!')) + { + _c4dbgp("begins with '!'"); + t = rem.left_of(rem.first_of(' ')); + _RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 1); + //t = t.sub(1); + } + + if(t.empty()) + return false; + + if(has_all(QMRK|SSCL)) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RKEY)); + _c4dbgp("there is a stored key, so this tag is for the next element"); + _append_key_val_null(rem.str - 1); + rem_flags(QMRK); + } + + #ifdef RYML_NO_COVERAGE__TO_BE_DELETED + const char *tag_beginning = rem.str; + #endif + size_t 
tag_indentation = m_state->line_contents.current_col(t); + _c4dbgpf("there was a tag: '{}', indentation={}", t, tag_indentation); + _RYML_CB_ASSERT(m_stack.m_callbacks, t.end() > m_state->line_contents.rem.begin()); + _line_progressed(static_cast(t.end() - m_state->line_contents.rem.begin())); + { + size_t pos = m_state->line_contents.rem.first_not_of(" \t"); + if(pos != csubstr::npos) + _line_progressed(pos); + } + + if(has_all(RMAP|RKEY)) + { + _c4dbgpf("saving map key tag '{}'", t); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_key_tag.empty()); + m_key_tag = t; + m_key_tag_indentation = tag_indentation; + } + else if(has_all(RMAP|RVAL)) + { + /* foo: !!str + * !!str : bar */ + rem = m_state->line_contents.rem; + rem = rem.left_of(rem.find("#")); + rem = rem.trimr(" \t"); + _c4dbgpf("rem='{}'", rem); + #ifdef RYML_NO_COVERAGE__TO_BE_DELETED + if(rem == ':' || rem.begins_with(": ")) + { + _c4dbgp("the last val was null, and this is a tag from a null key"); + _append_key_val_null(tag_beginning - 1); + _store_scalar_null(rem.str - 1); + // do not change the flag to key, it is ~ + _RYML_CB_ASSERT(m_stack.m_callbacks, rem.begin() > m_state->line_contents.rem.begin()); + size_t token_len = rem == ':' ? 1 : 2; + _line_progressed(static_cast(token_len + rem.begin() - m_state->line_contents.rem.begin())); + } + #endif + _c4dbgpf("saving map val tag '{}'", t); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_val_tag.empty()); + m_val_tag = t; + m_val_tag_indentation = tag_indentation; + } + else if(has_all(RSEQ|RVAL) || has_all(RTOP|RUNK|NDOC)) + { + if(m_val_tag.empty()) + { + _c4dbgpf("saving seq/doc val tag '{}'", t); + m_val_tag = t; + m_val_tag_indentation = tag_indentation; + } + else + { + _c4dbgpf("saving seq/doc key tag '{}'", t); + m_key_tag = t; + m_key_tag_indentation = tag_indentation; + } + } + else if(has_all(RTOP|RUNK) || has_any(RUNK)) + { + rem = m_state->line_contents.rem; + rem = rem.left_of(rem.find("#")); + rem = rem.trimr(" \t"); + if(rem.empty()) + { + _c4dbgpf("saving val tag '{}'", t); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_val_tag.empty()); + m_val_tag = t; + m_val_tag_indentation = tag_indentation; + } + else + { + _c4dbgpf("saving key tag '{}'", t); + if(m_key_tag.empty()) + { + m_key_tag = t; + m_key_tag_indentation = tag_indentation; + } + else + { + /* handle this case: + * !!str foo: !!map + * !!int 1: !!float 20.0 + * !!int 3: !!float 40.0 + * + * (m_key_tag would be !!str and m_key_tag2 would be !!int) + */ + m_key_tag2 = t; + m_key_tag2_indentation = tag_indentation; + } + } + } + else + { + _c4err("internal error"); + } + + if(m_val_tag.not_empty()) + { + YamlTag_e tag = to_tag(t); + if(tag == TAG_STR) + { + _c4dbgpf("tag '{}' is a str-type tag", t); + if(has_all(RTOP|RUNK|NDOC)) + { + _c4dbgpf("docval. slurping the string. pos={}", m_state->pos.offset); + csubstr scalar = _slurp_doc_scalar(); + _c4dbgpf("docval. after slurp: {}, at node {}: '{}'", m_state->pos.offset, m_state->node_id, scalar); + m_tree->to_val(m_state->node_id, scalar, DOC); + _c4dbgpf("docval. 
val tag {} -> {}", m_val_tag, normalize_tag(m_val_tag)); + m_tree->set_val_tag(m_state->node_id, normalize_tag(m_val_tag)); + m_val_tag.clear(); + if(!m_val_anchor.empty()) + { + _c4dbgpf("setting val anchor[{}]='{}'", m_state->node_id, m_val_anchor); + m_tree->set_val_anchor(m_state->node_id, m_val_anchor); + m_val_anchor.clear(); + } + _end_stream(); + } + } + } + return true; +} + +//----------------------------------------------------------------------------- +csubstr Parser::_slurp_doc_scalar() +{ + csubstr s = m_state->line_contents.rem; + size_t pos = m_state->pos.offset; + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.full.find("---") != csubstr::npos); + _c4dbgpf("slurp 0 '{}'. REM='{}'", s, m_buf.sub(m_state->pos.offset)); + if(s.len == 0) + { + _line_ended(); + _scan_line(); + s = m_state->line_contents.rem; + pos = m_state->pos.offset; + } + + size_t skipws = s.first_not_of(" \t"); + _c4dbgpf("slurp 1 '{}'. REM='{}'", s, m_buf.sub(m_state->pos.offset)); + if(skipws != npos) + { + _line_progressed(skipws); + s = m_state->line_contents.rem; + pos = m_state->pos.offset; + _c4dbgpf("slurp 2 '{}'. REM='{}'", s, m_buf.sub(m_state->pos.offset)); + } + + _RYML_CB_ASSERT(m_stack.m_callbacks, m_val_anchor.empty()); + _handle_val_anchors_and_refs(); + if(!m_val_anchor.empty()) + { + s = m_state->line_contents.rem; + skipws = s.first_not_of(" \t"); + if(skipws != npos) + { + _line_progressed(skipws); + } + s = m_state->line_contents.rem; + pos = m_state->pos.offset; + _c4dbgpf("slurp 3 '{}'. REM='{}'", s, m_buf.sub(m_state->pos.offset)); + } + + if(s.begins_with('\'')) + { + m_state->scalar_col = m_state->line_contents.current_col(s); + return _scan_squot_scalar(); + } + else if(s.begins_with('"')) + { + m_state->scalar_col = m_state->line_contents.current_col(s); + return _scan_dquot_scalar(); + } + else if(s.begins_with('|') || s.begins_with('>')) + { + return _scan_block(); + } + + _c4dbgpf("slurp 4 '{}'. REM='{}'", s, m_buf.sub(m_state->pos.offset)); + + m_state->scalar_col = m_state->line_contents.current_col(s); + _RYML_CB_ASSERT(m_stack.m_callbacks, s.end() >= m_buf.begin() + pos); + _line_progressed(static_cast(s.end() - (m_buf.begin() + pos))); + + _c4dbgpf("slurp 5 '{}'. REM='{}'", s, m_buf.sub(m_state->pos.offset)); + + if(_at_line_end()) + { + _c4dbgpf("at line end. curr='{}'", s); + s = _extend_scanned_scalar(s); + } + + _c4dbgpf("scalar was '{}'", s); + + return s; +} + + +//----------------------------------------------------------------------------- + +bool Parser::_scan_scalar_seq_blck(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted) +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RSEQ)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RVAL)); + _RYML_CB_ASSERT(m_stack.m_callbacks, ! has_any(RKEY)); + _RYML_CB_ASSERT(m_stack.m_callbacks, ! 
has_any(FLOW)); + + csubstr s = m_state->line_contents.rem; + if(s.len == 0) + return false; + s = s.trim(" \t"); + if(s.len == 0) + return false; + + if(s.begins_with('\'')) + { + _c4dbgp("got a ': scanning single-quoted scalar"); + m_state->scalar_col = m_state->line_contents.current_col(s); + *scalar = _scan_squot_scalar(); + *quoted = true; + return true; + } + else if(s.begins_with('"')) + { + _c4dbgp("got a \": scanning double-quoted scalar"); + m_state->scalar_col = m_state->line_contents.current_col(s); + *scalar = _scan_dquot_scalar(); + *quoted = true; + return true; + } + else if(s.begins_with('|') || s.begins_with('>')) + { + *scalar = _scan_block(); + *quoted = true; + return true; + } + else if(has_any(RTOP) && _is_doc_sep(s)) + { + return false; + } + + _c4dbgp("RSEQ|RVAL"); + if( ! _is_scalar_next__rseq_rval(s)) + return false; + _RYML_WITH_TAB_TOKENS(else if(s.begins_with("-\t")) + return false; + ) + + if(s.ends_with(':')) + { + --s.len; + } + else + { + auto first = s.first_of_any(": " _RYML_WITH_TAB_TOKENS( , ":\t"), " #"); + if(first) + s.len = first.pos; + } + s = s.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' ')); + + if(s.empty()) + return false; + + m_state->scalar_col = m_state->line_contents.current_col(s); + _RYML_CB_ASSERT(m_stack.m_callbacks, s.str >= m_state->line_contents.rem.str); + _line_progressed(static_cast(s.str - m_state->line_contents.rem.str) + s.len); + + if(_at_line_end() && s != '~') + { + _c4dbgpf("at line end. curr='{}'", s); + s = _extend_scanned_scalar(s); + } + + _c4dbgpf("scalar was '{}'", s); + + *scalar = s; + *quoted = false; + return true; +} + +bool Parser::_scan_scalar_map_blck(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted) +{ + _c4dbgp("_scan_scalar_map_blck"); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RMAP)); + _RYML_CB_ASSERT(m_stack.m_callbacks, ! has_any(FLOW)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RKEY|RVAL)); + + csubstr s = m_state->line_contents.rem; + #ifdef RYML_NO_COVERAGE__TO_BE_DELETED__OR_REFACTORED + if(s.len == 0) + return false; + #endif + s = s.trim(" \t"); + if(s.len == 0) + return false; + + if(s.begins_with('\'')) + { + _c4dbgp("got a ': scanning single-quoted scalar"); + m_state->scalar_col = m_state->line_contents.current_col(s); + *scalar = _scan_squot_scalar(); + *quoted = true; + return true; + } + else if(s.begins_with('"')) + { + _c4dbgp("got a \": scanning double-quoted scalar"); + m_state->scalar_col = m_state->line_contents.current_col(s); + *scalar = _scan_dquot_scalar(); + *quoted = true; + return true; + } + else if(s.begins_with('|') || s.begins_with('>')) + { + *scalar = _scan_block(); + *quoted = true; + return true; + } + else if(has_any(RTOP) && _is_doc_sep(s)) + { + return false; + } + + if( ! _is_scalar_next__rmap(s)) + return false; + + size_t colon_token = s.find(": "); + if(colon_token == npos) + { + _RYML_WITH_OR_WITHOUT_TAB_TOKENS( + // with tab tokens + colon_token = s.find(":\t"); + if(colon_token == npos) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, s.len > 0); + colon_token = s.find(':'); + if(colon_token != s.len-1) + colon_token = npos; + } + , + // without tab tokens + colon_token = s.find(':'); + _RYML_CB_ASSERT(m_stack.m_callbacks, s.len > 0); + if(colon_token != s.len-1) + colon_token = npos; + ) + } + + if(has_all(RKEY)) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, !s.begins_with(' ')); + if(has_any(QMRK)) + { + _c4dbgp("RMAP|RKEY|CPLX"); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RMAP)); + if(s.begins_with("? 
") || s == '?') + return false; + s = s.left_of(colon_token); + s = s.left_of(s.first_of("#")); + s = s.trimr(" \t"); + if(s.begins_with("---")) + return false; + else if(s.begins_with("...")) + return false; + } + else + { + _c4dbgp("RMAP|RKEY"); + _RYML_CB_CHECK(m_stack.m_callbacks, !s.begins_with('{')); + if(s.begins_with("? ") || s == '?') + return false; + s = s.left_of(colon_token); + s = s.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' ')); + if(s.begins_with("---")) + { + return false; + } + else if(s.begins_with("...")) + { + return false; + } + } + } + else if(has_all(RVAL)) + { + _c4dbgp("RMAP|RVAL"); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(QMRK)); + if( ! _is_scalar_next__rmap_val(s)) + return false; + _RYML_WITH_TAB_TOKENS( + else if(s.begins_with("-\t")) + return false; + ) + _c4dbgp("RMAP|RVAL: scalar"); + s = s.left_of(s.find(" #")); // is there a comment? + s = s.left_of(s.find("\t#")); // is there a comment? + s = s.trim(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' ')); + if(s.begins_with("---")) + return false; + #ifdef RYML_NO_COVERAGE__TO_BE_DELETED__OR_REFACTORED + else if(s.begins_with("...")) + return false; + #endif + } + + if(s.empty()) + return false; + + m_state->scalar_col = m_state->line_contents.current_col(s); + _RYML_CB_ASSERT(m_stack.m_callbacks, s.str >= m_state->line_contents.rem.str); + _line_progressed(static_cast(s.str - m_state->line_contents.rem.str) + s.len); + + if(_at_line_end() && s != '~') + { + _c4dbgpf("at line end. curr='{}'", s); + s = _extend_scanned_scalar(s); + } + + _c4dbgpf("scalar was '{}'", s); + + *scalar = s; + *quoted = false; + return true; +} + +bool Parser::_scan_scalar_seq_flow(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted) +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RSEQ)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(FLOW)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RVAL)); + _RYML_CB_ASSERT(m_stack.m_callbacks, ! has_any(RKEY)); + + csubstr s = m_state->line_contents.rem; + if(s.len == 0) + return false; + s = s.trim(" \t"); + if(s.len == 0) + return false; + + if(s.begins_with('\'')) + { + _c4dbgp("got a ': scanning single-quoted scalar"); + m_state->scalar_col = m_state->line_contents.current_col(s); + *scalar = _scan_squot_scalar(); + *quoted = true; + return true; + } + else if(s.begins_with('"')) + { + _c4dbgp("got a \": scanning double-quoted scalar"); + m_state->scalar_col = m_state->line_contents.current_col(s); + *scalar = _scan_dquot_scalar(); + *quoted = true; + return true; + } + + if(has_all(RVAL)) + { + _c4dbgp("RSEQ|RVAL"); + if( ! _is_scalar_next__rseq_rval(s)) + return false; + _RYML_WITH_TAB_TOKENS(else if(s.begins_with("-\t")) + return false; + ) + _c4dbgp("RSEQ|RVAL|FLOW"); + s = s.left_of(s.first_of(",]")); + if(s.ends_with(':')) + { + --s.len; + } + else + { + auto first = s.first_of_any(": " _RYML_WITH_TAB_TOKENS( , ":\t"), " #"); + if(first) + s.len = first.pos; + } + s = s.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' ')); + } + + if(s.empty()) + return false; + + m_state->scalar_col = m_state->line_contents.current_col(s); + _RYML_CB_ASSERT(m_stack.m_callbacks, s.str >= m_state->line_contents.rem.str); + _line_progressed(static_cast(s.str - m_state->line_contents.rem.str) + s.len); + + if(_at_line_end() && s != '~') + { + _c4dbgpf("at line end. 
curr='{}'", s); + s = _extend_scanned_scalar(s); + } + + _c4dbgpf("scalar was '{}'", s); + + *scalar = s; + *quoted = false; + return true; +} + +bool Parser::_scan_scalar_map_flow(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted) +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RMAP)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(FLOW)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RKEY|RVAL)); + + csubstr s = m_state->line_contents.rem; + if(s.len == 0) + return false; + s = s.trim(" \t"); + if(s.len == 0) + return false; + + if(s.begins_with('\'')) + { + _c4dbgp("got a ': scanning single-quoted scalar"); + m_state->scalar_col = m_state->line_contents.current_col(s); + *scalar = _scan_squot_scalar(); + *quoted = true; + return true; + } + else if(s.begins_with('"')) + { + _c4dbgp("got a \": scanning double-quoted scalar"); + m_state->scalar_col = m_state->line_contents.current_col(s); + *scalar = _scan_dquot_scalar(); + *quoted = true; + return true; + } + + if( ! _is_scalar_next__rmap(s)) + return false; + + if(has_all(RKEY)) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, !s.begins_with(' ')); + size_t colon_token = s.find(": "); + if(colon_token == npos) + { + _RYML_WITH_OR_WITHOUT_TAB_TOKENS( + // with tab tokens + colon_token = s.find(":\t"); + if(colon_token == npos) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, s.len > 0); + colon_token = s.find(':'); + if(colon_token != s.len-1) + colon_token = npos; + } + , + // without tab tokens + colon_token = s.find(':'); + _RYML_CB_ASSERT(m_stack.m_callbacks, s.len > 0); + if(colon_token != s.len-1) + colon_token = npos; + ) + } + if(s.begins_with("? ") || s == '?') + return false; + if(has_any(QMRK)) + { + _c4dbgp("RMAP|RKEY|CPLX"); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RMAP)); + s = s.left_of(colon_token); + s = s.left_of(s.first_of("#")); + s = s.left_of(s.first_of(':')); + s = s.trimr(" \t"); + if(s.begins_with("---")) + return false; + else if(s.begins_with("...")) + return false; + } + else + { + _RYML_CB_CHECK(m_stack.m_callbacks, !s.begins_with('{')); + _c4dbgp("RMAP|RKEY"); + s = s.left_of(colon_token); + s = s.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' ')); + _c4dbgpf("RMAP|RKEY|FLOW: '{}'", s); + s = s.left_of(s.first_of(",}")); + if(s.ends_with(':')) + --s.len; + } + } + else if(has_all(RVAL)) + { + _c4dbgp("RMAP|RVAL"); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(QMRK)); + if( ! _is_scalar_next__rmap_val(s)) + return false; + _RYML_WITH_TAB_TOKENS(else if(s.begins_with("-\t")) + return false; + ) + _c4dbgp("RMAP|RVAL|FLOW"); + if(has_none(RSEQIMAP)) + s = s.left_of(s.first_of(",}")); + else + s = s.left_of(s.first_of(",]")); + s = s.left_of(s.find(" #")); // is there a comment? + s = s.left_of(s.find("\t#")); // is there a comment? + s = s.trim(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' ')); + } + + if(s.empty()) + return false; + + m_state->scalar_col = m_state->line_contents.current_col(s); + _RYML_CB_ASSERT(m_stack.m_callbacks, s.str >= m_state->line_contents.rem.str); + _line_progressed(static_cast(s.str - m_state->line_contents.rem.str) + s.len); + + if(_at_line_end() && s != '~') + { + _c4dbgpf("at line end. 
curr='{}'", s); + s = _extend_scanned_scalar(s); + } + + _c4dbgpf("scalar was '{}'", s); + + *scalar = s; + *quoted = false; + return true; +} + +bool Parser::_scan_scalar_unk(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted) +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RUNK)); + + csubstr s = m_state->line_contents.rem; + if(s.len == 0) + return false; + s = s.trim(" \t"); + if(s.len == 0) + return false; + + if(s.begins_with('\'')) + { + _c4dbgp("got a ': scanning single-quoted scalar"); + m_state->scalar_col = m_state->line_contents.current_col(s); + *scalar = _scan_squot_scalar(); + *quoted = true; + return true; + } + else if(s.begins_with('"')) + { + _c4dbgp("got a \": scanning double-quoted scalar"); + m_state->scalar_col = m_state->line_contents.current_col(s); + *scalar = _scan_dquot_scalar(); + *quoted = true; + return true; + } + else if(s.begins_with('|') || s.begins_with('>')) + { + *scalar = _scan_block(); + *quoted = true; + return true; + } + else if(has_any(RTOP) && _is_doc_sep(s)) + { + return false; + } + + _c4dbgpf("RUNK '[{}]~~~{}~~~", s.len, s); + if( ! _is_scalar_next__runk(s)) + { + _c4dbgp("RUNK: no scalar next"); + return false; + } + size_t pos = s.find(" #"); + if(pos != npos) + s = s.left_of(pos); + pos = s.find(": "); + if(pos != npos) + s = s.left_of(pos); + else if(s.ends_with(':')) + s = s.left_of(s.len-1); + _RYML_WITH_TAB_TOKENS( + else if((pos = s.find(":\t")) != npos) // TABS + s = s.left_of(pos); + ) + else + s = s.left_of(s.first_of(',')); + s = s.trim(" \t"); + _c4dbgpf("RUNK: scalar='{}'", s); + + if(s.empty()) + return false; + + m_state->scalar_col = m_state->line_contents.current_col(s); + _RYML_CB_ASSERT(m_stack.m_callbacks, s.str >= m_state->line_contents.rem.str); + _line_progressed(static_cast(s.str - m_state->line_contents.rem.str) + s.len); + + if(_at_line_end() && s != '~') + { + _c4dbgpf("at line end. curr='{}'", s); + s = _extend_scanned_scalar(s); + } + + _c4dbgpf("scalar was '{}'", s); + + *scalar = s; + *quoted = false; + return true; +} + + +//----------------------------------------------------------------------------- + +csubstr Parser::_extend_scanned_scalar(csubstr s) +{ + if(has_all(RMAP|RKEY|QMRK)) + { + size_t scalar_indentation = has_any(FLOW) ? 0 : m_state->scalar_col; + _c4dbgpf("extend_scalar: explicit key! 
indref={} scalar_indentation={} scalar_col={}", m_state->indref, scalar_indentation, m_state->scalar_col); + csubstr n = _scan_to_next_nonempty_line(scalar_indentation); + if(!n.empty()) + { + substr full = _scan_complex_key(s, n).trimr(" \t\r\n"); + if(full != s) + s = _filter_plain_scalar(full, scalar_indentation); + } + } + // deal with plain (unquoted) scalars that continue to the next line + else if(!s.begins_with_any("*")) // cannot be a plain scalar if it starts with * (that's an anchor reference) + { + _c4dbgpf("extend_scalar: line ended, scalar='{}'", s); + if(has_none(FLOW)) + { + size_t scalar_indentation = m_state->indref + 1; + if(has_all(RUNK) && scalar_indentation == 1) + scalar_indentation = 0; + csubstr n = _scan_to_next_nonempty_line(scalar_indentation); + if(!n.empty()) + { + _c4dbgpf("rscalar[IMPL]: state_indref={} state_indentation={} scalar_indentation={}", m_state->indref, m_state->line_contents.indentation, scalar_indentation); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.full.is_super(n)); + substr full = _scan_plain_scalar_blck(s, n, scalar_indentation); + if(full.len >= s.len) + s = _filter_plain_scalar(full, scalar_indentation); + } + } + else + { + _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(FLOW)); + csubstr n = _scan_to_next_nonempty_line(/*indentation*/0); + if(!n.empty()) + { + _c4dbgp("rscalar[FLOW]"); + substr full = _scan_plain_scalar_flow(s, n); + s = _filter_plain_scalar(full, /*indentation*/0); + } + } + } + + return s; +} + + +//----------------------------------------------------------------------------- + +substr Parser::_scan_plain_scalar_flow(csubstr currscalar, csubstr peeked_line) +{ + static constexpr const csubstr chars = "[]{}?#,"; + size_t pos = peeked_line.first_of(chars); + bool first = true; + while(pos != 0) + { + if(has_all(RMAP|RKEY) || has_any(RUNK)) + { + csubstr tpkl = peeked_line.triml(' ').trimr("\r\n"); + if(tpkl.begins_with(": ") || tpkl == ':') + { + _c4dbgpf("rscalar[FLOW]: map value starts on the peeked line: '{}'", peeked_line); + peeked_line = peeked_line.first(0); + break; + } + else + { + auto colon_pos = peeked_line.first_of_any(": ", ":"); + if(colon_pos && colon_pos.pos < pos) + { + peeked_line = peeked_line.first(colon_pos.pos); + _c4dbgpf("rscalar[FLOW]: found colon at {}. 
peeked='{}'", colon_pos.pos, peeked_line); + _RYML_CB_ASSERT(m_stack.m_callbacks, peeked_line.end() >= m_state->line_contents.rem.begin()); + _line_progressed(static_cast(peeked_line.end() - m_state->line_contents.rem.begin())); + break; + } + } + } + if(pos != npos) + { + _c4dbgpf("rscalar[FLOW]: found special character '{}' at {}, stopping: '{}'", peeked_line[pos], pos, peeked_line.left_of(pos).trimr("\r\n")); + peeked_line = peeked_line.left_of(pos); + _RYML_CB_ASSERT(m_stack.m_callbacks, peeked_line.end() >= m_state->line_contents.rem.begin()); + _line_progressed(static_cast(peeked_line.end() - m_state->line_contents.rem.begin())); + break; + } + _c4dbgpf("rscalar[FLOW]: append another line, full: '{}'", peeked_line.trimr("\r\n")); + if(!first) + { + RYML_CHECK(_advance_to_peeked()); + } + peeked_line = _scan_to_next_nonempty_line(/*indentation*/0); + if(peeked_line.empty()) + { + _c4err("expected token or continuation"); + } + pos = peeked_line.first_of(chars); + first = false; + } + substr full(m_buf.str + (currscalar.str - m_buf.str), m_buf.begin() + m_state->pos.offset); + full = full.trimr("\n\r "); + return full; +} + + +//----------------------------------------------------------------------------- + +substr Parser::_scan_plain_scalar_blck(csubstr currscalar, csubstr peeked_line, size_t indentation) +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(currscalar)); + // NOTE. there's a problem with _scan_to_next_nonempty_line(), as it counts newlines twice + // size_t offs = m_state->pos.offset; // so we workaround by directly counting from the end of the given scalar + _RYML_CB_ASSERT(m_stack.m_callbacks, currscalar.end() >= m_buf.begin()); + size_t offs = static_cast(currscalar.end() - m_buf.begin()); + _RYML_CB_ASSERT(m_stack.m_callbacks, peeked_line.begins_with(' ', indentation)); + while(true) + { + _c4dbgpf("rscalar[IMPL]: continuing... ref_indentation={}", indentation); + if(peeked_line.begins_with("...") || peeked_line.begins_with("---")) + { + _c4dbgpf("rscalar[IMPL]: document termination next -- bail now '{}'", peeked_line.trimr("\r\n")); + break; + } + else if(( ! peeked_line.begins_with(' ', indentation))) // is the line deindented? + { + if(!peeked_line.trim(" \r\n\t").empty()) // is the line not blank? + { + _c4dbgpf("rscalar[IMPL]: deindented line, not blank -- bail now '{}'", peeked_line.trimr("\r\n")); + break; + } + _c4dbgpf("rscalar[IMPL]: line is blank and has less indentation: ref={} line={}: '{}'", indentation, peeked_line.first_not_of(' ') == csubstr::npos ? 0 : peeked_line.first_not_of(' '), peeked_line.trimr("\r\n")); + _c4dbgpf("rscalar[IMPL]: ... searching for a line starting at indentation {}", indentation); + csubstr next_peeked = _scan_to_next_nonempty_line(indentation); + if(next_peeked.empty()) + { + _c4dbgp("rscalar[IMPL]: ... finished."); + break; + } + _c4dbgp("rscalar[IMPL]: ... 
continuing."); + peeked_line = next_peeked; + } + + _c4dbgpf("rscalar[IMPL]: line contents: '{}'", peeked_line.right_of(indentation, true).trimr("\r\n")); + size_t token_pos; + if(peeked_line.find(": ") != npos) + { + _line_progressed(peeked_line.find(": ")); + _c4err("': ' is not a valid token in plain flow (unquoted) scalars"); + } + else if(peeked_line.ends_with(':')) + { + _line_progressed(peeked_line.find(':')); + _c4err("lines cannot end with ':' in plain flow (unquoted) scalars"); + } + else if((token_pos = peeked_line.find(" #")) != npos) + { + _line_progressed(token_pos); + break; + //_c4err("' #' is not a valid token in plain flow (unquoted) scalars"); + } + + _c4dbgpf("rscalar[IMPL]: append another line: (len={})'{}'", peeked_line.len, peeked_line.trimr("\r\n")); + if(!_advance_to_peeked()) + { + _c4dbgp("rscalar[IMPL]: file finishes after the scalar"); + break; + } + peeked_line = m_state->line_contents.rem; + } + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= offs); + substr full(m_buf.str + (currscalar.str - m_buf.str), + currscalar.len + (m_state->pos.offset - offs)); + full = full.trimr("\r\n "); + return full; +} + +substr Parser::_scan_complex_key(csubstr currscalar, csubstr peeked_line) +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(currscalar)); + // NOTE. there's a problem with _scan_to_next_nonempty_line(), as it counts newlines twice + // size_t offs = m_state->pos.offset; // so we workaround by directly counting from the end of the given scalar + _RYML_CB_ASSERT(m_stack.m_callbacks, currscalar.end() >= m_buf.begin()); + size_t offs = static_cast(currscalar.end() - m_buf.begin()); + while(true) + { + _c4dbgp("rcplxkey: continuing..."); + if(peeked_line.begins_with("...") || peeked_line.begins_with("---")) + { + _c4dbgpf("rcplxkey: document termination next -- bail now '{}'", peeked_line.trimr("\r\n")); + break; + } + else + { + size_t pos = peeked_line.first_of("?:[]{}"); + if(pos == csubstr::npos) + { + pos = peeked_line.find("- "); + } + if(pos != csubstr::npos) + { + _c4dbgpf("rcplxkey: found special characters at pos={}: '{}'", pos, peeked_line.trimr("\r\n")); + _line_progressed(pos); + break; + } + } + + _c4dbgpf("rcplxkey: no special chars found '{}'", peeked_line.trimr("\r\n")); + csubstr next_peeked = _scan_to_next_nonempty_line(0); + if(next_peeked.empty()) + { + _c4dbgp("rcplxkey: empty ... finished."); + break; + } + _c4dbgp("rcplxkey: ... continuing."); + peeked_line = next_peeked; + + _c4dbgpf("rcplxkey: line contents: '{}'", peeked_line.trimr("\r\n")); + size_t colpos; + if((colpos = peeked_line.find(": ")) != npos) + { + _c4dbgp("rcplxkey: found ': ', stopping."); + _line_progressed(colpos); + break; + } + #ifdef RYML_NO_COVERAGE__TO_BE_DELETED + else if((colpos = peeked_line.ends_with(':'))) + { + _c4dbgp("rcplxkey: ends with ':', stopping."); + _line_progressed(colpos); + break; + } + #endif + _c4dbgpf("rcplxkey: append another line: (len={})'{}'", peeked_line.len, peeked_line.trimr("\r\n")); + if(!_advance_to_peeked()) + { + _c4dbgp("rcplxkey: file finishes after the scalar"); + break; + } + peeked_line = m_state->line_contents.rem; + } + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= offs); + substr full(m_buf.str + (currscalar.str - m_buf.str), + currscalar.len + (m_state->pos.offset - offs)); + return full; +} + +//! 
scans to the next non-blank line starting with the given indentation +csubstr Parser::_scan_to_next_nonempty_line(size_t indentation) +{ + csubstr next_peeked; + while(true) + { + _c4dbgpf("rscalar: ... curr offset: {} indentation={}", m_state->pos.offset, indentation); + next_peeked = _peek_next_line(m_state->pos.offset); + csubstr next_peeked_triml = next_peeked.triml(' '); + _c4dbgpf("rscalar: ... next peeked line='{}'", next_peeked.trimr("\r\n")); + if(next_peeked_triml.begins_with('#')) + { + _c4dbgp("rscalar: ... first non-space character is #"); + return {}; + } + else if(next_peeked.begins_with(' ', indentation)) + { + _c4dbgpf("rscalar: ... begins at same indentation {}, assuming continuation", indentation); + _advance_to_peeked(); + return next_peeked; + } + else // check for de-indentation + { + csubstr trimmed = next_peeked_triml.trimr("\t\r\n"); + _c4dbgpf("rscalar: ... deindented! trimmed='{}'", trimmed); + if(!trimmed.empty()) + { + _c4dbgp("rscalar: ... and not empty. bailing out."); + return {}; + } + } + if(!_advance_to_peeked()) + { + _c4dbgp("rscalar: file finished"); + return {}; + } + } + return {}; +} + +// returns false when the file finished +bool Parser::_advance_to_peeked() +{ + _line_progressed(m_state->line_contents.rem.len); + _line_ended(); // advances to the peeked-at line, consuming all remaining (probably newline) characters on the current line + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.first_of("\r\n") == csubstr::npos); + _c4dbgpf("advance to peeked: scan more... pos={} len={}", m_state->pos.offset, m_buf.len); + _scan_line(); // puts the peeked-at line in the buffer + if(_finished_file()) + { + _c4dbgp("rscalar: finished file!"); + return false; + } + return true; +} + +//----------------------------------------------------------------------------- + +C4_ALWAYS_INLINE size_t _extend_from_combined_newline(char nl, char following) +{ + return (nl == '\n' && following == '\r') || (nl == '\r' && following == '\n'); +} + +//! look for the next newline chars, and jump to the right of those +csubstr from_next_line(csubstr rem) +{ + size_t nlpos = rem.first_of("\r\n"); + if(nlpos == csubstr::npos) + return {}; + const char nl = rem[nlpos]; + rem = rem.right_of(nlpos); + if(rem.empty()) + return {}; + if(_extend_from_combined_newline(nl, rem.front())) + rem = rem.sub(1); + return rem; +} + +csubstr Parser::_peek_next_line(size_t pos) const +{ + csubstr rem{}; // declare here because of the goto + size_t nlpos{}; // declare here because of the goto + pos = pos == npos ? 
m_state->pos.offset : pos; + if(pos >= m_buf.len) + goto next_is_empty; + + // look for the next newline chars, and jump to the right of those + rem = from_next_line(m_buf.sub(pos)); + if(rem.empty()) + goto next_is_empty; + + // now get everything up to and including the following newline chars + nlpos = rem.first_of("\r\n"); + if((nlpos != csubstr::npos) && (nlpos + 1 < rem.len)) + nlpos += _extend_from_combined_newline(rem[nlpos], rem[nlpos+1]); + rem = rem.left_of(nlpos, /*include_pos*/true); + + _c4dbgpf("peek next line @ {}: (len={})'{}'", pos, rem.len, rem.trimr("\r\n")); + return rem; + +next_is_empty: + _c4dbgpf("peek next line @ {}: (len=0)''", pos); + return {}; +} + + +//----------------------------------------------------------------------------- +void Parser::LineContents::reset_with_next_line(csubstr buf, size_t offset) +{ + RYML_ASSERT(offset <= buf.len); + char const* C4_RESTRICT b = &buf[offset]; + char const* C4_RESTRICT e = b; + // get the current line stripped of newline chars + while(e < buf.end() && (*e != '\n' && *e != '\r')) + ++e; + RYML_ASSERT(e >= b); + const csubstr stripped_ = buf.sub(offset, static_cast(e - b)); + // advance pos to include the first line ending + if(e != buf.end() && *e == '\r') + ++e; + if(e != buf.end() && *e == '\n') + ++e; + RYML_ASSERT(e >= b); + const csubstr full_ = buf.sub(offset, static_cast(e - b)); + reset(full_, stripped_); +} + +void Parser::_scan_line() +{ + if(m_state->pos.offset >= m_buf.len) + { + m_state->line_contents.reset(m_buf.last(0), m_buf.last(0)); + return; + } + m_state->line_contents.reset_with_next_line(m_buf, m_state->pos.offset); +} + + +//----------------------------------------------------------------------------- +void Parser::_line_progressed(size_t ahead) +{ + _c4dbgpf("line[{}] ({} cols) progressed by {}: col {}-->{} offset {}-->{}", m_state->pos.line, m_state->line_contents.full.len, ahead, m_state->pos.col, m_state->pos.col+ahead, m_state->pos.offset, m_state->pos.offset+ahead); + m_state->pos.offset += ahead; + m_state->pos.col += ahead; + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.col <= m_state->line_contents.stripped.len+1); + m_state->line_contents.rem = m_state->line_contents.rem.sub(ahead); +} + +void Parser::_line_ended() +{ + _c4dbgpf("line[{}] ({} cols) ended! offset {}-->{}", m_state->pos.line, m_state->line_contents.full.len, m_state->pos.offset, m_state->pos.offset+m_state->line_contents.full.len - m_state->line_contents.stripped.len); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.col == m_state->line_contents.stripped.len+1); + m_state->pos.offset += m_state->line_contents.full.len - m_state->line_contents.stripped.len; + ++m_state->pos.line; + m_state->pos.col = 1; +} + +void Parser::_line_ended_undo() +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.col == 1u); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.line > 0u); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= m_state->line_contents.full.len - m_state->line_contents.stripped.len); + size_t delta = m_state->line_contents.full.len - m_state->line_contents.stripped.len; + _c4dbgpf("line[{}] undo ended! 
line {}-->{}, offset {}-->{}", m_state->pos.line, m_state->pos.line, m_state->pos.line - 1, m_state->pos.offset, m_state->pos.offset - delta); + m_state->pos.offset -= delta; + --m_state->pos.line; + m_state->pos.col = m_state->line_contents.stripped.len + 1u; + // don't forget to undo also the changes to the remainder of the line + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= m_buf.len || m_buf[m_state->pos.offset] == '\n' || m_buf[m_state->pos.offset] == '\r'); + m_state->line_contents.rem = m_buf.sub(m_state->pos.offset, 0); +} + + +//----------------------------------------------------------------------------- +void Parser::_set_indentation(size_t indentation) +{ + m_state->indref = indentation; + _c4dbgpf("state[{}]: saving indentation: {}", m_state-m_stack.begin(), m_state->indref); +} + +void Parser::_save_indentation(size_t behind) +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.begin() >= m_state->line_contents.full.begin()); + m_state->indref = static_cast(m_state->line_contents.rem.begin() - m_state->line_contents.full.begin()); + _RYML_CB_ASSERT(m_stack.m_callbacks, behind <= m_state->indref); + m_state->indref -= behind; + _c4dbgpf("state[{}]: saving indentation: {}", m_state-m_stack.begin(), m_state->indref); +} + +bool Parser::_maybe_set_indentation_from_anchor_or_tag() +{ + if(m_key_anchor.not_empty()) + { + _c4dbgpf("set indentation from key anchor: {}", m_key_anchor_indentation); + _set_indentation(m_key_anchor_indentation); // this is the column where the anchor starts + return true; + } + else if(m_key_tag.not_empty()) + { + _c4dbgpf("set indentation from key tag: {}", m_key_tag_indentation); + _set_indentation(m_key_tag_indentation); // this is the column where the tag starts + return true; + } + return false; +} + + +//----------------------------------------------------------------------------- +void Parser::_write_key_anchor(size_t node_id) +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->has_key(node_id)); + if( ! m_key_anchor.empty()) + { + _c4dbgpf("node={}: set key anchor to '{}'", node_id, m_key_anchor); + m_tree->set_key_anchor(node_id, m_key_anchor); + m_key_anchor.clear(); + m_key_anchor_was_before = false; + m_key_anchor_indentation = 0; + } + else if( ! m_tree->is_key_quoted(node_id)) + { + csubstr r = m_tree->key(node_id); + if(r.begins_with('*')) + { + _c4dbgpf("node={}: set key reference: '{}'", node_id, r); + m_tree->set_key_ref(node_id, r.sub(1)); + } + else if(r == "<<") + { + m_tree->set_key_ref(node_id, r); + _c4dbgpf("node={}: it's an inheriting reference", node_id); + if(m_tree->is_seq(node_id)) + { + _c4dbgpf("node={}: inheriting from seq of {}", node_id, m_tree->num_children(node_id)); + for(size_t i = m_tree->first_child(node_id); i != NONE; i = m_tree->next_sibling(i)) + { + if( ! (m_tree->val(i).begins_with('*'))) + _c4err("malformed reference: '{}'", m_tree->val(i)); + } + } + else if( ! m_tree->val(node_id).begins_with('*')) + { + _c4err("malformed reference: '{}'", m_tree->val(node_id)); + } + //m_tree->set_key_ref(node_id, r); + } + } +} + +//----------------------------------------------------------------------------- +void Parser::_write_val_anchor(size_t node_id) +{ + if( ! m_val_anchor.empty()) + { + _c4dbgpf("node={}: set val anchor to '{}'", node_id, m_val_anchor); + m_tree->set_val_anchor(node_id, m_val_anchor); + m_val_anchor.clear(); + } + csubstr r = m_tree->has_val(node_id) ? 
m_tree->val(node_id) : ""; + if(!m_tree->is_val_quoted(node_id) && r.begins_with('*')) + { + _c4dbgpf("node={}: set val reference: '{}'", node_id, r); + RYML_CHECK(!m_tree->has_val_anchor(node_id)); + m_tree->set_val_ref(node_id, r.sub(1)); + } +} + +//----------------------------------------------------------------------------- +void Parser::_push_level(bool explicit_flow_chars) +{ + _c4dbgpf("pushing level! currnode={} currlevel={} stacksize={} stackcap={}", m_state->node_id, m_state->level, m_stack.size(), m_stack.capacity()); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state == &m_stack.top()); + if(node(m_state) == nullptr) + { + _c4dbgp("pushing level! actually no, current node is null"); + //_RYML_CB_ASSERT(m_stack.m_callbacks, ! explicit_flow_chars); + return; + } + flag_t st = RUNK; + if(explicit_flow_chars || has_all(FLOW)) + { + st |= FLOW; + } + m_stack.push_top(); + m_state = &m_stack.top(); + set_flags(st); + m_state->node_id = (size_t)NONE; + m_state->indref = (size_t)NONE; + ++m_state->level; + _c4dbgpf("pushing level: now, currlevel={}", m_state->level); +} + +void Parser::_pop_level() +{ + _c4dbgpf("popping level! currnode={} currlevel={}", m_state->node_id, m_state->level); + if(has_any(RMAP) || m_tree->is_map(m_state->node_id)) + { + _stop_map(); + } + if(has_any(RSEQ) || m_tree->is_seq(m_state->node_id)) + { + _stop_seq(); + } + if(m_tree->is_doc(m_state->node_id)) + { + _stop_doc(); + } + _RYML_CB_ASSERT(m_stack.m_callbacks, m_stack.size() > 1); + _prepare_pop(); + m_stack.pop(); + m_state = &m_stack.top(); + /*if(has_any(RMAP)) + { + _toggle_key_val(); + }*/ + if(m_state->line_contents.indentation == 0) + { + //_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RTOP)); + add_flags(RTOP); + } + _c4dbgpf("popping level: now, currnode={} currlevel={}", m_state->node_id, m_state->level); +} + +//----------------------------------------------------------------------------- +void Parser::_start_unk(bool /*as_child*/) +{ + _c4dbgp("start_unk"); + _push_level(); + _move_scalar_from_top(); +} + +//----------------------------------------------------------------------------- +void Parser::_start_doc(bool as_child) +{ + _c4dbgpf("start_doc (as child={})", as_child); + _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_stack.bottom()) == node(m_root_id)); + size_t parent_id = m_stack.size() < 2 ? m_root_id : m_stack.top(1).node_id; + _RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_root(parent_id)); + _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) == nullptr || node(m_state) == node(m_root_id)); + if(as_child) + { + _c4dbgpf("start_doc: parent={}", parent_id); + if( ! m_tree->is_stream(parent_id)) + { + _c4dbgp("start_doc: rearranging with root as STREAM"); + m_tree->set_root_as_stream(); + } + m_state->node_id = m_tree->append_child(parent_id); + m_tree->to_doc(m_state->node_id); + } + #ifdef RYML_NO_COVERAGE__TO_BE_DELETED + else + { + _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_seq(parent_id) || m_tree->empty(parent_id)); + m_state->node_id = parent_id; + if( ! 
m_tree->is_doc(parent_id)) + { + m_tree->to_doc(parent_id, DOC); + } + } + #endif + _c4dbgpf("start_doc: id={}", m_state->node_id); + add_flags(RUNK|RTOP|NDOC); + _handle_types(); + rem_flags(NDOC); +} + +void Parser::_stop_doc() +{ + size_t doc_node = m_state->node_id; + _c4dbgpf("stop_doc[{}]", doc_node); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_doc(doc_node)); + if(!m_tree->is_seq(doc_node) && !m_tree->is_map(doc_node) && !m_tree->is_val(doc_node)) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(SSCL)); + _c4dbgpf("stop_doc[{}]: there was nothing; adding null val", doc_node); + m_tree->to_val(doc_node, {}, DOC); + } +} + +void Parser::_end_stream() +{ + _c4dbgpf("end_stream, level={} node_id={}", m_state->level, m_state->node_id); + _RYML_CB_ASSERT(m_stack.m_callbacks, ! m_stack.empty()); + NodeData *added = nullptr; + if(has_any(SSCL)) + { + if(m_tree->is_seq(m_state->node_id)) + { + _c4dbgp("append val..."); + added = _append_val(_consume_scalar()); + } + else if(m_tree->is_map(m_state->node_id)) + { + _c4dbgp("append null key val..."); + added = _append_key_val_null(m_state->line_contents.rem.str); + #ifdef RYML_NO_COVERAGE__TO_BE_DELETED + if(has_any(RSEQIMAP)) + { + _stop_seqimap(); + _pop_level(); + } + #endif + } + else if(m_tree->is_doc(m_state->node_id) || m_tree->type(m_state->node_id) == NOTYPE) + { + NodeType_e quoted = has_any(QSCL) ? VALQUO : NOTYPE; // do this before consuming the scalar + csubstr scalar = _consume_scalar(); + _c4dbgpf("node[{}]: to docval '{}'{}", m_state->node_id, scalar, quoted == VALQUO ? ", quoted" : ""); + m_tree->to_val(m_state->node_id, scalar, DOC|quoted); + added = m_tree->get(m_state->node_id); + } + else + { + _c4err("internal error"); + } + } + else if(has_all(RSEQ|RVAL) && has_none(FLOW)) + { + _c4dbgp("add last..."); + added = _append_val_null(m_state->line_contents.rem.str); + } + else if(!m_val_tag.empty() && (m_tree->is_doc(m_state->node_id) || m_tree->type(m_state->node_id) == NOTYPE)) + { + csubstr scalar = m_state->line_contents.rem.first(0); + _c4dbgpf("node[{}]: add null scalar as docval", m_state->node_id); + m_tree->to_val(m_state->node_id, scalar, DOC); + added = m_tree->get(m_state->node_id); + } + + if(added) + { + size_t added_id = m_tree->id(added); + if(m_tree->is_seq(m_state->node_id) || m_tree->is_doc(m_state->node_id)) + { + if(!m_key_anchor.empty()) + { + _c4dbgpf("node[{}]: move key to val anchor: '{}'", added_id, m_key_anchor); + m_val_anchor = m_key_anchor; + m_key_anchor = {}; + } + if(!m_key_tag.empty()) + { + _c4dbgpf("node[{}]: move key to val tag: '{}'", added_id, m_key_tag); + m_val_tag = m_key_tag; + m_key_tag = {}; + } + } + #ifdef RYML_NO_COVERAGE__TO_BE_DELETED + if(!m_key_anchor.empty()) + { + _c4dbgpf("node[{}]: set key anchor='{}'", added_id, m_key_anchor); + m_tree->set_key_anchor(added_id, m_key_anchor); + m_key_anchor = {}; + } + #endif + if(!m_val_anchor.empty()) + { + _c4dbgpf("node[{}]: set val anchor='{}'", added_id, m_val_anchor); + m_tree->set_val_anchor(added_id, m_val_anchor); + m_val_anchor = {}; + } + #ifdef RYML_NO_COVERAGE__TO_BE_DELETED + if(!m_key_tag.empty()) + { + _c4dbgpf("node[{}]: set key tag='{}' -> '{}'", added_id, m_key_tag, normalize_tag(m_key_tag)); + m_tree->set_key_tag(added_id, normalize_tag(m_key_tag)); + m_key_tag = {}; + } + #endif + if(!m_val_tag.empty()) + { + _c4dbgpf("node[{}]: set val tag='{}' -> '{}'", added_id, m_val_tag, normalize_tag(m_val_tag)); + m_tree->set_val_tag(added_id, normalize_tag(m_val_tag)); + m_val_tag = {}; + } + } + + 
while(m_stack.size() > 1) + { + _c4dbgpf("popping level: {} (stack sz={})", m_state->level, m_stack.size()); + _RYML_CB_ASSERT(m_stack.m_callbacks, ! has_any(SSCL, &m_stack.top())); + if(has_all(RSEQ|FLOW)) + _err("closing ] not found"); + _pop_level(); + } + add_flags(NDOC); +} + +void Parser::_start_new_doc(csubstr rem) +{ + _c4dbgp("_start_new_doc"); + _RYML_CB_ASSERT(m_stack.m_callbacks, rem.begins_with("---")); + C4_UNUSED(rem); + + _end_stream(); + + size_t indref = m_state->indref; + _c4dbgpf("start a document, indentation={}", indref); + _line_progressed(3); + _push_level(); + _start_doc(); + _set_indentation(indref); +} + + +//----------------------------------------------------------------------------- +void Parser::_start_map(bool as_child) +{ + _c4dbgpf("start_map (as child={})", as_child); + addrem_flags(RMAP|RVAL, RKEY|RUNK); + _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_stack.bottom()) == node(m_root_id)); + size_t parent_id = m_stack.size() < 2 ? m_root_id : m_stack.top(1).node_id; + _RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE); + _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) == nullptr || node(m_state) == node(m_root_id)); + if(as_child) + { + m_state->node_id = m_tree->append_child(parent_id); + if(has_all(SSCL)) + { + type_bits key_quoted = NOTYPE; + if(m_state->flags & QSCL) // before consuming the scalar + key_quoted |= KEYQUO; + csubstr key = _consume_scalar(); + m_tree->to_map(m_state->node_id, key, key_quoted); + _c4dbgpf("start_map: id={} key='{}'", m_state->node_id, m_tree->key(m_state->node_id)); + _write_key_anchor(m_state->node_id); + if( ! m_key_tag.empty()) + { + _c4dbgpf("node[{}]: set key tag='{}' -> '{}'", m_state->node_id, m_key_tag, normalize_tag(m_key_tag)); + m_tree->set_key_tag(m_state->node_id, normalize_tag(m_key_tag)); + m_key_tag.clear(); + } + } + else + { + m_tree->to_map(m_state->node_id); + _c4dbgpf("start_map: id={}", m_state->node_id); + } + m_tree->_p(m_state->node_id)->m_val.scalar.str = m_state->line_contents.rem.str; + _write_val_anchor(m_state->node_id); + } + else + { + _RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE); + m_state->node_id = parent_id; + _c4dbgpf("start_map: id={}", m_state->node_id); + type_bits as_doc = 0; + if(m_tree->is_doc(m_state->node_id)) + as_doc |= DOC; + if(!m_tree->is_map(parent_id)) + { + RYML_CHECK(!m_tree->has_children(parent_id)); + m_tree->to_map(parent_id, as_doc); + } + else + { + m_tree->_add_flags(parent_id, as_doc); + } + _move_scalar_from_top(); + if(m_key_anchor.not_empty()) + m_key_anchor_was_before = true; + _write_val_anchor(parent_id); + if(m_stack.size() >= 2) + { + State const& parent_state = m_stack.top(1); + if(parent_state.flags & RSET) + add_flags(RSET); + } + m_tree->_p(parent_id)->m_val.scalar.str = m_state->line_contents.rem.str; + } + if( ! m_val_tag.empty()) + { + _c4dbgpf("node[{}]: set val tag='{}' -> '{}'", m_state->node_id, m_val_tag, normalize_tag(m_val_tag)); + m_tree->set_val_tag(m_state->node_id, normalize_tag(m_val_tag)); + m_val_tag.clear(); + } +} + +void Parser::_start_map_unk(bool as_child) +{ + if(!m_key_anchor_was_before) + { + _c4dbgpf("stash key anchor before starting map... 
'{}'", m_key_anchor); + csubstr ka = m_key_anchor; + m_key_anchor = {}; + _start_map(as_child); + m_key_anchor = ka; + } + else + { + _start_map(as_child); + m_key_anchor_was_before = false; + } + if(m_key_tag2.not_empty()) + { + m_key_tag = m_key_tag2; + m_key_tag_indentation = m_key_tag2_indentation; + m_key_tag2.clear(); + m_key_tag2_indentation = 0; + } +} + +void Parser::_stop_map() +{ + _c4dbgpf("stop_map[{}]", m_state->node_id); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_map(m_state->node_id)); + if(has_all(QMRK|RKEY) && !has_all(SSCL)) + { + _c4dbgpf("stop_map[{}]: RKEY", m_state->node_id); + _store_scalar_null(m_state->line_contents.rem.str); + _append_key_val_null(m_state->line_contents.rem.str); + } +} + + +//----------------------------------------------------------------------------- +void Parser::_start_seq(bool as_child) +{ + _c4dbgpf("start_seq (as child={})", as_child); + if(has_all(RTOP|RUNK)) + { + _c4dbgpf("start_seq: moving key tag to val tag: '{}'", m_key_tag); + m_val_tag = m_key_tag; + m_key_tag.clear(); + } + addrem_flags(RSEQ|RVAL, RUNK); + _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_stack.bottom()) == node(m_root_id)); + size_t parent_id = m_stack.size() < 2 ? m_root_id : m_stack.top(1).node_id; + _RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE); + _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) == nullptr || node(m_state) == node(m_root_id)); + if(as_child) + { + m_state->node_id = m_tree->append_child(parent_id); + if(has_all(SSCL)) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_map(parent_id)); + type_bits key_quoted = 0; + if(m_state->flags & QSCL) // before consuming the scalar + key_quoted |= KEYQUO; + csubstr key = _consume_scalar(); + m_tree->to_seq(m_state->node_id, key, key_quoted); + _c4dbgpf("start_seq: id={} name='{}'", m_state->node_id, m_tree->key(m_state->node_id)); + _write_key_anchor(m_state->node_id); + if( ! m_key_tag.empty()) + { + _c4dbgpf("start_seq[{}]: set key tag='{}' -> '{}'", m_state->node_id, m_key_tag, normalize_tag(m_key_tag)); + m_tree->set_key_tag(m_state->node_id, normalize_tag(m_key_tag)); + m_key_tag.clear(); + } + } + else + { + type_bits as_doc = 0; + _RYML_CB_ASSERT(m_stack.m_callbacks, !m_tree->is_doc(m_state->node_id)); + m_tree->to_seq(m_state->node_id, as_doc); + _c4dbgpf("start_seq: id={}{}", m_state->node_id, as_doc ? " as doc" : ""); + } + _write_val_anchor(m_state->node_id); + m_tree->_p(m_state->node_id)->m_val.scalar.str = m_state->line_contents.rem.str; + } + else + { + m_state->node_id = parent_id; + type_bits as_doc = 0; + if(m_tree->is_doc(m_state->node_id)) + as_doc |= DOC; + if(!m_tree->is_seq(parent_id)) + { + RYML_CHECK(!m_tree->has_children(parent_id)); + m_tree->to_seq(parent_id, as_doc); + } + else + { + m_tree->_add_flags(parent_id, as_doc); + } + _move_scalar_from_top(); + _c4dbgpf("start_seq: id={}{}", m_state->node_id, as_doc ? " as_doc" : ""); + _write_val_anchor(parent_id); + m_tree->_p(parent_id)->m_val.scalar.str = m_state->line_contents.rem.str; + } + if( ! m_val_tag.empty()) + { + _c4dbgpf("start_seq[{}]: set val tag='{}' -> '{}'", m_state->node_id, m_val_tag, normalize_tag(m_val_tag)); + m_tree->set_val_tag(m_state->node_id, normalize_tag(m_val_tag)); + m_val_tag.clear(); + } +} + +void Parser::_stop_seq() +{ + _c4dbgp("stop_seq"); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_seq(m_state->node_id)); +} + + +//----------------------------------------------------------------------------- +void Parser::_start_seqimap() +{ + _c4dbgpf("start_seqimap at node={}. 
has_children={}", m_state->node_id, m_tree->has_children(m_state->node_id)); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ|FLOW)); + // create a map, and turn the last scalar of this sequence + // into the key of the map's first child. This scalar was + // understood to be a value in the sequence, but it is + // actually a key of a map, implicitly opened here. + // Eg [val, key: val] + // + // Yep, YAML is crazy. + if(m_tree->has_children(m_state->node_id) && m_tree->has_val(m_tree->last_child(m_state->node_id))) + { + size_t prev = m_tree->last_child(m_state->node_id); + NodeType ty = m_tree->_p(prev)->m_type; // don't use type() because it masks out the quotes + NodeScalar tmp = m_tree->valsc(prev); + _c4dbgpf("has children and last child={} has val. saving the scalars, val='{}' quoted={}", prev, tmp.scalar, ty.is_val_quoted()); + m_tree->remove(prev); + _push_level(); + _start_map(); + _store_scalar(tmp.scalar, ty.is_val_quoted()); + m_key_anchor = tmp.anchor; + m_key_tag = tmp.tag; + } + else + { + _c4dbgpf("node {} has no children yet, using empty key", m_state->node_id); + _push_level(); + _start_map(); + _store_scalar_null(m_state->line_contents.rem.str); + } + add_flags(RSEQIMAP|FLOW); +} + +void Parser::_stop_seqimap() +{ + _c4dbgp("stop_seqimap"); + _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQIMAP)); +} + + +//----------------------------------------------------------------------------- +NodeData* Parser::_append_val(csubstr val, flag_t quoted) +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, ! has_all(SSCL)); + _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) != nullptr); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_seq(m_state->node_id)); + type_bits additional_flags = quoted ? VALQUO : NOTYPE; + _c4dbgpf("append val: '{}' to parent id={} (level={}){}", val, m_state->node_id, m_state->level, quoted ? " VALQUO!" : ""); + size_t nid = m_tree->append_child(m_state->node_id); + m_tree->to_val(nid, val, additional_flags); + + _c4dbgpf("append val: id={} val='{}'", nid, m_tree->get(nid)->m_val.scalar); + if( ! m_val_tag.empty()) + { + _c4dbgpf("append val[{}]: set val tag='{}' -> '{}'", nid, m_val_tag, normalize_tag(m_val_tag)); + m_tree->set_val_tag(nid, normalize_tag(m_val_tag)); + m_val_tag.clear(); + } + _write_val_anchor(nid); + return m_tree->get(nid); +} + +NodeData* Parser::_append_key_val(csubstr val, flag_t val_quoted) +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_map(m_state->node_id)); + type_bits additional_flags = 0; + if(m_state->flags & QSCL) + additional_flags |= KEYQUO; + if(val_quoted) + additional_flags |= VALQUO; + + csubstr key = _consume_scalar(); + _c4dbgpf("append keyval: '{}' '{}' to parent id={} (level={}){}{}", key, val, m_state->node_id, m_state->level, (additional_flags & KEYQUO) ? " KEYQUO!" : "", (additional_flags & VALQUO) ? " VALQUO!" : ""); + size_t nid = m_tree->append_child(m_state->node_id); + m_tree->to_keyval(nid, key, val, additional_flags); + _c4dbgpf("append keyval: id={} key='{}' val='{}'", nid, m_tree->key(nid), m_tree->val(nid)); + if( ! m_key_tag.empty()) + { + _c4dbgpf("append keyval[{}]: set key tag='{}' -> '{}'", nid, m_key_tag, normalize_tag(m_key_tag)); + m_tree->set_key_tag(nid, normalize_tag(m_key_tag)); + m_key_tag.clear(); + } + if( ! 
m_val_tag.empty()) + { + _c4dbgpf("append keyval[{}]: set val tag='{}' -> '{}'", nid, m_val_tag, normalize_tag(m_val_tag)); + m_tree->set_val_tag(nid, normalize_tag(m_val_tag)); + m_val_tag.clear(); + } + _write_key_anchor(nid); + _write_val_anchor(nid); + rem_flags(QMRK); + return m_tree->get(nid); +} + + +//----------------------------------------------------------------------------- +void Parser::_store_scalar(csubstr s, flag_t is_quoted) +{ + _c4dbgpf("state[{}]: storing scalar '{}' (flag: {}) (old scalar='{}')", + m_state-m_stack.begin(), s, m_state->flags & SSCL, m_state->scalar); + RYML_CHECK(has_none(SSCL)); + add_flags(SSCL | (is_quoted * QSCL)); + m_state->scalar = s; +} + +csubstr Parser::_consume_scalar() +{ + _c4dbgpf("state[{}]: consuming scalar '{}' (flag: {}))", m_state-m_stack.begin(), m_state->scalar, m_state->flags & SSCL); + RYML_CHECK(m_state->flags & SSCL); + csubstr s = m_state->scalar; + rem_flags(SSCL | QSCL); + m_state->scalar.clear(); + return s; +} + +void Parser::_move_scalar_from_top() +{ + if(m_stack.size() < 2) return; + State &prev = m_stack.top(1); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state == &m_stack.top()); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state != &prev); + if(prev.flags & SSCL) + { + _c4dbgpf("moving scalar '{}' from state[{}] to state[{}] (overwriting '{}')", prev.scalar, &prev-m_stack.begin(), m_state-m_stack.begin(), m_state->scalar); + add_flags(prev.flags & (SSCL | QSCL)); + m_state->scalar = prev.scalar; + rem_flags(SSCL | QSCL, &prev); + prev.scalar.clear(); + } +} + +//----------------------------------------------------------------------------- +/** @todo this function is a monster and needs love. Likely, it needs + * to be split like _scan_scalar_*() */ +bool Parser::_handle_indentation() +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(FLOW)); + if( ! _at_line_begin()) + return false; + + size_t ind = m_state->line_contents.indentation; + csubstr rem = m_state->line_contents.rem; + /** @todo instead of trimming, we should use the indentation index from above */ + csubstr remt = rem.triml(' '); + + if(remt.empty() || remt.begins_with('#')) // this is a blank or comment line + { + _line_progressed(rem.size()); + return true; + } + + _c4dbgpf("indentation? 
ind={} indref={}", ind, m_state->indref); + if(ind == m_state->indref) + { + _c4dbgpf("same indentation: {}", ind); + if(!rem.sub(ind).begins_with('-')) + { + _c4dbgp("does not begin with -"); + if(has_any(RMAP)) + { + if(has_all(SSCL|RVAL)) + { + _c4dbgp("add with null val"); + _append_key_val_null(rem.str + ind - 1); + addrem_flags(RKEY, RVAL); + } + } + else if(has_any(RSEQ)) + { + if(m_stack.size() > 2) // do not pop to root level + { + if(has_any(RNXT)) + { + _c4dbgp("end the indentless seq"); + _pop_level(); + return true; + } + else if(has_any(RVAL)) + { + _c4dbgp("add with null val"); + _append_val_null(rem.str); + _c4dbgp("end the indentless seq"); + _pop_level(); + return true; + } + } + } + } + _line_progressed(ind); + return ind > 0; + } + else if(ind < m_state->indref) + { + _c4dbgpf("smaller indentation ({} < {})!!!", ind, m_state->indref); + if(has_all(RVAL)) + { + _c4dbgp("there was an empty val -- appending"); + if(has_all(RMAP)) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(SSCL)); + _append_key_val_null(rem.sub(ind).str - 1); + } + else if(has_all(RSEQ)) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(SSCL)); + _append_val_null(rem.sub(ind).str - 1); + } + } + // search the stack frame to jump to based on its indentation + State const* popto = nullptr; + _RYML_CB_ASSERT(m_stack.m_callbacks, m_stack.is_contiguous()); // this search relies on the stack being contiguous + for(State const* s = m_state-1; s >= m_stack.begin(); --s) + { + _c4dbgpf("searching for state with indentation {}. curr={} (level={},node={})", ind, s->indref, s->level, s->node_id); + if(s->indref == ind) + { + _c4dbgpf("gotit!!! level={} node={}", s->level, s->node_id); + popto = s; + // while it may be tempting to think we're done at this + // point, we must still determine whether we're jumping to a + // parent with the same indentation. Consider this case with + // an indentless sequence: + // + // product: + // - sku: BL394D + // quantity: 4 + // description: Basketball + // price: 450.00 + // - sku: BL4438H + // quantity: 1 + // description: Super Hoop + // price: 2392.00 # jumping one level here would be wrong. + // tax: 1234.5 # we must jump two levels + if(popto > m_stack.begin()) + { + auto parent = popto - 1; + if(parent->indref == popto->indref) + { + _c4dbgpf("the parent (level={},node={}) has the same indentation ({}). is this in an indentless sequence?", parent->level, parent->node_id, popto->indref); + _c4dbgpf("isseq(popto)={} ismap(parent)={}", m_tree->is_seq(popto->node_id), m_tree->is_map(parent->node_id)); + if(m_tree->is_seq(popto->node_id) && m_tree->is_map(parent->node_id)) + { + if( ! 
remt.begins_with('-')) + { + _c4dbgp("this is an indentless sequence"); + popto = parent; + } + else + { + _c4dbgp("not an indentless sequence"); + } + } + } + } + break; + } + } + if(!popto || popto >= m_state || popto->level >= m_state->level) + { + _c4err("parse error: incorrect indentation?"); + } + _c4dbgpf("popping {} levels: from level {} to level {}", m_state->level-popto->level, m_state->level, popto->level); + while(m_state != popto) + { + _c4dbgpf("popping level {} (indentation={})", m_state->level, m_state->indref); + _pop_level(); + } + _RYML_CB_ASSERT(m_stack.m_callbacks, ind == m_state->indref); + _line_progressed(ind); + return true; + } + else + { + _c4dbgpf("larger indentation ({} > {})!!!", ind, m_state->indref); + _RYML_CB_ASSERT(m_stack.m_callbacks, ind > m_state->indref); + if(has_all(RMAP|RVAL)) + { + if(_is_scalar_next__rmap_val(remt) && remt.first_of(":?") == npos) + { + _c4dbgpf("actually it seems a value: '{}'", remt); + } + else + { + addrem_flags(RKEY, RVAL); + _start_unk(); + //_move_scalar_from_top(); + _line_progressed(ind); + _save_indentation(); + return true; + } + } + else if(has_all(RSEQ|RVAL)) + { + // nothing to do here + } + else + { + _c4err("parse error - indentation should not increase at this point"); + } + } + + return false; +} + +//----------------------------------------------------------------------------- +csubstr Parser::_scan_comment() +{ + csubstr s = m_state->line_contents.rem; + _RYML_CB_ASSERT(m_stack.m_callbacks, s.begins_with('#')); + _line_progressed(s.len); + // skip the # character + s = s.sub(1); + // skip leading whitespace + s = s.right_of(s.first_not_of(' '), /*include_pos*/true); + _c4dbgpf("comment was '{}'", s); + return s; +} + +//----------------------------------------------------------------------------- +csubstr Parser::_scan_squot_scalar() +{ + // quoted scalars can spread over multiple lines! + // nice explanation here: http://yaml-multiline.info/ + + // a span to the end of the file + size_t b = m_state->pos.offset; + substr s = m_buf.sub(b); + if(s.begins_with(' ')) + { + s = s.triml(' '); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.sub(b).is_super(s)); + _RYML_CB_ASSERT(m_stack.m_callbacks, s.begin() >= m_buf.sub(b).begin()); + _line_progressed((size_t)(s.begin() - m_buf.sub(b).begin())); + } + b = m_state->pos.offset; // take this into account + _RYML_CB_ASSERT(m_stack.m_callbacks, s.begins_with('\'')); + + // skip the opening quote + _line_progressed(1); + s = s.sub(1); + + bool needs_filter = false; + + size_t numlines = 1; // we already have one line + size_t pos = npos; // find the pos of the matching quote + while( ! _finished_file()) + { + const csubstr line = m_state->line_contents.rem; + bool line_is_blank = true; + _c4dbgpf("scanning single quoted scalar @ line[{}]: ~~~{}~~~", m_state->pos.line, line); + for(size_t i = 0; i < line.len; ++i) + { + const char curr = line.str[i]; + if(curr == '\'') // single quotes are escaped with two single quotes + { + const char next = i+1 < line.len ? 
line.str[i+1] : '~'; + if(next != '\'') // so just look for the first quote + { // without another after it + pos = i; + break; + } + else + { + needs_filter = true; // needs filter to remove escaped quotes + ++i; // skip the escaped quote + } + } + else if(curr != ' ') + { + line_is_blank = false; + } + } + + // leading whitespace also needs filtering + needs_filter = needs_filter + || (numlines > 1) + || line_is_blank + || (_at_line_begin() && line.begins_with(' ')); + + if(pos == npos) + { + _line_progressed(line.len); + ++numlines; + } + else + { + _RYML_CB_ASSERT(m_stack.m_callbacks, pos >= 0 && pos < m_buf.len); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf[m_state->pos.offset + pos] == '\''); + _line_progressed(pos + 1); // progress beyond the quote + pos = m_state->pos.offset - b - 1; // but we stop before it + break; + } + + _line_ended(); + _scan_line(); + } + + if(pos == npos) + { + _c4err("reached end of file while looking for closing quote"); + } + else + { + _RYML_CB_ASSERT(m_stack.m_callbacks, pos > 0); + _RYML_CB_ASSERT(m_stack.m_callbacks, s.end() >= m_buf.begin() && s.end() <= m_buf.end()); + _RYML_CB_ASSERT(m_stack.m_callbacks, s.end() == m_buf.end() || *s.end() == '\''); + s = s.sub(0, pos-1); + } + + if(needs_filter) + { + csubstr ret = _filter_squot_scalar(s); + _RYML_CB_ASSERT(m_stack.m_callbacks, ret.len <= s.len || s.empty() || s.trim(' ').empty()); + _c4dbgpf("final scalar: \"{}\"", ret); + return ret; + } + + _c4dbgpf("final scalar: \"{}\"", s); + + return s; +} + +//----------------------------------------------------------------------------- +csubstr Parser::_scan_dquot_scalar() +{ + // quoted scalars can spread over multiple lines! + // nice explanation here: http://yaml-multiline.info/ + + // a span to the end of the file + size_t b = m_state->pos.offset; + substr s = m_buf.sub(b); + if(s.begins_with(' ')) + { + s = s.triml(' '); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.sub(b).is_super(s)); + _RYML_CB_ASSERT(m_stack.m_callbacks, s.begin() >= m_buf.sub(b).begin()); + _line_progressed((size_t)(s.begin() - m_buf.sub(b).begin())); + } + b = m_state->pos.offset; // take this into account + _RYML_CB_ASSERT(m_stack.m_callbacks, s.begins_with('"')); + + // skip the opening quote + _line_progressed(1); + s = s.sub(1); + + bool needs_filter = false; + + size_t numlines = 1; // we already have one line + size_t pos = npos; // find the pos of the matching quote + while( ! _finished_file()) + { + const csubstr line = m_state->line_contents.rem; + bool line_is_blank = true; + _c4dbgpf("scanning double quoted scalar @ line[{}]: line='{}'", m_state->pos.line, line); + for(size_t i = 0; i < line.len; ++i) + { + const char curr = line.str[i]; + if(curr != ' ') + line_is_blank = false; + // every \ is an escape + if(curr == '\\') + { + const char next = i+1 < line.len ? 
line.str[i+1] : '~'; + needs_filter = true; + if(next == '"' || next == '\\') + ++i; + } + else if(curr == '"') + { + pos = i; + break; + } + } + + // leading whitespace also needs filtering + needs_filter = needs_filter + || (numlines > 1) + || line_is_blank + || (_at_line_begin() && line.begins_with(' ')); + + if(pos == npos) + { + _line_progressed(line.len); + ++numlines; + } + else + { + _RYML_CB_ASSERT(m_stack.m_callbacks, pos >= 0 && pos < m_buf.len); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf[m_state->pos.offset + pos] == '"'); + _line_progressed(pos + 1); // progress beyond the quote + pos = m_state->pos.offset - b - 1; // but we stop before it + break; + } + + _line_ended(); + _scan_line(); + } + + if(pos == npos) + { + _c4err("reached end of file looking for closing quote"); + } + else + { + _RYML_CB_ASSERT(m_stack.m_callbacks, pos > 0); + _RYML_CB_ASSERT(m_stack.m_callbacks, s.end() == m_buf.end() || *s.end() == '"'); + _RYML_CB_ASSERT(m_stack.m_callbacks, s.end() >= m_buf.begin() && s.end() <= m_buf.end()); + s = s.sub(0, pos-1); + } + + if(needs_filter) + { + csubstr ret = _filter_dquot_scalar(s); + _c4dbgpf("final scalar: [{}]\"{}\"", ret.len, ret); + _RYML_CB_ASSERT(m_stack.m_callbacks, ret.len <= s.len || s.empty() || s.trim(' ').empty()); + return ret; + } + + _c4dbgpf("final scalar: \"{}\"", s); + + return s; +} + +//----------------------------------------------------------------------------- +csubstr Parser::_scan_block() +{ + // nice explanation here: http://yaml-multiline.info/ + csubstr s = m_state->line_contents.rem; + csubstr trimmed = s.triml(' '); + if(trimmed.str > s.str) + { + _c4dbgp("skipping whitespace"); + _RYML_CB_ASSERT(m_stack.m_callbacks, trimmed.str >= s.str); + _line_progressed(static_cast(trimmed.str - s.str)); + s = trimmed; + } + _RYML_CB_ASSERT(m_stack.m_callbacks, s.begins_with('|') || s.begins_with('>')); + + _c4dbgpf("scanning block: specs=\"{}\"", s); + + // parse the spec + BlockStyle_e newline = s.begins_with('>') ? BLOCK_FOLD : BLOCK_LITERAL; + BlockChomp_e chomp = CHOMP_CLIP; // default to clip unless + or - are used + size_t indentation = npos; // have to find out if no spec is given + csubstr digits; + if(s.len > 1) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, s.begins_with_any("|>")); + csubstr t = s.sub(1); + _c4dbgpf("scanning block: spec is multichar: '{}'", t); + _RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 1); + size_t pos = t.first_of("-+"); + _c4dbgpf("scanning block: spec chomp char at {}", pos); + if(pos != npos) + { + if(t[pos] == '-') + chomp = CHOMP_STRIP; + else if(t[pos] == '+') + chomp = CHOMP_KEEP; + if(pos == 0) + t = t.sub(1); + else + t = t.first(pos); + } + // from here to the end, only digits are considered + digits = t.left_of(t.first_not_of("0123456789")); + if( ! digits.empty()) + { + if( ! c4::atou(digits, &indentation)) + _c4err("parse error: could not read decimal"); + _c4dbgpf("scanning block: indentation specified: {}. add {} from curr state -> {}", indentation, m_state->indref, indentation+m_state->indref); + indentation += m_state->indref; + } + } + + // finish the current line + _line_progressed(s.len); + _line_ended(); + _scan_line(); + + _c4dbgpf("scanning block: style={} chomp={} indentation={}", newline==BLOCK_FOLD ? "fold" : "literal", chomp==CHOMP_CLIP ? "clip" : (chomp==CHOMP_STRIP ? 
"strip" : "keep"), indentation); + + // start with a zero-length block, already pointing at the right place + substr raw_block(m_buf.data() + m_state->pos.offset, size_t(0));// m_state->line_contents.full.sub(0, 0); + _RYML_CB_ASSERT(m_stack.m_callbacks, raw_block.begin() == m_state->line_contents.full.begin()); + + // read every full line into a raw block, + // from which newlines are to be stripped as needed. + // + // If no explicit indentation was given, pick it from the first + // non-empty line. See + // https://yaml.org/spec/1.2.2/#8111-block-indentation-indicator + size_t num_lines = 0, first = m_state->pos.line, provisional_indentation = npos; + LineContents lc; + while(( ! _finished_file())) + { + // peek next line, but do not advance immediately + lc.reset_with_next_line(m_buf, m_state->pos.offset); + _c4dbgpf("scanning block: peeking at '{}'", lc.stripped); + // evaluate termination conditions + if(indentation != npos) + { + // stop when the line is deindented and not empty + if(lc.indentation < indentation && ( ! lc.rem.trim(" \t\r\n").empty())) + { + _c4dbgpf("scanning block: indentation decreased ref={} thisline={}", indentation, lc.indentation); + break; + } + else if(indentation == 0) + { + if((lc.rem == "..." || lc.rem.begins_with("... ")) + || + (lc.rem == "---" || lc.rem.begins_with("--- "))) + { + _c4dbgp("scanning block: stop. indentation=0 and stream ended"); + break; + } + } + } + else + { + _c4dbgpf("scanning block: indentation ref not set. firstnonws={}", lc.stripped.first_not_of(' ')); + if(lc.stripped.first_not_of(' ') != npos) // non-empty line + { + _c4dbgpf("scanning block: line not empty. indref={} indprov={} indentation={}", m_state->indref, provisional_indentation, lc.indentation); + if(provisional_indentation == npos) + { + if(lc.indentation < m_state->indref) + { + _c4dbgpf("scanning block: block terminated indentation={} < indref={}", lc.indentation, m_state->indref); + if(raw_block.len == 0) + { + _c4dbgp("scanning block: was empty, undo next line"); + _line_ended_undo(); + } + break; + } + else if(lc.indentation == m_state->indref) + { + if(has_any(RSEQ|RMAP)) + { + _c4dbgpf("scanning block: block terminated. reading container and indentation={}==indref={}", lc.indentation, m_state->indref); + break; + } + } + _c4dbgpf("scanning block: set indentation ref from this line: ref={}", lc.indentation); + indentation = lc.indentation; + } + else + { + if(lc.indentation >= provisional_indentation) + { + _c4dbgpf("scanning block: set indentation ref from provisional indentation: provisional_ref={}, thisline={}", provisional_indentation, lc.indentation); + //indentation = provisional_indentation ? provisional_indentation : lc.indentation; + indentation = lc.indentation; + } + else + { + break; + //_c4err("parse error: first non-empty block line should have at least the original indentation"); + } + } + } + else // empty line + { + _c4dbgpf("scanning block: line empty or {} spaces. 
line_indentation={} prov_indentation={}", lc.stripped.len, lc.indentation, provisional_indentation); + if(provisional_indentation != npos) + { + if(lc.stripped.len >= provisional_indentation) + { + _c4dbgpf("scanning block: increase provisional_ref {} -> {}", provisional_indentation, lc.stripped.len); + provisional_indentation = lc.stripped.len; + } + #ifdef RYML_NO_COVERAGE__TO_BE_DELETED + else if(lc.indentation >= provisional_indentation && lc.indentation != npos) + { + _c4dbgpf("scanning block: increase provisional_ref {} -> {}", provisional_indentation, lc.indentation); + provisional_indentation = lc.indentation; + } + #endif + } + else + { + provisional_indentation = lc.indentation ? lc.indentation : has_any(RSEQ|RVAL); + _c4dbgpf("scanning block: initialize provisional_ref={}", provisional_indentation); + if(provisional_indentation == npos) + { + provisional_indentation = lc.stripped.len ? lc.stripped.len : has_any(RSEQ|RVAL); + _c4dbgpf("scanning block: initialize provisional_ref={}", provisional_indentation); + } + } + } + } + // advance now that we know the folded scalar continues + m_state->line_contents = lc; + _c4dbgpf("scanning block: append '{}'", m_state->line_contents.rem); + raw_block.len += m_state->line_contents.full.len; + _line_progressed(m_state->line_contents.rem.len); + _line_ended(); + ++num_lines; + } + _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.line == (first + num_lines) || (raw_block.len == 0)); + C4_UNUSED(num_lines); + C4_UNUSED(first); + + if(indentation == npos) + { + _c4dbgpf("scanning block: set indentation from provisional: {}", provisional_indentation); + indentation = provisional_indentation; + } + + if(num_lines) + _line_ended_undo(); + + _c4dbgpf("scanning block: raw=~~~{}~~~", raw_block); + + // ok! now we strip the newlines and spaces according to the specs + s = _filter_block_scalar(raw_block, newline, chomp, indentation); + + _c4dbgpf("scanning block: final=~~~{}~~~", s); + + return s; +} + + +//----------------------------------------------------------------------------- + +template +bool Parser::_filter_nl(substr r, size_t *C4_RESTRICT i, size_t *C4_RESTRICT pos, size_t indentation) +{ + // a debugging scaffold: + #if 0 + #define _c4dbgfnl(fmt, ...) _c4dbgpf("filter_nl[{}]: " fmt, *i, __VA_ARGS__) + #else + #define _c4dbgfnl(...) + #endif + + const char curr = r[*i]; + bool replaced = false; + + _RYML_CB_ASSERT(m_stack.m_callbacks, indentation != npos); + _RYML_CB_ASSERT(m_stack.m_callbacks, curr == '\n'); + + _c4dbgfnl("found newline. sofar=[{}]~~~{}~~~", *pos, m_filter_arena.first(*pos)); + size_t ii = *i; + size_t numnl_following = count_following_newlines(r, &ii, indentation); + if(numnl_following) + { + _c4dbgfnl("{} consecutive (empty) lines {} in the middle. totalws={}", 1+numnl_following, ii < r.len ? "in the middle" : "at the end", ii - *i); + for(size_t j = 0; j < numnl_following; ++j) + m_filter_arena.str[(*pos)++] = '\n'; + } + else + { + if(r.first_not_of(" \t", *i+1) != npos) + { + m_filter_arena.str[(*pos)++] = ' '; + _c4dbgfnl("single newline. convert to space. ii={}/{}. sofar=[{}]~~~{}~~~", ii, r.len, *pos, m_filter_arena.first(*pos)); + replaced = true; + } + else + { + if C4_IF_CONSTEXPR (keep_trailing_whitespace) + { + m_filter_arena.str[(*pos)++] = ' '; + _c4dbgfnl("single newline. convert to space. ii={}/{}. sofar=[{}]~~~{}~~~", ii, r.len, *pos, m_filter_arena.first(*pos)); + replaced = true; + } + else + { + _c4dbgfnl("last newline, everything else is whitespace. 
ii={}/{}", ii, r.len); + *i = r.len; + } + } + if C4_IF_CONSTEXPR (backslash_is_escape) + { + if(ii < r.len && r.str[ii] == '\\') + { + const char next = ii+1 < r.len ? r.str[ii+1] : '\0'; + if(next == ' ' || next == '\t') + { + _c4dbgfnl("extend skip to backslash{}", ""); + ++ii; + } + } + } + } + *i = ii - 1; // correct for the loop increment + + #undef _c4dbgfnl + + return replaced; +} + + +//----------------------------------------------------------------------------- + +template +void Parser::_filter_ws(substr r, size_t *C4_RESTRICT i, size_t *C4_RESTRICT pos) +{ + // a debugging scaffold: + #if 0 + #define _c4dbgfws(fmt, ...) _c4dbgpf("filt_nl[{}]: " fmt, *i, __VA_ARGS__) + #else + #define _c4dbgfws(...) + #endif + + const char curr = r[*i]; + _c4dbgfws("found whitespace '{}'", _c4prc(curr)); + _RYML_CB_ASSERT(m_stack.m_callbacks, curr == ' ' || curr == '\t'); + + size_t first = *i > 0 ? r.first_not_of(" \t", *i) : r.first_not_of(' ', *i); + if(first != npos) + { + if(r[first] == '\n' || r[first] == '\r') // skip trailing whitespace + { + _c4dbgfws("whitespace is trailing on line. firstnonws='{}'@{}", _c4prc(r[first]), first); + *i = first - 1; // correct for the loop increment + } + else // a legit whitespace + { + m_filter_arena.str[(*pos)++] = curr; + _c4dbgfws("legit whitespace. sofar=[{}]~~~{}~~~", *pos, m_filter_arena.first(*pos)); + } + } + else + { + _c4dbgfws("... everything else is trailing whitespace{}", ""); + if C4_IF_CONSTEXPR (keep_trailing_whitespace) + for(size_t j = *i; j < r.len; ++j) + m_filter_arena.str[(*pos)++] = r[j]; + *i = r.len; + } + + #undef _c4dbgfws +} + + +//----------------------------------------------------------------------------- +csubstr Parser::_filter_plain_scalar(substr s, size_t indentation) +{ + // a debugging scaffold: + #if 0 + #define _c4dbgfps(...) _c4dbgpf("filt_plain_scalar" __VA_ARGS__) + #else + #define _c4dbgfps(...) + #endif + + _c4dbgfps("before=~~~{}~~~", s); + + substr r = s.triml(" \t"); + _grow_filter_arena(r.len); + size_t pos = 0; // the filtered size + bool filtered_chars = false; + for(size_t i = 0; i < r.len; ++i) + { + const char curr = r.str[i]; + _c4dbgfps("[{}]: '{}'", i, _c4prc(curr)); + if(curr == ' ' || curr == '\t') + { + _filter_ws(r, &i, &pos); + } + else if(curr == '\n') + { + filtered_chars = _filter_nl(r, &i, &pos, indentation); + } + else if(curr == '\r') // skip \r --- https://stackoverflow.com/questions/1885900 + { + ; + } + else + { + m_filter_arena.str[pos++] = r[i]; + } + } + + _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len); + if(pos < r.len || filtered_chars) + { + r = _finish_filter_arena(r, pos); + } + + _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= r.len); + _c4dbgfps("#filteredchars={} after=~~~{}~~~", s.len - r.len, r); + + #undef _c4dbgfps + return r; +} + + +//----------------------------------------------------------------------------- +csubstr Parser::_filter_squot_scalar(substr s) +{ + // a debugging scaffold: + #if 0 + #define _c4dbgfsq(...) _c4dbgpf("filt_squo_scalar") + #else + #define _c4dbgfsq(...) 
+ #endif + + // from the YAML spec for double-quoted scalars: + // https://yaml.org/spec/1.2-old/spec.html#style/flow/single-quoted + + _c4dbgfsq(": before=~~~{}~~~", s); + + _grow_filter_arena(s.len); + substr r = s; + size_t pos = 0; // the filtered size + bool filtered_chars = false; + for(size_t i = 0; i < r.len; ++i) + { + const char curr = r[i]; + _c4dbgfsq("[{}]: '{}'", i, _c4prc(curr)); + if(curr == ' ' || curr == '\t') + { + _filter_ws(r, &i, &pos); + } + else if(curr == '\n') + { + filtered_chars = _filter_nl(r, &i, &pos, /*indentation*/0); + } + else if(curr == '\r') // skip \r --- https://stackoverflow.com/questions/1885900 + { + ; + } + else if(curr == '\'') + { + char next = i+1 < r.len ? r[i+1] : '\0'; + if(next == '\'') + { + _c4dbgfsq("[{}]: two consecutive quotes", i); + filtered_chars = true; + m_filter_arena.str[pos++] = '\''; + ++i; + } + } + else + { + m_filter_arena.str[pos++] = curr; + } + } + + _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len); + if(pos < r.len || filtered_chars) + { + r = _finish_filter_arena(r, pos); + } + + _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= r.len); + _c4dbgpf(": #filteredchars={} after=~~~{}~~~", s.len - r.len, r); + + #undef _c4dbgfsq + return r; +} + + +//----------------------------------------------------------------------------- +csubstr Parser::_filter_dquot_scalar(substr s) +{ + // a debugging scaffold: + #if 0 + #define _c4dbgfdq(...) _c4dbgpf("filt_dquo_scalar" __VA_ARGS__) + #else + #define _c4dbgfdq(...) + #endif + + _c4dbgfdq(": before=~~~{}~~~", s); + + // from the YAML spec for double-quoted scalars: + // https://yaml.org/spec/1.2-old/spec.html#style/flow/double-quoted + // + // All leading and trailing white space characters are excluded + // from the content. Each continuation line must therefore contain + // at least one non-space character. Empty lines, if any, are + // consumed as part of the line folding. + + _grow_filter_arena(s.len + 2u * s.count('\\')); + substr r = s; + size_t pos = 0; // the filtered size + bool filtered_chars = false; + for(size_t i = 0; i < r.len; ++i) + { + const char curr = r[i]; + _c4dbgfdq("[{}]: '{}'", i, _c4prc(curr)); + if(curr == ' ' || curr == '\t') + { + _filter_ws(r, &i, &pos); + } + else if(curr == '\n') + { + filtered_chars = _filter_nl(r, &i, &pos, /*indentation*/0); + } + else if(curr == '\r') // skip \r --- https://stackoverflow.com/questions/1885900 + { + ; + } + else if(curr == '\\') + { + char next = i+1 < r.len ? 
r[i+1] : '\0'; + _c4dbgfdq("[{}]: backslash, next='{}'", i, _c4prc(next)); + filtered_chars = true; + if(next == '\r') + { + if(i+2 < r.len && r[i+2] == '\n') + { + ++i; // newline escaped with \ -- skip both (add only one as i is loop-incremented) + next = '\n'; + _c4dbgfdq("[{}]: was \\r\\n, now next='\\n'", i); + } + } + // remember the loop will also increment i + if(next == '\n') + { + size_t ii = i + 2; + for( ; ii < r.len; ++ii) + { + if(r.str[ii] == ' ' || r.str[ii] == '\t') // skip leading whitespace + ; + else + break; + } + i += ii - i - 1; + } + else if(next == '"' || next == '/' || next == ' ' || next == '\t') // escapes for json compatibility + { + m_filter_arena.str[pos++] = next; + ++i; + } + else if(next == '\r') + { + //++i; + } + else if(next == 'n') + { + m_filter_arena.str[pos++] = '\n'; + ++i; + } + else if(next == 'r') + { + m_filter_arena.str[pos++] = '\r'; + ++i; // skip + } + else if(next == 't') + { + m_filter_arena.str[pos++] = '\t'; + ++i; + } + else if(next == '\\') + { + m_filter_arena.str[pos++] = '\\'; + ++i; + } + else if(next == 'x') // UTF8 + { + if(i + 1u + 2u >= r.len) + _c4err("\\x requires 2 hex digits"); + uint8_t byteval = {}; + if(!read_hex(r.sub(i + 2u, 2u), &byteval)) + _c4err("failed to read \\x codepoint"); + m_filter_arena.str[pos++] = *(char*)&byteval; + i += 1u + 2u; + } + else if(next == 'u') // UTF16 + { + if(i + 1u + 4u >= r.len) + _c4err("\\u requires 4 hex digits"); + char readbuf[8]; + csubstr codepoint = r.sub(i + 2u, 4u); + uint32_t codepoint_val = {}; + if(!read_hex(codepoint, &codepoint_val)) + _c4err("failed to parse \\u codepoint"); + size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val); + C4_ASSERT(numbytes <= 4); + memcpy(m_filter_arena.str + pos, readbuf, numbytes); + pos += numbytes; + i += 1u + 4u; + } + else if(next == 'U') // UTF32 + { + if(i + 1u + 8u >= r.len) + _c4err("\\U requires 8 hex digits"); + char readbuf[8]; + csubstr codepoint = r.sub(i + 2u, 8u); + uint32_t codepoint_val = {}; + if(!read_hex(codepoint, &codepoint_val)) + _c4err("failed to parse \\U codepoint"); + size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val); + C4_ASSERT(numbytes <= 4); + memcpy(m_filter_arena.str + pos, readbuf, numbytes); + pos += numbytes; + i += 1u + 8u; + } + // https://yaml.org/spec/1.2.2/#rule-c-ns-esc-char + else if(next == '0') + { + m_filter_arena.str[pos++] = '\0'; + ++i; + } + else if(next == 'b') // backspace + { + m_filter_arena.str[pos++] = '\b'; + ++i; + } + else if(next == 'f') // form feed + { + m_filter_arena.str[pos++] = '\f'; + ++i; + } + else if(next == 'a') // bell character + { + m_filter_arena.str[pos++] = '\a'; + ++i; + } + else if(next == 'v') // vertical tab + { + m_filter_arena.str[pos++] = '\v'; + ++i; + } + else if(next == 'e') // escape character + { + m_filter_arena.str[pos++] = '\x1b'; + ++i; + } + else if(next == '_') // unicode non breaking space \u00a0 + { + // https://www.compart.com/en/unicode/U+00a0 + m_filter_arena.str[pos++] = _RYML_CHCONST(-0x3e, 0xc2); + m_filter_arena.str[pos++] = _RYML_CHCONST(-0x60, 0xa0); + ++i; + } + else if(next == 'N') // unicode next line \u0085 + { + // https://www.compart.com/en/unicode/U+0085 + m_filter_arena.str[pos++] = _RYML_CHCONST(-0x3e, 0xc2); + m_filter_arena.str[pos++] = _RYML_CHCONST(-0x7b, 0x85); + ++i; + } + else if(next == 'L') // unicode line separator \u2028 + { + // https://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=1024&names=-&utf8=0x&unicodeinhtml=hex + 
m_filter_arena.str[pos++] = _RYML_CHCONST(-0x1e, 0xe2); + m_filter_arena.str[pos++] = _RYML_CHCONST(-0x80, 0x80); + m_filter_arena.str[pos++] = _RYML_CHCONST(-0x58, 0xa8); + ++i; + } + else if(next == 'P') // unicode paragraph separator \u2029 + { + // https://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=1024&names=-&utf8=0x&unicodeinhtml=hex + m_filter_arena.str[pos++] = _RYML_CHCONST(-0x1e, 0xe2); + m_filter_arena.str[pos++] = _RYML_CHCONST(-0x80, 0x80); + m_filter_arena.str[pos++] = _RYML_CHCONST(-0x57, 0xa9); + ++i; + } + _c4dbgfdq("[{}]: backslash...sofar=[{}]~~~{}~~~", i, pos, m_filter_arena.first(pos)); + } + else + { + m_filter_arena.str[pos++] = curr; + } + } + + _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len); + if(pos < r.len || filtered_chars) + { + r = _finish_filter_arena(r, pos); + } + + _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= r.len); + _c4dbgpf(": #filteredchars={} after=~~~{}~~~", s.len - r.len, r); + + #undef _c4dbgfdq + + return r; +} + + +//----------------------------------------------------------------------------- +bool Parser::_apply_chomp(substr buf, size_t *C4_RESTRICT pos, BlockChomp_e chomp) +{ + substr trimmed = buf.first(*pos).trimr('\n'); + bool added_newline = false; + switch(chomp) + { + case CHOMP_KEEP: + if(trimmed.len == *pos) + { + _c4dbgpf("chomp=KEEP: add missing newline @{}", *pos); + //m_filter_arena.str[(*pos)++] = '\n'; + added_newline = true; + } + break; + case CHOMP_CLIP: + if(trimmed.len == *pos) + { + _c4dbgpf("chomp=CLIP: add missing newline @{}", *pos); + m_filter_arena.str[(*pos)++] = '\n'; + added_newline = true; + } + else + { + _c4dbgpf("chomp=CLIP: include single trailing newline @{}", trimmed.len+1); + *pos = trimmed.len + 1; + } + break; + case CHOMP_STRIP: + _c4dbgpf("chomp=STRIP: strip {}-{}-{} newlines", *pos, trimmed.len, *pos-trimmed.len); + *pos = trimmed.len; + break; + default: + _c4err("unknown chomp style"); + } + return added_newline; +} + + +//----------------------------------------------------------------------------- +csubstr Parser::_filter_block_scalar(substr s, BlockStyle_e style, BlockChomp_e chomp, size_t indentation) +{ + // a debugging scaffold: + #if 0 + #define _c4dbgfbl(fmt, ...) _c4dbgpf("filt_block" fmt, __VA_ARGS__) + #else + #define _c4dbgfbl(...) + #endif + + _c4dbgfbl(": indentation={} before=[{}]~~~{}~~~", indentation, s.len, s); + + if(chomp != CHOMP_KEEP && s.trim(" \n\r").len == 0u) + { + _c4dbgp("filt_block: empty scalar"); + return s.first(0); + } + + substr r = s; + + switch(style) + { + case BLOCK_LITERAL: + { + _c4dbgp("filt_block: style=literal"); + // trim leading whitespace up to indentation + { + size_t numws = r.first_not_of(' '); + if(numws != npos) + { + if(numws > indentation) + r = r.sub(indentation); + else + r = r.sub(numws); + _c4dbgfbl(": after triml=[{}]~~~{}~~~", r.len, r); + } + else + { + if(chomp != CHOMP_KEEP || r.len == 0) + { + _c4dbgfbl(": all spaces {}, return empty", r.len); + return r.first(0); + } + else + { + r[0] = '\n'; + return r.first(1); + } + } + } + _grow_filter_arena(s.len + 2u); // use s.len! 
because we may need to add a newline at the end, so the leading indentation will allow space for that newline + size_t pos = 0; // the filtered size + for(size_t i = 0; i < r.len; ++i) + { + const char curr = r.str[i]; + _c4dbgfbl("[{}]='{}' pos={}", i, _c4prc(curr), pos); + if(curr == '\r') + continue; + m_filter_arena.str[pos++] = curr; + if(curr == '\n') + { + _c4dbgfbl("[{}]: found newline", i); + // skip indentation on the next line + csubstr rem = r.sub(i+1); + size_t first = rem.first_not_of(' '); + if(first != npos) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, first < rem.len); + _RYML_CB_ASSERT(m_stack.m_callbacks, i+1+first < r.len); + _c4dbgfbl("[{}]: {} spaces follow before next nonws character @ [{}]='{}'", i, first, i+1+first, rem.str[first]); + if(first < indentation) + { + _c4dbgfbl("[{}]: skip {}<{} spaces from indentation", i, first, indentation); + i += first; + } + else + { + _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation); + i += indentation; + } + } + else + { + _RYML_CB_ASSERT(m_stack.m_callbacks, i+1 <= r.len); + first = rem.len; + _c4dbgfbl("[{}]: {} spaces to the end", i, first); + if(first) + { + if(first < indentation) + { + _c4dbgfbl("[{}]: skip everything", i); + --pos; + break; + } + else + { + _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation); + i += indentation; + } + } + else if(i+1 == r.len) + { + if(chomp == CHOMP_STRIP) + --pos; + break; + } + } + } + } + _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= pos); + _c4dbgfbl(": #filteredchars={} after=~~~{}~~~", s.len - r.len, r); + bool changed = _apply_chomp(m_filter_arena, &pos, chomp); + _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len); + _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= s.len); + if(pos < r.len || changed) + { + r = _finish_filter_arena(s, pos); // write into s + } + break; + } + case BLOCK_FOLD: + { + _c4dbgp("filt_block: style=fold"); + _grow_filter_arena(r.len + 2); + size_t pos = 0; // the filtered size + bool filtered_chars = false; + bool started = false; + bool is_indented = false; + size_t i = r.first_not_of(' '); + _c4dbgfbl(": first non space at {}", i); + if(i > indentation) + { + is_indented = true; + i = indentation; + } + _c4dbgfbl(": start folding at {}, is_indented={}", i, (int)is_indented); + auto on_change_indentation = [&](size_t numnl_following, size_t last_newl, size_t first_non_whitespace){ + _c4dbgfbl("[{}]: add 1+{} newlines", i, numnl_following); + for(size_t j = 0; j < 1 + numnl_following; ++j) + m_filter_arena.str[pos++] = '\n'; + for(i = last_newl + 1 + indentation; i < first_non_whitespace; ++i) + { + if(r.str[i] == '\r') + continue; + _c4dbgfbl("[{}]: add '{}'", i, _c4prc(r.str[i])); + m_filter_arena.str[pos++] = r.str[i]; + } + --i; + }; + for( ; i < r.len; ++i) + { + const char curr = r.str[i]; + _c4dbgfbl("[{}]='{}'", i, _c4prc(curr)); + if(curr == '\n') + { + filtered_chars = true; + // skip indentation on the next line, and advance over the next non-indented blank lines as well + size_t first_non_whitespace; + size_t numnl_following = (size_t)-1; + while(r[i] == '\n') + { + ++numnl_following; + csubstr rem = r.sub(i+1); + size_t first = rem.first_not_of(' '); + _c4dbgfbl("[{}]: found newline. 
first={} rem.len={}", i, first, rem.len); + if(first != npos) + { + first_non_whitespace = first + i+1; + while(first_non_whitespace < r.len && r[first_non_whitespace] == '\r') + ++first_non_whitespace; + _RYML_CB_ASSERT(m_stack.m_callbacks, first < rem.len); + _RYML_CB_ASSERT(m_stack.m_callbacks, i+1+first < r.len); + _c4dbgfbl("[{}]: {} spaces follow before next nonws character @ [{}]='{}'", i, first, i+1+first, _c4prc(rem.str[first])); + if(first < indentation) + { + _c4dbgfbl("[{}]: skip {}<{} spaces from indentation", i, first, indentation); + i += first; + } + else + { + _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation); + i += indentation; + if(first > indentation) + { + _c4dbgfbl("[{}]: {} further indented than {}, stop newlining", i, first, indentation); + goto finished_counting_newlines; + } + } + // prepare the next while loop iteration + // by setting i at the next newline after + // an empty line + if(r[first_non_whitespace] == '\n') + i = first_non_whitespace; + else + goto finished_counting_newlines; + } + else + { + _RYML_CB_ASSERT(m_stack.m_callbacks, i+1 <= r.len); + first = rem.len; + first_non_whitespace = first + i+1; + if(first) + { + _c4dbgfbl("[{}]: {} spaces to the end", i, first); + if(first < indentation) + { + _c4dbgfbl("[{}]: skip everything", i); + i += first; + } + else + { + _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation); + i += indentation; + if(first > indentation) + { + _c4dbgfbl("[{}]: {} spaces missing. not done yet", i, indentation - first); + goto finished_counting_newlines; + } + } + } + else // if(i+1 == r.len) + { + _c4dbgfbl("[{}]: it's the final newline", i); + _RYML_CB_ASSERT(m_stack.m_callbacks, i+1 == r.len); + _RYML_CB_ASSERT(m_stack.m_callbacks, rem.len == 0); + } + goto end_of_scalar; + } + } + end_of_scalar: + // Write all the trailing newlines. Since we're + // at the end no folding is needed, so write every + // newline (add 1). + _c4dbgfbl("[{}]: add {} trailing newlines", i, 1+numnl_following); + for(size_t j = 0; j < 1 + numnl_following; ++j) + m_filter_arena.str[pos++] = '\n'; + break; + finished_counting_newlines: + _c4dbgfbl("[{}]: #newlines={} firstnonws={}", i, numnl_following, first_non_whitespace); + while(first_non_whitespace < r.len && r[first_non_whitespace] == '\t') + ++first_non_whitespace; + _c4dbgfbl("[{}]: #newlines={} firstnonws={}", i, numnl_following, first_non_whitespace); + _RYML_CB_ASSERT(m_stack.m_callbacks, first_non_whitespace <= r.len); + size_t last_newl = r.last_of('\n', first_non_whitespace); + size_t this_indentation = first_non_whitespace - last_newl - 1; + _c4dbgfbl("[{}]: #newlines={} firstnonws={} lastnewl={} this_indentation={} vs indentation={}", i, numnl_following, first_non_whitespace, last_newl, this_indentation, indentation); + _RYML_CB_ASSERT(m_stack.m_callbacks, first_non_whitespace >= last_newl + 1); + _RYML_CB_ASSERT(m_stack.m_callbacks, this_indentation >= indentation); + if(!started) + { + _c4dbgfbl("[{}]: #newlines={}. 
write all leading newlines", i, numnl_following); + for(size_t j = 0; j < 1 + numnl_following; ++j) + m_filter_arena.str[pos++] = '\n'; + if(this_indentation > indentation) + { + is_indented = true; + _c4dbgfbl("[{}]: advance ->{}", i, last_newl + indentation); + i = last_newl + indentation; + } + else + { + i = first_non_whitespace - 1; + _c4dbgfbl("[{}]: advance ->{}", i, first_non_whitespace); + } + } + else if(this_indentation == indentation) + { + _c4dbgfbl("[{}]: same indentation", i); + if(!is_indented) + { + if(numnl_following == 0) + { + _c4dbgfbl("[{}]: fold!", i); + m_filter_arena.str[pos++] = ' '; + } + else + { + _c4dbgfbl("[{}]: add {} newlines", i, 1 + numnl_following); + for(size_t j = 0; j < numnl_following; ++j) + m_filter_arena.str[pos++] = '\n'; + } + i = first_non_whitespace - 1; + _c4dbgfbl("[{}]: advance {}->{}", i, i, first_non_whitespace); + } + else + { + _c4dbgfbl("[{}]: back to ref indentation", i); + is_indented = false; + on_change_indentation(numnl_following, last_newl, first_non_whitespace); + _c4dbgfbl("[{}]: advance {}->{}", i, i, first_non_whitespace); + } + } + else + { + _c4dbgfbl("[{}]: increased indentation.", i); + is_indented = true; + _RYML_CB_ASSERT(m_stack.m_callbacks, this_indentation > indentation); + on_change_indentation(numnl_following, last_newl, first_non_whitespace); + _c4dbgfbl("[{}]: advance {}->{}", i, i, first_non_whitespace); + } + } + else if(curr != '\r') + { + if(curr != '\t') + started = true; + m_filter_arena.str[pos++] = curr; + } + } + _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len); + _c4dbgfbl(": #filteredchars={} after=[{}]~~~{}~~~", (int)s.len - (int)pos, pos, m_filter_arena.first(pos)); + bool changed = _apply_chomp(m_filter_arena, &pos, chomp); + if(pos < r.len || filtered_chars || changed) + { + r = _finish_filter_arena(s, pos); // write into s + } + } + break; + default: + _c4err("unknown block style"); + } + + _c4dbgfbl(": final=[{}]~~~{}~~~", r.len, r); + + #undef _c4dbgfbl + + return r; +} + +//----------------------------------------------------------------------------- +size_t Parser::_count_nlines(csubstr src) +{ + return 1 + src.count('\n'); +} + +//----------------------------------------------------------------------------- +void Parser::_handle_directive(csubstr directive_) +{ + csubstr directive = directive_; + if(directive.begins_with("%TAG")) + { + TagDirective td; + _c4dbgpf("%TAG directive: {}", directive_); + directive = directive.sub(4); + if(!directive.begins_with(' ')) + _c4err("malformed tag directive: {}", directive_); + directive = directive.triml(' '); + size_t pos = directive.find(' '); + if(pos == npos) + _c4err("malformed tag directive: {}", directive_); + td.handle = directive.first(pos); + directive = directive.sub(td.handle.len).triml(' '); + pos = directive.find(' '); + if(pos != npos) + directive = directive.first(pos); + td.prefix = directive; + td.next_node_id = m_tree->size(); + if(m_tree->size() > 0) + { + size_t prev = m_tree->size() - 1; + if(m_tree->is_root(prev) && m_tree->type(prev) != NOTYPE && !m_tree->is_stream(prev)) + ++td.next_node_id; + } + _c4dbgpf("%TAG: handle={} prefix={} next_node={}", td.handle, td.prefix, td.next_node_id); + m_tree->add_tag_directive(td); + } + else if(directive.begins_with("%YAML")) + { + _c4dbgpf("%YAML directive! 
ignoring...: {}", directive); + } +} + +//----------------------------------------------------------------------------- +void Parser::set_flags(flag_t f, State * s) +{ +#ifdef RYML_DBG + char buf1_[64], buf2_[64]; + csubstr buf1 = _prfl(buf1_, f); + csubstr buf2 = _prfl(buf2_, s->flags); + _c4dbgpf("state[{}]: setting flags to {}: before={}", s-m_stack.begin(), buf1, buf2); +#endif + s->flags = f; +} + +void Parser::add_flags(flag_t on, State * s) +{ +#ifdef RYML_DBG + char buf1_[64], buf2_[64], buf3_[64]; + csubstr buf1 = _prfl(buf1_, on); + csubstr buf2 = _prfl(buf2_, s->flags); + csubstr buf3 = _prfl(buf3_, s->flags|on); + _c4dbgpf("state[{}]: adding flags {}: before={} after={}", s-m_stack.begin(), buf1, buf2, buf3); +#endif + s->flags |= on; +} + +void Parser::addrem_flags(flag_t on, flag_t off, State * s) +{ +#ifdef RYML_DBG + char buf1_[64], buf2_[64], buf3_[64], buf4_[64]; + csubstr buf1 = _prfl(buf1_, on); + csubstr buf2 = _prfl(buf2_, off); + csubstr buf3 = _prfl(buf3_, s->flags); + csubstr buf4 = _prfl(buf4_, ((s->flags|on)&(~off))); + _c4dbgpf("state[{}]: adding flags {} / removing flags {}: before={} after={}", s-m_stack.begin(), buf1, buf2, buf3, buf4); +#endif + s->flags |= on; + s->flags &= ~off; +} + +void Parser::rem_flags(flag_t off, State * s) +{ +#ifdef RYML_DBG + char buf1_[64], buf2_[64], buf3_[64]; + csubstr buf1 = _prfl(buf1_, off); + csubstr buf2 = _prfl(buf2_, s->flags); + csubstr buf3 = _prfl(buf3_, s->flags&(~off)); + _c4dbgpf("state[{}]: removing flags {}: before={} after={}", s-m_stack.begin(), buf1, buf2, buf3); +#endif + s->flags &= ~off; +} + +//----------------------------------------------------------------------------- + +csubstr Parser::_prfl(substr buf, flag_t flags) +{ + size_t pos = 0; + bool gotone = false; + + #define _prflag(fl) \ + if((flags & fl) == (fl)) \ + { \ + if(gotone) \ + { \ + if(pos + 1 < buf.len) \ + buf[pos] = '|'; \ + ++pos; \ + } \ + csubstr fltxt = #fl; \ + if(pos + fltxt.len <= buf.len) \ + memcpy(buf.str + pos, fltxt.str, fltxt.len); \ + pos += fltxt.len; \ + gotone = true; \ + } + + _prflag(RTOP); + _prflag(RUNK); + _prflag(RMAP); + _prflag(RSEQ); + _prflag(FLOW); + _prflag(QMRK); + _prflag(RKEY); + _prflag(RVAL); + _prflag(RNXT); + _prflag(SSCL); + _prflag(QSCL); + _prflag(RSET); + _prflag(NDOC); + _prflag(RSEQIMAP); + + #undef _prflag + + RYML_ASSERT(pos <= buf.len); + + return buf.first(pos); +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +void Parser::_grow_filter_arena(size_t num_characters_needed) +{ + _c4dbgpf("grow: arena={} numchars={}", m_filter_arena.len, num_characters_needed); + if(num_characters_needed <= m_filter_arena.len) + return; + size_t sz = m_filter_arena.len << 1; + _c4dbgpf("grow: sz={}", sz); + sz = num_characters_needed > sz ? num_characters_needed : sz; + _c4dbgpf("grow: sz={}", sz); + sz = sz < 128u ? 
128u : sz; + _c4dbgpf("grow: sz={}", sz); + _RYML_CB_ASSERT(m_stack.m_callbacks, sz >= num_characters_needed); + _resize_filter_arena(sz); +} + +void Parser::_resize_filter_arena(size_t num_characters) +{ + if(num_characters > m_filter_arena.len) + { + _c4dbgpf("resize: sz={}", num_characters); + char *prev = m_filter_arena.str; + if(m_filter_arena.str) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, m_filter_arena.len > 0); + _RYML_CB_FREE(m_stack.m_callbacks, m_filter_arena.str, char, m_filter_arena.len); + } + m_filter_arena.str = _RYML_CB_ALLOC_HINT(m_stack.m_callbacks, char, num_characters, prev); + m_filter_arena.len = num_characters; + } +} + +substr Parser::_finish_filter_arena(substr dst, size_t pos) +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len); + _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= dst.len); + memcpy(dst.str, m_filter_arena.str, pos); + return dst.first(pos); +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +csubstr Parser::location_contents(Location const& loc) const +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, loc.offset < m_buf.len); + return m_buf.sub(loc.offset); +} + +Location Parser::location(ConstNodeRef node) const +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, node.valid()); + return location(*node.tree(), node.id()); +} + +Location Parser::location(Tree const& tree, size_t node) const +{ + // try hard to avoid getting the location from a null string. + Location loc; + if(_location_from_node(tree, node, &loc, 0)) + return loc; + return val_location(m_buf.str); +} + +bool Parser::_location_from_node(Tree const& tree, size_t node, Location *C4_RESTRICT loc, size_t level) const +{ + if(tree.has_key(node)) + { + csubstr k = tree.key(node); + if(C4_LIKELY(k.str != nullptr)) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, k.is_sub(m_buf)); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(k)); + *loc = val_location(k.str); + return true; + } + } + + if(tree.has_val(node)) + { + csubstr v = tree.val(node); + if(C4_LIKELY(v.str != nullptr)) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, v.is_sub(m_buf)); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(v)); + *loc = val_location(v.str); + return true; + } + } + + if(tree.is_container(node)) + { + if(_location_from_cont(tree, node, loc)) + return true; + } + + if(tree.type(node) != NOTYPE && level == 0) + { + // try the prev sibling + { + const size_t prev = tree.prev_sibling(node); + if(prev != NONE) + { + if(_location_from_node(tree, prev, loc, level+1)) + return true; + } + } + // try the next sibling + { + const size_t next = tree.next_sibling(node); + if(next != NONE) + { + if(_location_from_node(tree, next, loc, level+1)) + return true; + } + } + // try the parent + { + const size_t parent = tree.parent(node); + if(parent != NONE) + { + if(_location_from_node(tree, parent, loc, level+1)) + return true; + } + } + } + + return false; +} + +bool Parser::_location_from_cont(Tree const& tree, size_t node, Location *C4_RESTRICT loc) const +{ + _RYML_CB_ASSERT(m_stack.m_callbacks, tree.is_container(node)); + if(!tree.is_stream(node)) + { + const char *node_start = tree._p(node)->m_val.scalar.str; // this was stored in the container + if(tree.has_children(node)) + { + size_t child = tree.first_child(node); + if(tree.has_key(child)) + { + // when a map starts, the container was set after the key + csubstr k = 
tree.key(child); + if(k.str && node_start > k.str) + node_start = k.str; + } + } + *loc = val_location(node_start); + return true; + } + else // it's a stream + { + *loc = val_location(m_buf.str); // just return the front of the buffer + } + return true; +} + + +Location Parser::val_location(const char *val) const +{ + if(C4_UNLIKELY(val == nullptr)) + return {m_file, 0, 0, 0}; + + _RYML_CB_CHECK(m_stack.m_callbacks, m_options.locations()); + // NOTE: if any of these checks fails, the parser needs to be + // instantiated with locations enabled. + _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.str == m_newline_offsets_buf.str); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.len == m_newline_offsets_buf.len); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_options.locations()); + _RYML_CB_ASSERT(m_stack.m_callbacks, !_locations_dirty()); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_newline_offsets != nullptr); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_newline_offsets_size > 0); + // NOTE: the pointer needs to belong to the buffer that was used to parse. + csubstr src = m_buf; + _RYML_CB_CHECK(m_stack.m_callbacks, val != nullptr || src.str == nullptr); + _RYML_CB_CHECK(m_stack.m_callbacks, (val >= src.begin() && val <= src.end()) || (src.str == nullptr && val == nullptr)); + // ok. search the first stored newline after the given ptr + using lineptr_type = size_t const* C4_RESTRICT; + lineptr_type lineptr = nullptr; + size_t offset = (size_t)(val - src.begin()); + if(m_newline_offsets_size < 30) // TODO magic number + { + // just do a linear search if the size is small. + for(lineptr_type curr = m_newline_offsets, last = m_newline_offsets + m_newline_offsets_size; curr < last; ++curr) + { + if(*curr > offset) + { + lineptr = curr; + break; + } + } + } + else + { + // do a bisection search if the size is not small. + // + // We could use std::lower_bound but this is simple enough and + // spares the include of . 
+ size_t count = m_newline_offsets_size; + size_t step; + lineptr_type it; + lineptr = m_newline_offsets; + while(count) + { + step = count >> 1; + it = lineptr + step; + if(*it < offset) + { + lineptr = ++it; + count -= step + 1; + } + else + { + count = step; + } + } + } + _RYML_CB_ASSERT(m_stack.m_callbacks, lineptr >= m_newline_offsets); + _RYML_CB_ASSERT(m_stack.m_callbacks, lineptr <= m_newline_offsets + m_newline_offsets_size); + _RYML_CB_ASSERT(m_stack.m_callbacks, *lineptr > offset); + Location loc; + loc.name = m_file; + loc.offset = offset; + loc.line = (size_t)(lineptr - m_newline_offsets); + if(lineptr > m_newline_offsets) + loc.col = (offset - *(lineptr-1) - 1u); + else + loc.col = offset; + return loc; +} + +void Parser::_prepare_locations() +{ + m_newline_offsets_buf = m_buf; + size_t numnewlines = 1u + m_buf.count('\n'); + _resize_locations(numnewlines); + m_newline_offsets_size = 0; + for(size_t i = 0; i < m_buf.len; i++) + if(m_buf[i] == '\n') + m_newline_offsets[m_newline_offsets_size++] = i; + m_newline_offsets[m_newline_offsets_size++] = m_buf.len; + _RYML_CB_ASSERT(m_stack.m_callbacks, m_newline_offsets_size == numnewlines); +} + +void Parser::_resize_locations(size_t numnewlines) +{ + if(numnewlines > m_newline_offsets_capacity) + { + if(m_newline_offsets) + _RYML_CB_FREE(m_stack.m_callbacks, m_newline_offsets, size_t, m_newline_offsets_capacity); + m_newline_offsets = _RYML_CB_ALLOC_HINT(m_stack.m_callbacks, size_t, numnewlines, m_newline_offsets); + m_newline_offsets_capacity = numnewlines; + } +} + +bool Parser::_locations_dirty() const +{ + return !m_newline_offsets_size; +} + +} // namespace yml +} // namespace c4 + + +#if defined(_MSC_VER) +# pragma warning(pop) +#elif defined(__clang__) +# pragma clang diagnostic pop +#elif defined(__GNUC__) +# pragma GCC diagnostic pop +#endif + +#endif /* RYML_SINGLE_HDR_DEFINE_NOW */ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/parse.cpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/node.cpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/node.cpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifdef RYML_SINGLE_HDR_DEFINE_NOW +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/node.hpp +//#include "c4/yml/node.hpp" +#if !defined(C4_YML_NODE_HPP_) && !defined(_C4_YML_NODE_HPP_) +#error "amalgamate: file c4/yml/node.hpp must have been included at this point" +#endif /* C4_YML_NODE_HPP_ */ + + +namespace c4 { +namespace yml { + + + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +size_t NodeRef::set_key_serialized(c4::fmt::const_base64_wrapper w) +{ + _apply_seed(); + csubstr encoded = this->to_arena(w); + this->set_key(encoded); + return encoded.len; +} + +size_t NodeRef::set_val_serialized(c4::fmt::const_base64_wrapper w) +{ + _apply_seed(); + csubstr encoded = this->to_arena(w); + this->set_val(encoded); + return encoded.len; +} + +} // namespace yml +} // namespace c4 + +#endif /* RYML_SINGLE_HDR_DEFINE_NOW */ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/node.cpp) + + + 
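An aside on usage: the location API implemented in the parse.cpp section above (Parser::location, Parser::val_location, and the newline-offset index built by _prepare_locations) maps a parsed node back to its offset, line, and column in the source buffer. The sketch below is illustrative only and not part of this diff; the include path is a placeholder, and it assumes a parser constructed with ParserOptions().locations(true), which the assertions in val_location require.

```cpp
// Illustrative sketch only -- not part of this diff. "rapidyaml.hh" is a
// placeholder for wherever this amalgamated header is vendored.
#include <cstdio>
#include "rapidyaml.hh"

int main()
{
    // Location tracking must be enabled up front; val_location() checks this.
    ryml::Parser parser(ryml::ParserOptions().locations(true));
    ryml::csubstr yaml = "{mode: outoforder, cores: 1}";
    ryml::Tree tree = parser.parse_in_arena("example.yml", yaml);
    // Resolve a node back to its position in the parsed buffer.
    ryml::Location loc = parser.location(tree["cores"]);
    std::printf("'cores' starts at offset %zu (line %zu, col %zu)\n",
                loc.offset, loc.line, loc.col);
    return 0;
}
```

Note that val_location asserts both that location tracking was enabled and that the newline index was built before the first lookup, so the option must be set on the parser before parsing rather than afterwards.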
+//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/preprocess.hpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/preprocess.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_YML_PREPROCESS_HPP_ +#define _C4_YML_PREPROCESS_HPP_ + +/** @file preprocess.hpp Functions for preprocessing YAML prior to parsing. */ + +/** @defgroup Preprocessors Preprocessor functions + * + * These are the existing preprocessors: + * + * @code{.cpp} + * size_t preprocess_json(csubstr json, substr buf) + * size_t preprocess_rxmap(csubstr json, substr buf) + * @endcode + */ + +#ifndef _C4_YML_COMMON_HPP_ +//included above: +//#include "./common.hpp" +#endif +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/substr.hpp +//#include +#if !defined(C4_SUBSTR_HPP_) && !defined(_C4_SUBSTR_HPP_) +#error "amalgamate: file c4/substr.hpp must have been included at this point" +#endif /* C4_SUBSTR_HPP_ */ + + + +namespace c4 { +namespace yml { + +namespace detail { +using Preprocessor = size_t(csubstr, substr); +template +substr preprocess_into_container(csubstr input, CharContainer *out) +{ + // try to write once. the preprocessor will stop writing at the end of + // the container, but will process all the input to determine the + // required container size. + size_t sz = PP(input, to_substr(*out)); + // if the container size is not enough, resize, and run again in the + // resized container + if(sz > out->size()) + { + out->resize(sz); + sz = PP(input, to_substr(*out)); + } + return to_substr(*out).first(sz); +} +} // namespace detail + + +//----------------------------------------------------------------------------- + +/** @name preprocess_rxmap + * Convert flow-type relaxed maps (with implicit bools) into strict YAML + * flow map. + * + * @code{.yaml} + * {a, b, c, d: [e, f], g: {a, b}} + * # is converted into this: + * {a: 1, b: 1, c: 1, d: [e, f], g: {a, b}} + * @endcode + + * @note this is NOT recursive - conversion happens only in the top-level map + * @param rxmap A relaxed map + * @param buf output buffer + * @param out output container + */ + +//@{ + +/** Write into a given output buffer. This function is safe to call with + * empty or small buffers; it won't write beyond the end of the buffer. + * + * @return the number of characters required for output + */ +RYML_EXPORT size_t preprocess_rxmap(csubstr rxmap, substr buf); + + +/** Write into an existing container. It is resized to contained the output. + * @return a substr of the container + * @overload preprocess_rxmap */ +template +substr preprocess_rxmap(csubstr rxmap, CharContainer *out) +{ + return detail::preprocess_into_container(rxmap, out); +} + + +/** Create a container with the result. 
+ * @overload preprocess_rxmap */ +template +CharContainer preprocess_rxmap(csubstr rxmap) +{ + CharContainer out; + preprocess_rxmap(rxmap, &out); + return out; +} + +//@} + +} // namespace yml +} // namespace c4 + +#endif /* _C4_YML_PREPROCESS_HPP_ */ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/preprocess.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/preprocess.cpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/preprocess.cpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifdef RYML_SINGLE_HDR_DEFINE_NOW +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/preprocess.hpp +//#include "c4/yml/preprocess.hpp" +#if !defined(C4_YML_PREPROCESS_HPP_) && !defined(_C4_YML_PREPROCESS_HPP_) +#error "amalgamate: file c4/yml/preprocess.hpp must have been included at this point" +#endif /* C4_YML_PREPROCESS_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/detail/parser_dbg.hpp +//#include "c4/yml/detail/parser_dbg.hpp" +#if !defined(C4_YML_DETAIL_PARSER_DBG_HPP_) && !defined(_C4_YML_DETAIL_PARSER_DBG_HPP_) +#error "amalgamate: file c4/yml/detail/parser_dbg.hpp must have been included at this point" +#endif /* C4_YML_DETAIL_PARSER_DBG_HPP_ */ + + +/** @file preprocess.hpp Functions for preprocessing YAML prior to parsing. */ + +namespace c4 { +namespace yml { + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +namespace { +C4_ALWAYS_INLINE bool _is_idchar(char c) +{ + return (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || (c >= '0' && c <= '9') + || (c == '_' || c == '-' || c == '~' || c == '$'); +} + +typedef enum { kReadPending = 0, kKeyPending = 1, kValPending = 2 } _ppstate; +C4_ALWAYS_INLINE _ppstate _next(_ppstate s) +{ + int n = (int)s + 1; + return (_ppstate)(n <= (int)kValPending ? n : 0); +} +} // empty namespace + + +//----------------------------------------------------------------------------- + +size_t preprocess_rxmap(csubstr s, substr buf) +{ + detail::_SubstrWriter writer(buf); + _ppstate state = kReadPending; + size_t last = 0; + + if(s.begins_with('{')) + { + RYML_CHECK(s.ends_with('}')); + s = s.offs(1, 1); + } + + writer.append('{'); + + for(size_t i = 0; i < s.len; ++i) + { + const char curr = s[i]; + const char next = i+1 < s.len ? 
s[i+1] : '\0'; + + if(curr == '\'' || curr == '"') + { + csubstr ss = s.sub(i).pair_range_esc(curr, '\\'); + i += static_cast(ss.end() - (s.str + i)); + state = _next(state); + } + else if(state == kReadPending && _is_idchar(curr)) + { + state = _next(state); + } + + switch(state) + { + case kKeyPending: + { + if(curr == ':' && next == ' ') + { + state = _next(state); + } + else if(curr == ',' && next == ' ') + { + writer.append(s.range(last, i)); + writer.append(": 1, "); + last = i + 2; + } + break; + } + case kValPending: + { + if(curr == '[' || curr == '{' || curr == '(') + { + csubstr ss = s.sub(i).pair_range_nested(curr, '\\'); + i += static_cast(ss.end() - (s.str + i)); + state = _next(state); + } + else if(curr == ',' && next == ' ') + { + state = _next(state); + } + break; + } + default: + // nothing to do + break; + } + } + + writer.append(s.sub(last)); + if(state == kKeyPending) + writer.append(": 1"); + writer.append('}'); + + return writer.pos; +} + + +} // namespace yml +} // namespace c4 + +#endif /* RYML_SINGLE_HDR_DEFINE_NOW */ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/preprocess.cpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/detail/checks.hpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/detail/checks.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef C4_YML_DETAIL_CHECKS_HPP_ +#define C4_YML_DETAIL_CHECKS_HPP_ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/tree.hpp +//#include "c4/yml/tree.hpp" +#if !defined(C4_YML_TREE_HPP_) && !defined(_C4_YML_TREE_HPP_) +#error "amalgamate: file c4/yml/tree.hpp must have been included at this point" +#endif /* C4_YML_TREE_HPP_ */ + + +#ifdef __clang__ +# pragma clang diagnostic push +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wtype-limits" // error: comparison of unsigned expression >= 0 is always true +#elif defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable: 4296/*expression is always 'boolean_value'*/) +#endif + +namespace c4 { +namespace yml { + + +void check_invariants(Tree const& t, size_t node=NONE); +void check_free_list(Tree const& t); +void check_arena(Tree const& t); + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +inline void check_invariants(Tree const& t, size_t node) +{ + if(node == NONE) + { + if(t.size() == 0) return; + node = t.root_id(); + } + + auto const& n = *t._p(node); +#ifdef RYML_DBG + if(n.m_first_child != NONE || n.m_last_child != NONE) + { + printf("check(%zu): fc=%zu lc=%zu\n", node, n.m_first_child, n.m_last_child); + } + else + { + printf("check(%zu)\n", node); + } +#endif + + C4_CHECK(n.m_parent != node); + if(n.m_parent == NONE) + { + C4_CHECK(t.is_root(node)); + } + else //if(n.m_parent != NONE) + { + C4_CHECK(t.has_child(n.m_parent, node)); + + auto const& p = *t._p(n.m_parent); + if(n.m_prev_sibling == NONE) + { + C4_CHECK(p.m_first_child == node); + C4_CHECK(t.first_sibling(node) == node); + } + else + { + C4_CHECK(p.m_first_child != node); + C4_CHECK(t.first_sibling(node) != node); + } + + 
if(n.m_next_sibling == NONE) + { + C4_CHECK(p.m_last_child == node); + C4_CHECK(t.last_sibling(node) == node); + } + else + { + C4_CHECK(p.m_last_child != node); + C4_CHECK(t.last_sibling(node) != node); + } + } + + C4_CHECK(n.m_first_child != node); + C4_CHECK(n.m_last_child != node); + if(n.m_first_child != NONE || n.m_last_child != NONE) + { + C4_CHECK(n.m_first_child != NONE); + C4_CHECK(n.m_last_child != NONE); + } + + C4_CHECK(n.m_prev_sibling != node); + C4_CHECK(n.m_next_sibling != node); + if(n.m_prev_sibling != NONE) + { + C4_CHECK(t._p(n.m_prev_sibling)->m_next_sibling == node); + C4_CHECK(t._p(n.m_prev_sibling)->m_prev_sibling != node); + } + if(n.m_next_sibling != NONE) + { + C4_CHECK(t._p(n.m_next_sibling)->m_prev_sibling == node); + C4_CHECK(t._p(n.m_next_sibling)->m_next_sibling != node); + } + + size_t count = 0; + for(size_t i = n.m_first_child; i != NONE; i = t.next_sibling(i)) + { +#ifdef RYML_DBG + printf("check(%zu): descend to child[%zu]=%zu\n", node, count, i); +#endif + auto const& ch = *t._p(i); + C4_CHECK(ch.m_parent == node); + C4_CHECK(ch.m_next_sibling != i); + ++count; + } + C4_CHECK(count == t.num_children(node)); + + if(n.m_prev_sibling == NONE && n.m_next_sibling == NONE) + { + if(n.m_parent != NONE) + { + C4_CHECK(t.num_children(n.m_parent) == 1); + C4_CHECK(t.num_siblings(node) == 1); + } + } + + if(node == t.root_id()) + { + C4_CHECK(t.size() == t.m_size); + C4_CHECK(t.capacity() == t.m_cap); + C4_CHECK(t.m_cap == t.m_size + t.slack()); + check_free_list(t); + check_arena(t); + } + + for(size_t i = t.first_child(node); i != NONE; i = t.next_sibling(i)) + { + check_invariants(t, i); + } +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +inline void check_free_list(Tree const& t) +{ + if(t.m_free_head == NONE) + { + C4_CHECK(t.m_free_tail == t.m_free_head); + return; + } + + C4_CHECK(t.m_free_head >= 0 && t.m_free_head < t.m_cap); + C4_CHECK(t.m_free_tail >= 0 && t.m_free_tail < t.m_cap); + + auto const& head = *t._p(t.m_free_head); + //auto const& tail = *t._p(t.m_free_tail); + + //C4_CHECK(head.m_prev_sibling == NONE); + //C4_CHECK(tail.m_next_sibling == NONE); + + size_t count = 0; + for(size_t i = t.m_free_head, prev = NONE; i != NONE; i = t._p(i)->m_next_sibling) + { + auto const& elm = *t._p(i); + if(&elm != &head) + { + C4_CHECK(elm.m_prev_sibling == prev); + } + prev = i; + ++count; + } + C4_CHECK(count == t.slack()); +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +inline void check_arena(Tree const& t) +{ + C4_CHECK(t.m_arena.len == 0 || (t.m_arena_pos >= 0 && t.m_arena_pos <= t.m_arena.len)); + C4_CHECK(t.arena_size() == t.m_arena_pos); + C4_CHECK(t.arena_slack() + t.m_arena_pos == t.m_arena.len); +} + + +} /* namespace yml */ +} /* namespace c4 */ + +#ifdef __clang__ +# pragma clang diagnostic pop +#elif defined(__GNUC__) +# pragma GCC diagnostic pop +#elif defined(_MSC_VER) +# pragma warning(pop) +#endif + +#endif /* C4_YML_DETAIL_CHECKS_HPP_ */ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/detail/checks.hpp) + + + +//******************************************************************************** 
+//-------------------------------------------------------------------------------- +// src/c4/yml/detail/print.hpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/detail/print.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef C4_YML_DETAIL_PRINT_HPP_ +#define C4_YML_DETAIL_PRINT_HPP_ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/tree.hpp +//#include "c4/yml/tree.hpp" +#if !defined(C4_YML_TREE_HPP_) && !defined(_C4_YML_TREE_HPP_) +#error "amalgamate: file c4/yml/tree.hpp must have been included at this point" +#endif /* C4_YML_TREE_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/node.hpp +//#include "c4/yml/node.hpp" +#if !defined(C4_YML_NODE_HPP_) && !defined(_C4_YML_NODE_HPP_) +#error "amalgamate: file c4/yml/node.hpp must have been included at this point" +#endif /* C4_YML_NODE_HPP_ */ + + + +namespace c4 { +namespace yml { + + +inline size_t print_node(Tree const& p, size_t node, int level, size_t count, bool print_children) +{ + printf("[%zd]%*s[%zd] %p", count, (2*level), "", node, (void*)p.get(node)); + if(p.is_root(node)) + { + printf(" [ROOT]"); + } + printf(" %s:", p.type_str(node)); + if(p.has_key(node)) + { + if(p.has_key_anchor(node)) + { + csubstr ka = p.key_anchor(node); + printf(" &%.*s", (int)ka.len, ka.str); + } + if(p.has_key_tag(node)) + { + csubstr kt = p.key_tag(node); + csubstr k = p.key(node); + printf(" %.*s '%.*s'", (int)kt.len, kt.str, (int)k.len, k.str); + } + else + { + csubstr k = p.key(node); + printf(" '%.*s'", (int)k.len, k.str); + } + } + else + { + RYML_ASSERT( ! p.has_key_tag(node)); + } + if(p.has_val(node)) + { + if(p.has_val_tag(node)) + { + csubstr vt = p.val_tag(node); + csubstr v = p.val(node); + printf(" %.*s '%.*s'", (int)vt.len, vt.str, (int)v.len, v.str); + } + else + { + csubstr v = p.val(node); + printf(" '%.*s'", (int)v.len, v.str); + } + } + else + { + if(p.has_val_tag(node)) + { + csubstr vt = p.val_tag(node); + printf(" %.*s", (int)vt.len, vt.str); + } + } + if(p.has_val_anchor(node)) + { + auto &a = p.val_anchor(node); + printf(" valanchor='&%.*s'", (int)a.len, a.str); + } + printf(" (%zd sibs)", p.num_siblings(node)); + + ++count; + + if(p.is_container(node)) + { + printf(" %zd children:\n", p.num_children(node)); + if(print_children) + { + for(size_t i = p.first_child(node); i != NONE; i = p.next_sibling(i)) + { + count = print_node(p, i, level+1, count, print_children); + } + } + } + else + { + printf("\n"); + } + + return count; +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +inline void print_node(ConstNodeRef const& p, int level=0) +{ + print_node(*p.tree(), p.id(), level, 0, true); +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +inline size_t print_tree(Tree const& p, size_t node=NONE) +{ + printf("--------------------------------------\n"); + size_t ret = 0; + if(!p.empty()) + { + if(node == NONE) + node = p.root_id(); + ret = print_node(p, node, 0, 0, true); + } + printf("#nodes=%zd vs #printed=%zd\n", 
p.size(), ret); + printf("--------------------------------------\n"); + return ret; +} + + +} /* namespace yml */ +} /* namespace c4 */ + + +#endif /* C4_YML_DETAIL_PRINT_HPP_ */ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/detail/print.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/c4/yml/yml.hpp +// https://github.com/biojppm/rapidyaml/src/c4/yml/yml.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _C4_YML_YML_HPP_ +#define _C4_YML_YML_HPP_ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/tree.hpp +//#include "c4/yml/tree.hpp" +#if !defined(C4_YML_TREE_HPP_) && !defined(_C4_YML_TREE_HPP_) +#error "amalgamate: file c4/yml/tree.hpp must have been included at this point" +#endif /* C4_YML_TREE_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/node.hpp +//#include "c4/yml/node.hpp" +#if !defined(C4_YML_NODE_HPP_) && !defined(_C4_YML_NODE_HPP_) +#error "amalgamate: file c4/yml/node.hpp must have been included at this point" +#endif /* C4_YML_NODE_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/emit.hpp +//#include "c4/yml/emit.hpp" +#if !defined(C4_YML_EMIT_HPP_) && !defined(_C4_YML_EMIT_HPP_) +#error "amalgamate: file c4/yml/emit.hpp must have been included at this point" +#endif /* C4_YML_EMIT_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/parse.hpp +//#include "c4/yml/parse.hpp" +#if !defined(C4_YML_PARSE_HPP_) && !defined(_C4_YML_PARSE_HPP_) +#error "amalgamate: file c4/yml/parse.hpp must have been included at this point" +#endif /* C4_YML_PARSE_HPP_ */ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/preprocess.hpp +//#include "c4/yml/preprocess.hpp" +#if !defined(C4_YML_PREPROCESS_HPP_) && !defined(_C4_YML_PREPROCESS_HPP_) +#error "amalgamate: file c4/yml/preprocess.hpp must have been included at this point" +#endif /* C4_YML_PREPROCESS_HPP_ */ + + +#endif // _C4_YML_YML_HPP_ + + +// (end https://github.com/biojppm/rapidyaml/src/c4/yml/yml.hpp) + + + +//******************************************************************************** +//-------------------------------------------------------------------------------- +// src/ryml.hpp +// https://github.com/biojppm/rapidyaml/src/ryml.hpp +//-------------------------------------------------------------------------------- +//******************************************************************************** + +#ifndef _RYML_HPP_ +#define _RYML_HPP_ + +// amalgamate: removed include of +// https://github.com/biojppm/rapidyaml/src/c4/yml/yml.hpp +//#include "c4/yml/yml.hpp" +#if !defined(C4_YML_YML_HPP_) && !defined(_C4_YML_YML_HPP_) +#error "amalgamate: file c4/yml/yml.hpp must have been included at this point" +#endif /* C4_YML_YML_HPP_ */ + + +namespace ryml { +using namespace c4::yml; +using namespace c4; +} + +#endif /* _RYML_HPP_ */ + + +// (end https://github.com/biojppm/rapidyaml/src/ryml.hpp) + +#endif /* _RYML_SINGLE_HEADER_AMALGAMATED_HPP_ */ + diff --git a/src/include/simeng/kernel/Linux.hh b/src/include/simeng/kernel/Linux.hh index 0908d59006..a37b96cda5 100644 --- a/src/include/simeng/kernel/Linux.hh +++ 
b/src/include/simeng/kernel/Linux.hh @@ -14,26 +14,26 @@ namespace kernel { /** Fixed-width definition of `stat`. * Defined by Linux kernel in include/uapi/asm-generic/stat.h */ struct stat { - uint64_t dev; // offset = 0 - uint64_t ino; // offset = 8 - uint32_t mode; // offset = 16 - uint32_t nlink; // offset = 20 - uint32_t uid; // offset = 24 - uint32_t gid; // offset = 28 - uint64_t rdev; // offset = 32 - uint64_t padding1; // offset = 40 - int64_t size; // offset = 48 - int32_t blksize; // offset = 56 - uint32_t padding2; // offset = 60 - int64_t blocks; // offset = 64 - int64_t atime; // offset = 72 - uint64_t atimensec; // offset = 80 - int64_t mtime; // offset = 88 - uint64_t mtimensec; // offset = 96 - int64_t ctime; // offset = 104 - uint64_t ctimensec; // offset = 112 - uint32_t padding3; // offset = 116 - uint32_t padding4; // offset = 124 + uint64_t dev = 0; // offset = 0 + uint64_t ino = 0; // offset = 8 + uint32_t mode = 0; // offset = 16 + uint32_t nlink = 0; // offset = 20 + uint32_t uid = 0; // offset = 24 + uint32_t gid = 0; // offset = 28 + uint64_t rdev = 0; // offset = 32 + uint64_t padding1 = 0; // offset = 40 + int64_t size = 0; // offset = 48 + int32_t blksize = 0; // offset = 56 + uint32_t padding2 = 0; // offset = 60 + int64_t blocks = 0; // offset = 64 + int64_t atime = 0; // offset = 72 + uint64_t atimensec = 0; // offset = 80 + int64_t mtime = 0; // offset = 88 + uint64_t mtimensec = 0; // offset = 96 + int64_t ctime = 0; // offset = 104 + uint64_t ctimensec = 0; // offset = 112 + uint32_t padding3 = 0; // offset = 116 + uint32_t padding4 = 0; // offset = 124 }; /** Fixed-width definition of `termios`. @@ -89,7 +89,8 @@ struct LinuxProcessState { /** The clear_child_tid value. */ uint64_t clearChildTid = 0; - /** The virtual file descriptor mapping table. */ + /** The virtual file descriptor mapping table. Maps virtual file descriptors + * to host file descriptors */ std::vector fileDescriptorTable; /** Set of deallocated virtual file descriptors available for reuse. */ std::set freeFileDescriptors; @@ -130,6 +131,9 @@ struct linux_dirent64 { to Linux system calls. */ class Linux { public: + Linux(const std::string specialFiledirPath) + : specialFilesDir_(specialFiledirPath) {} + /** Create a new Linux process running above this kernel. */ void createProcess(const LinuxProcess& process); @@ -156,7 +160,7 @@ class Linux { int64_t flag); /** close syscall: close a file descriptor. */ - int64_t close(int64_t fd); + int64_t close(int64_t vfd); /** newfstatat syscall: get file status; AKA fstatat. */ int64_t newfstatat(int64_t dfd, const std::string& filename, stat& out, @@ -201,7 +205,7 @@ class Linux { off_t offset); /** openat syscall: open/create a file. */ - int64_t openat(int64_t dirfd, const std::string& path, int64_t flags, + int64_t openat(int64_t vdfd, const std::string& pathname, int64_t flags, uint16_t mode); /** readlinkat syscall: read value of a symbolic link. */ @@ -237,9 +241,11 @@ class Linux { static const size_t LINUX_PATH_MAX = 4096; private: - /** Resturn correct Dirfd depending on given pathname abd dirfd given to - * syscall. */ - uint64_t getDirFd(int64_t dfd, std::string pathname); + /** Return the host directory file descriptor mapped to by the virtual dfd + * given to syscall. If vdfd is Linux::AT_FDCWD (-100) then Host::AT_FDCWD is + * returned + */ + int64_t getHostDirFD(int64_t vdfd); /** If the given filepath points to a special file, the filepath is replaced * to point to the SimEng equivalent. 
*/ @@ -252,7 +258,7 @@ class Linux { std::unordered_map specialPathTranslations_; /** Path to the root of the replacement special files. */ - const std::string specialFilesDir_ = SIMENG_BUILD_DIR "/specialFiles"; + const std::string specialFilesDir_; /** Vector of all currently supported special file paths & files.*/ std::vector supportedSpecialFiles_; diff --git a/src/include/simeng/kernel/LinuxProcess.hh b/src/include/simeng/kernel/LinuxProcess.hh index 2f13a7727d..9d3fcf1c25 100644 --- a/src/include/simeng/kernel/LinuxProcess.hh +++ b/src/include/simeng/kernel/LinuxProcess.hh @@ -3,7 +3,7 @@ #include #include "simeng/Elf.hh" -#include "yaml-cpp/yaml.h" +#include "simeng/config/SimInfo.hh" namespace simeng { namespace kernel { @@ -40,7 +40,7 @@ uint64_t alignToBoundary(uint64_t value, uint64_t boundary); * * The constructed process follows a typical layout: * - * |---------------| <- start of stack + * |---------------| <- start/bottom of stack * | Stack | stack grows downwards * |-v-----------v-| * | | @@ -63,18 +63,21 @@ class LinuxProcess { /** Construct a Linux process from a vector of command-line arguments. * * The first argument is a path to an executable ELF file. */ - LinuxProcess(const std::vector& commandLine, YAML::Node config); + LinuxProcess(const std::vector& commandLine, + ryml::ConstNodeRef config = config::SimInfo::getConfig()); /** Construct a Linux process from region of instruction memory, with the - * entry point fixed at 0. */ - LinuxProcess(span instructions, YAML::Node config); + * entry point fixed at 0 and source directory set to the default programs'. + * For use in test suites. */ + LinuxProcess(span instructions, + ryml::ConstNodeRef config = config::SimInfo::getConfig()); ~LinuxProcess(); /** Get the address of the start of the heap region. */ uint64_t getHeapStart() const; - /** Get the address of the top of the stack. */ + /** Get the address of the bottom of the stack. */ uint64_t getStackStart() const; /** Get the address of the start of the mmap region. */ @@ -92,8 +95,8 @@ class LinuxProcess { /** Get the entry point. */ uint64_t getEntryPoint() const; - /** Get the initial stack pointer address. */ - uint64_t getStackPointer() const; + /** Get the initial stack pointer. */ + uint64_t getInitialStackPointer() const; /** Get the path of the executable. */ std::string getPath() const; @@ -132,7 +135,7 @@ class LinuxProcess { /** The page size of the process memory. */ const uint64_t pageSize_ = 4096; - /** The address of the stack pointer. */ + /** The address of the head/top of the stack */ uint64_t stackPointer_; /** The process image size. */ diff --git a/src/include/simeng/FixedLatencyMemoryInterface.hh b/src/include/simeng/memory/FixedLatencyMemoryInterface.hh similarity index 94% rename from src/include/simeng/FixedLatencyMemoryInterface.hh rename to src/include/simeng/memory/FixedLatencyMemoryInterface.hh index dddbf9eca5..44cbf7adcc 100644 --- a/src/include/simeng/FixedLatencyMemoryInterface.hh +++ b/src/include/simeng/memory/FixedLatencyMemoryInterface.hh @@ -3,10 +3,12 @@ #include #include -#include "simeng/MemoryInterface.hh" +#include "simeng/memory/MemoryInterface.hh" namespace simeng { +namespace memory { + /** A fixed-latency memory interface request. */ struct FixedLatencyMemoryInterfaceRequest { /** Is this a write request? */ @@ -57,7 +59,7 @@ class FixedLatencyMemoryInterface : public MemoryInterface { /** Clear the completed reads. 
*/ void clearCompletedReads() override; - /** Returns true if there are any oustanding memory requests in-flight. */ + /** Returns true if there are any outstanding memory requests in-flight. */ bool hasPendingRequests() const override; /** Tick the memory model to process the request queue. */ @@ -86,4 +88,5 @@ class FixedLatencyMemoryInterface : public MemoryInterface { } }; +} // namespace memory } // namespace simeng diff --git a/src/include/simeng/FlatMemoryInterface.hh b/src/include/simeng/memory/FlatMemoryInterface.hh similarity index 88% rename from src/include/simeng/FlatMemoryInterface.hh rename to src/include/simeng/memory/FlatMemoryInterface.hh index f85e2f8491..a1cb1ff8d4 100644 --- a/src/include/simeng/FlatMemoryInterface.hh +++ b/src/include/simeng/memory/FlatMemoryInterface.hh @@ -2,10 +2,12 @@ #include -#include "simeng/MemoryInterface.hh" +#include "simeng/memory/MemoryInterface.hh" namespace simeng { +namespace memory { + /** A memory interface to a flat memory system. */ class FlatMemoryInterface : public MemoryInterface { public: @@ -27,7 +29,7 @@ class FlatMemoryInterface : public MemoryInterface { /** Clear the completed reads. */ void clearCompletedReads() override; - /** Returns true if there are any oustanding memory requests in-flight. */ + /** Returns true if there are any outstanding memory requests in-flight. */ bool hasPendingRequests() const override; /** Tick: do nothing */ @@ -42,4 +44,5 @@ class FlatMemoryInterface : public MemoryInterface { std::vector completedReads_; }; +} // namespace memory } // namespace simeng diff --git a/src/include/simeng/memory/MemoryAccessTarget.hh b/src/include/simeng/memory/MemoryAccessTarget.hh new file mode 100644 index 0000000000..4962f7567f --- /dev/null +++ b/src/include/simeng/memory/MemoryAccessTarget.hh @@ -0,0 +1,26 @@ +#pragma once + +namespace simeng { + +namespace memory { + +/** A generic memory access target; describes a region of memory to access. */ +struct MemoryAccessTarget { + /** The address to access. */ + uint64_t address; + /** The number of bytes to access at `address`. */ + uint16_t size; + + /** Check for equality of two access targets. */ + bool operator==(const MemoryAccessTarget& other) const { + return (address == other.address && size == other.size); + }; + + /** Check for inequality of two access targets. */ + bool operator!=(const MemoryAccessTarget& other) const { + return !(other == *this); + } +}; + +} // namespace memory +} // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/MemoryInterface.hh b/src/include/simeng/memory/MemoryInterface.hh similarity index 62% rename from src/include/simeng/MemoryInterface.hh rename to src/include/simeng/memory/MemoryInterface.hh index a999b294b5..706e86c0e3 100644 --- a/src/include/simeng/MemoryInterface.hh +++ b/src/include/simeng/memory/MemoryInterface.hh @@ -2,11 +2,14 @@ #include "simeng/RegisterValue.hh" #include "simeng/control.hh" +#include "simeng/memory/MemoryReadResult.hh" #include "simeng/span.hh" #include "simeng/trace.hh" namespace simeng { +namespace memory { + /** The available memory interface types. */ enum class MemInterfaceType { Flat, // A zero access latency interface @@ -15,34 +18,6 @@ enum class MemInterfaceType { // instantiation }; -/** A generic memory access target; describes a region of memory to access. */ -struct MemoryAccessTarget { - /** The address to access. */ - uint64_t address; - /** The number of bytes to access at `address`. 
*/ - uint16_t size; - - /** Check for equality of two access targets. */ - bool operator==(const MemoryAccessTarget& other) const { - return (address == other.address && size == other.size); - }; - - /** Check for inequality of two access targets. */ - bool operator!=(const MemoryAccessTarget& other) const { - return !(other == *this); - } -}; - -/** A structure used for the result of memory read operations. */ -struct MemoryReadResult { - /** The memory access that was requested. */ - MemoryAccessTarget target; - /** The data returned by the request. */ - RegisterValue data; - /** The request identifier provided by the requester. */ - uint64_t requestId; -}; - /** An abstract memory interface. Describes a connection to a memory system to * which data read/write requests may be made. */ class MemoryInterface { @@ -65,7 +40,7 @@ class MemoryInterface { /** Clear the completed reads. */ virtual void clearCompletedReads() = 0; - /** Returns true if there are any oustanding memory requests in-flight. */ + /** Returns true if there are any outstanding memory requests in-flight. */ virtual bool hasPendingRequests() const = 0; /** Tick the memory interface to allow it to process internal tasks. @@ -76,4 +51,5 @@ class MemoryInterface { virtual void tick() = 0; }; +} // namespace memory } // namespace simeng diff --git a/src/include/simeng/memory/MemoryReadResult.hh b/src/include/simeng/memory/MemoryReadResult.hh new file mode 100644 index 0000000000..3226c3cf47 --- /dev/null +++ b/src/include/simeng/memory/MemoryReadResult.hh @@ -0,0 +1,21 @@ +#pragma once + +#include "simeng/RegisterValue.hh" +#include "simeng/memory/MemoryAccessTarget.hh" + +namespace simeng { + +namespace memory { + +/** A structure used for the result of memory read operations. */ +struct MemoryReadResult { + /** The memory access that was requested. */ + MemoryAccessTarget target; + /** The data returned by the request. */ + RegisterValue data; + /** The request identifier provided by the requester. */ + uint64_t requestId; +}; + +} // namespace memory +} // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/models/emulation/Core.hh b/src/include/simeng/models/emulation/Core.hh index 9152c6df03..f1e38d7022 100644 --- a/src/include/simeng/models/emulation/Core.hh +++ b/src/include/simeng/models/emulation/Core.hh @@ -6,8 +6,6 @@ #include "simeng/ArchitecturalRegisterFileSet.hh" #include "simeng/Core.hh" -#include "simeng/MemoryInterface.hh" -#include "simeng/RegisterFileSet.hh" #include "simeng/arch/Architecture.hh" #include "simeng/span.hh" @@ -21,9 +19,9 @@ class Core : public simeng::Core { /** Construct an emulation-style core, providing memory interfaces for * instructions and data, along with the instruction entry point and an ISA to * use. */ - Core(MemoryInterface& instructionMemory, MemoryInterface& dataMemory, - uint64_t entryPoint, uint64_t programByteLength, - const arch::Architecture& isa); + Core(memory::MemoryInterface& instructionMemory, + memory::MemoryInterface& dataMemory, uint64_t entryPoint, + uint64_t programByteLength, const arch::Architecture& isa); /** Tick the core. */ void tick() override; @@ -38,9 +36,6 @@ class Core : public simeng::Core { /** Retrieve the number of instructions retired. */ uint64_t getInstructionsRetiredCount() const override; - /** Retrieve the simulated nanoseconds elapsed since the core started. */ - uint64_t getSystemTimer() const override; - /** Retrieve a map of statistics to report. 
*/ std::map getStats() const override; @@ -54,51 +49,24 @@ class Core : public simeng::Core { /** Process an active exception handler. */ void processExceptionHandler(); - /** Apply changes to the process state. */ - void applyStateChange(const arch::ProcessStateChange& change); - /** A memory interface to access instructions. */ - MemoryInterface& instructionMemory_; - - /** A memory interface to access data. */ - MemoryInterface& dataMemory_; - - /** The previously generated addresses. */ - std::vector previousAddresses_; - - /** The length of the available instruction memory. */ - uint64_t programByteLength_; - - /** The currently used ISA. */ - const arch::Architecture& isa_; - - /** The current program counter. */ - uint64_t pc_ = 0; - - /** The core's register file set. */ - RegisterFileSet registerFileSet_; + memory::MemoryInterface& instructionMemory_; /** An architectural register file set, serving as a simple wrapper around the * register file set. */ ArchitecturalRegisterFileSet architecturalRegisterFileSet_; - /** Whether or not the core has halted. */ - bool hasHalted_ = false; - /** A reusable macro-op vector to fill with uops. */ MacroOp macroOp_; - /** An internal buffer for storing one or more uops. */ - std::queue> microOps_; - - /** The active exception handler. */ - std::shared_ptr exceptionHandler_; + /** The previously generated addresses. */ + std::vector previousAddresses_; - /** Is the core waiting on a data read? */ - unsigned int pendingReads_ = 0; + /** The current program counter. */ + uint64_t pc_ = 0; - /** The number of times this core has been ticked. */ - uint64_t ticks_ = 0; + /** The length of the available instruction memory. */ + uint64_t programByteLength_ = 0; /** The number of instructions executed. */ uint64_t instructionsExecuted_ = 0; diff --git a/src/include/simeng/models/inorder/Core.hh b/src/include/simeng/models/inorder/Core.hh index 6998d554a1..38595aa434 100644 --- a/src/include/simeng/models/inorder/Core.hh +++ b/src/include/simeng/models/inorder/Core.hh @@ -4,7 +4,7 @@ #include "simeng/ArchitecturalRegisterFileSet.hh" #include "simeng/Core.hh" -#include "simeng/FlatMemoryInterface.hh" +#include "simeng/memory/FlatMemoryInterface.hh" #include "simeng/pipeline/DecodeUnit.hh" #include "simeng/pipeline/ExecuteUnit.hh" #include "simeng/pipeline/FetchUnit.hh" @@ -20,9 +20,10 @@ class Core : public simeng::Core { /** Construct a core model, providing an ISA and branch predictor to use, * along with a pointer and size of instruction memory, and a pointer to * process memory. */ - Core(MemoryInterface& instructionMemory, MemoryInterface& dataMemory, - uint64_t processMemorySize, uint64_t entryPoint, - const arch::Architecture& isa, BranchPredictor& branchPredictor); + Core(memory::MemoryInterface& instructionMemory, + memory::MemoryInterface& dataMemory, uint64_t processMemorySize, + uint64_t entryPoint, const arch::Architecture& isa, + BranchPredictor& branchPredictor); /** Tick the core. Ticks each of the pipeline stages sequentially, then ticks * the buffers between them. Checks for and executes pipeline flushes at the @@ -39,9 +40,6 @@ class Core : public simeng::Core { /** Retrieve the number of instructions retired. */ uint64_t getInstructionsRetiredCount() const override; - /** Retrieve the simulated nanoseconds elapsed since the core started. */ - uint64_t getSystemTimer() const override; - /** Generate a map of statistics to report. 
*/ std::map getStats() const override; @@ -52,8 +50,15 @@ class Core : public simeng::Core { /** Handle an exception raised during the cycle. */ void handleException(); + /** Process the active exception handler. */ + void processExceptionHandler(); + + /** Handle requesting/execution of a load instruction. */ + void handleLoad(const std::shared_ptr& instruction); + /** Load and supply memory data requested by an instruction. */ void loadData(const std::shared_ptr& instruction); + /** Store data supplied by an instruction to memory. */ void storeData(const std::shared_ptr& instruction); @@ -64,33 +69,15 @@ class Core : public simeng::Core { /** Read pending registers for the most recently decoded instruction. */ void readRegisters(); - /** Process the active exception handler. */ - void processExceptionHandler(); - - /** Apply changes to the process state. */ - void applyStateChange(const arch::ProcessStateChange& change); - - /** Handle requesting/execution of a load instruction. */ - void handleLoad(const std::shared_ptr& instruction); - /** Set traces to finished state if their instruction has been flushed */ void flushTraces(const bool atDecode); - /** The process memory. */ - MemoryInterface& dataMemory_; - - /** A reference to the core's architecture. */ - const arch::Architecture& isa_; - - /** The core's register file set. */ - RegisterFileSet registerFileSet_; - /** An architectural register file set, serving as a simple wrapper around the * register file set. */ ArchitecturalRegisterFileSet architecturalRegisterFileSet_; - /** The process memory. */ - span processMemory; + /** The previously generated addresses. */ + std::queue previousAddresses_; /** The buffer between fetch and decode. */ pipeline::PipelineBuffer fetchToDecodeBuffer_; @@ -102,9 +89,6 @@ class Core : public simeng::Core { std::vector>> completionSlots_; - /** The previously generated addresses. */ - std::queue previousAddresses_; - /** The fetch unit; fetches instructions from memory. */ pipeline::FetchUnit fetchUnit_; @@ -121,20 +105,11 @@ class Core : public simeng::Core { /** The number of times the pipeline has been flushed. */ uint64_t flushes_ = 0; - /** The number of times this core has been ticked. */ - uint64_t ticks_ = 0; - /** Whether an exception was generated during the cycle. */ bool exceptionGenerated_ = false; /** A pointer to the instruction responsible for generating the exception. */ std::shared_ptr exceptionGeneratingInstruction_; - - /** Whether the core has halted. */ - bool hasHalted_ = false; - - /** The active exception handler. */ - std::shared_ptr exceptionHandler_; }; } // namespace inorder diff --git a/src/include/simeng/models/outoforder/Core.hh b/src/include/simeng/models/outoforder/Core.hh index 3f802bf12b..27df7ab8d5 100644 --- a/src/include/simeng/models/outoforder/Core.hh +++ b/src/include/simeng/models/outoforder/Core.hh @@ -2,7 +2,6 @@ #include "simeng/ArchitecturalRegisterFileSet.hh" #include "simeng/Core.hh" -#include "simeng/MemoryInterface.hh" #include "simeng/pipeline/DecodeUnit.hh" #include "simeng/pipeline/DispatchIssueUnit.hh" #include "simeng/pipeline/ExecuteUnit.hh" @@ -26,10 +25,11 @@ class Core : public simeng::Core { public: /** Construct a core model, providing the process memory, and an ISA, branch * predictor, and port allocator to use. 
*/ - Core(MemoryInterface& instructionMemory, MemoryInterface& dataMemory, - uint64_t processMemorySize, uint64_t entryPoint, - const arch::Architecture& isa, BranchPredictor& branchPredictor, - pipeline::PortAllocator& portAllocator, YAML::Node config); + Core(memory::MemoryInterface& instructionMemory, + memory::MemoryInterface& dataMemory, uint64_t processMemorySize, + uint64_t entryPoint, const arch::Architecture& isa, + BranchPredictor& branchPredictor, pipeline::PortAllocator& portAllocator, + ryml::ConstNodeRef config = config::SimInfo::getConfig()); /** Tick the core. Ticks each of the pipeline stages sequentially, then ticks * the buffers between them. Checks for and executes pipeline flushes at the @@ -46,9 +46,6 @@ class Core : public simeng::Core { /** Retrieve the number of instructions retired. */ uint64_t getInstructionsRetiredCount() const override; - /** Retrieve the simulated nanoseconds elapsed since the core started. */ - uint64_t getSystemTimer() const override; - /** Generate a map of statistics to report. */ std::map getStats() const override; @@ -62,33 +59,22 @@ class Core : public simeng::Core { /** Process the active exception handler. */ void processExceptionHandler(); - /** Apply changes to the process state. */ - void applyStateChange(const arch::ProcessStateChange& change); - /** Inspect units and flush pipelines if required. */ void flushIfNeeded(); /** Set traces to finished state if their instruction has been flushed */ void flushTraces(const bool atDecode); - const arch::Architecture& isa_; - const std::vector physicalRegisterStructures_; const std::vector physicalRegisterQuantities_; - /** The core's register file set. */ - RegisterFileSet registerFileSet_; - /** The core's register alias table. */ pipeline::RegisterAliasTable registerAliasTable_; /** The mapped register file set. */ pipeline::MappedRegisterFileSet mappedRegisterFileSet_; - /** The process memory. */ - MemoryInterface& dataMemory_; - /** The buffer between fetch and decode. */ pipeline::PipelineBuffer fetchToDecodeBuffer_; @@ -108,15 +94,9 @@ class Core : public simeng::Core { std::vector>> completionSlots_; - /** The core's load/store queue. */ - pipeline::LoadStoreQueue loadStoreQueue_; - /** The fetch unit; fetches instructions from memory. */ pipeline::FetchUnit fetchUnit_; - /** The core's reorder buffer. */ - pipeline::ReorderBuffer reorderBuffer_; - /** The decode unit; decodes instructions into uops and reads operands. */ pipeline::DecodeUnit decodeUnit_; @@ -135,34 +115,31 @@ class Core : public simeng::Core { /** The writeback unit; writes uop results to the register files. */ pipeline::WritebackUnit writebackUnit_; + /** The core's reorder buffer. */ + pipeline::ReorderBuffer reorderBuffer_; + + /** The core's load/store queue. */ + pipeline::LoadStoreQueue loadStoreQueue_; + /** The port allocator unit; allocates a port that an instruction will be * issued from based on a defined algorithm. */ pipeline::PortAllocator& portAllocator_; - /** Clock frequency of core */ - unsigned int clockFrequency_ = 2.5 * 1e9; - /** Core commit width; maximum number of instruction that can be committed per * cycle. */ - unsigned int commitWidth_ = 6; + uint64_t commitWidth_ = 0; /** The number of times the pipeline has been flushed. */ uint64_t flushes_ = 0; - /** The number of times this core has been ticked. */ - uint64_t ticks_ = 0; - /** Whether an exception was generated during the cycle. 
*/ bool exceptionGenerated_ = false; /** A pointer to the instruction responsible for generating the exception. */ std::shared_ptr exceptionGeneratingInstruction_; - /** Whether the core has halted. */ - bool hasHalted_ = false; - - /** The active exception handler. */ - std::shared_ptr exceptionHandler_; + /** Reference to the current branch predictor */ + BranchPredictor& branchPredictor_; }; } // namespace outoforder diff --git a/src/include/simeng/pipeline/A64FXPortAllocator.hh b/src/include/simeng/pipeline/A64FXPortAllocator.hh index 74f27faf25..c713aaeb16 100644 --- a/src/include/simeng/pipeline/A64FXPortAllocator.hh +++ b/src/include/simeng/pipeline/A64FXPortAllocator.hh @@ -21,39 +21,46 @@ const uint8_t BR = 5; * described in the A64FX Microarchitecture manual. */ class A64FXPortAllocator : public PortAllocator { public: + /** Constructor for the A64FXPortAllocator object. */ A64FXPortAllocator(const std::vector>& portArrangement); + /** Allocate a port for the specified instruction group; returns the allocated + * port. */ uint16_t allocate(const std::vector& ports) override; + /** Inform the allocator that an instruction was issued to the specified port. + */ void issued(uint16_t port) override; + /** Inform the allocator that an instruction will not issue to its + * allocated port. */ void deallocate(uint16_t port) override; - /** A mapping from issye ports to instruction attribute */ - uint8_t attributeMapping(const std::vector& ports); - /** Set function from DispatchIssueUnit to retrieve reservation * station sizes during execution. */ void setRSSizeGetter( - std::function&)> rsSizes) override; + std::function&)> rsSizes) override; /** Tick the port allocator to allow it to process internal tasks. */ void tick() override; private: + /** A mapping from issue ports to instruction attribute */ + uint8_t attributeMapping(const std::vector& ports); + /** An approximate estimation of the index of an instruction within the input * buffer of the dispatch unit. Increments slot at each allocation thus cannot * account for nullptr entries in buffer.*/ uint8_t dispatchSlot_; /** Get the current sizes an capacity of the reservation stations. */ - std::function&)> rsSizes_; + std::function&)> rsSizes_; /** Mapping from reservation station to ports. */ std::vector> rsToPort_; - /** Vector of free entires across all reservation stations. */ - std::vector freeEntries_; + /** Vector of free entries across all reservation stations. */ + std::vector freeEntries_; /** Reservation station classifications as detailed in manual. */ /** RSE with most free entries. */ diff --git a/src/include/simeng/pipeline/BalancedPortAllocator.hh b/src/include/simeng/pipeline/BalancedPortAllocator.hh index ccd550718d..c5ee7a48ce 100644 --- a/src/include/simeng/pipeline/BalancedPortAllocator.hh +++ b/src/include/simeng/pipeline/BalancedPortAllocator.hh @@ -34,7 +34,7 @@ class BalancedPortAllocator : public PortAllocator { /** Set function from DispatchIssueUnit to retrieve reservation * station sizes during execution. */ void setRSSizeGetter( - std::function&)> rsSizes) override; + std::function&)> rsSizes) override; /** Tick the port allocator to allow it to process internal tasks. 
*/ void tick() override; @@ -50,7 +50,7 @@ class BalancedPortAllocator : public PortAllocator { std::vector weights; /** Get the current sizes an capacity of the reservation stations */ - std::function&)> rsSizes_; + std::function&)> rsSizes_; }; } // namespace pipeline diff --git a/src/include/simeng/pipeline/DispatchIssueUnit.hh b/src/include/simeng/pipeline/DispatchIssueUnit.hh index ac71e34029..a26bf2c2b2 100644 --- a/src/include/simeng/pipeline/DispatchIssueUnit.hh +++ b/src/include/simeng/pipeline/DispatchIssueUnit.hh @@ -8,11 +8,11 @@ #include #include "simeng/Instruction.hh" +#include "simeng/config/SimInfo.hh" #include "simeng/control.hh" #include "simeng/pipeline/PipelineBuffer.hh" #include "simeng/pipeline/PortAllocator.hh" #include "simeng/trace.hh" -#include "yaml-cpp/yaml.h" namespace simeng { namespace pipeline { @@ -29,12 +29,11 @@ struct ReservationStationPort { /** A reservation station */ struct ReservationStation { /** Size of reservation station */ - uint16_t capacity; + uint32_t capacity; /** Number of instructions that can be dispatched to this unit per cycle. */ uint16_t dispatchRate; - /** Current number of non-stalled instructions - * in reservation station */ - uint16_t currentSize; + /** Current number of instructions in reservation station */ + uint32_t currentSize; /** Issue ports belonging to reservation station */ std::vector ports; }; @@ -62,7 +61,7 @@ class DispatchIssueUnit { std::vector>>& issuePorts, const RegisterFileSet& registerFileSet, PortAllocator& portAllocator, const std::vector& physicalRegisterStructure, - YAML::Node config); + ryml::ConstNodeRef config = config::SimInfo::getConfig()); /** Ticks the dispatch/issue unit. Reads available input operands for * instructions and sets scoreboard flags for destination registers. */ @@ -77,9 +76,6 @@ class DispatchIssueUnit { void forwardOperands(const span& destinations, const span& values); - /** Set the scoreboard entry for the provided register as ready. */ - void setRegisterReady(Register reg); - /** Clear the RS of all flushed instructions. */ void purgeFlushed(); @@ -100,7 +96,7 @@ class DispatchIssueUnit { uint64_t getPortBusyStalls() const; /** Retrieve the current sizes and capacities of the reservation stations*/ - void getRSSizes(std::vector&) const; + void getRSSizes(std::vector&) const; private: /** A buffer of instructions to dispatch and read operands for. */ diff --git a/src/include/simeng/pipeline/ExecuteUnit.hh b/src/include/simeng/pipeline/ExecuteUnit.hh index f65e18e18e..d95585e1e5 100644 --- a/src/include/simeng/pipeline/ExecuteUnit.hh +++ b/src/include/simeng/pipeline/ExecuteUnit.hh @@ -3,7 +3,6 @@ #include #include -#include "simeng/BranchPredictor.hh" #include "simeng/Instruction.hh" #include "simeng/control.hh" #include "simeng/pipeline/PipelineBuffer.hh" @@ -35,8 +34,7 @@ class ExecuteUnit { std::function&)> handleLoad, std::function&)> handleStore, std::function&)> raiseException, - BranchPredictor& predictor, bool pipelined = true, - const std::vector& blockingGroups = {}); + bool pipelined = true, const std::vector& blockingGroups = {}); /** Tick the execute unit. Places incoming instructions into the pipeline and * executes an instruction that has reached the head of the pipeline, if @@ -50,20 +48,14 @@ class ExecuteUnit { * discovered misprediction. */ uint64_t getFlushAddress() const; - /** Retrieve the sequence ID associated with the most recently discovered + /** Retrieve the instruction ID associated with the most recently discovered * misprediction. 
*/ - uint64_t getFlushSeqId() const; + uint64_t getFlushInsnId() const; /** Purge flushed instructions from the internal pipeline and clear any active * stall, if applicable. */ void purgeFlushed(); - /** Retrieve the number of branch instructions that have been executed. */ - uint64_t getBranchExecutedCount() const; - - /** Retrieve the number of branch mispredictions. */ - uint64_t getBranchMispredictedCount() const; - /** Retrieve the number of active execution cycles. */ uint64_t getCycles() const; @@ -93,10 +85,6 @@ class ExecuteUnit { /** A function handle called upon exception generation. */ std::function&)> raiseException_; - /** A reference to the branch predictor, for updating with prediction results. - */ - BranchPredictor& predictor_; - /** Whether this unit is pipelined, or if all instructions should stall until * complete. */ bool pipelined_; @@ -131,12 +119,6 @@ class ExecuteUnit { /** The cycle this unit will become unstalled. */ uint64_t stallUntil_ = 0; - /** The number of branch instructions that were executed. */ - uint64_t branchesExecuted_ = 0; - - /** The number of branch mispredictions that were observed. */ - uint64_t branchMispredicts_ = 0; - /** The number of active execution cycles that were observed. */ uint64_t cycles_ = 0; }; diff --git a/src/include/simeng/pipeline/FetchUnit.hh b/src/include/simeng/pipeline/FetchUnit.hh index b6f8f86f27..2dbf74a3da 100644 --- a/src/include/simeng/pipeline/FetchUnit.hh +++ b/src/include/simeng/pipeline/FetchUnit.hh @@ -2,9 +2,9 @@ #include -#include "simeng/MemoryInterface.hh" #include "simeng/arch/Architecture.hh" #include "simeng/control.hh" +#include "simeng/memory/MemoryInterface.hh" #include "simeng/pipeline/PipelineBuffer.hh" #include "simeng/trace.hh" @@ -22,7 +22,7 @@ enum class LoopBufferState { // Struct to hold information about a fetched instruction struct loopBufferEntry { // Encoding of the instruction - const uint64_t encoding; + const uint32_t encoding; // Size of the instruction const uint16_t instructionSize; @@ -40,8 +40,9 @@ class FetchUnit { public: /** Construct a fetch unit with a reference to an output buffer, the ISA, and * the current branch predictor, and information on the instruction memory. */ - FetchUnit(PipelineBuffer& output, MemoryInterface& instructionMemory, - uint64_t programByteLength, uint64_t entryPoint, uint8_t blockSize, + FetchUnit(PipelineBuffer& output, + memory::MemoryInterface& instructionMemory, + uint64_t programByteLength, uint64_t entryPoint, uint16_t blockSize, const arch::Architecture& isa, BranchPredictor& branchPredictor); ~FetchUnit(); @@ -70,6 +71,9 @@ class FetchUnit { /** Clear the loop buffer. */ void flushLoopBuffer(); + /** Retrieve the number of branch instructions that have been fetched. */ + uint64_t getBranchFetchedCount() const; + private: /** An output buffer connecting this unit to the decode unit. */ PipelineBuffer& output_; @@ -78,7 +82,7 @@ class FetchUnit { uint64_t pc_ = 0; /** An interface to the instruction memory. */ - MemoryInterface& instructionMemory_; + memory::MemoryInterface& instructionMemory_; /** The length of the available instruction memory. */ uint64_t programByteLength_; @@ -107,7 +111,7 @@ class FetchUnit { uint64_t branchStalls_ = 0; /** The size of a fetch block, in bytes. */ - uint8_t blockSize_; + uint16_t blockSize_; /** A mask of the bits of the program counter to use for obtaining the block * address to fetch. */ @@ -117,7 +121,17 @@ class FetchUnit { uint8_t* fetchBuffer_; /** The amount of data currently in the fetch buffer. 
*/ - uint8_t bufferedBytes_ = 0; + uint16_t bufferedBytes_ = 0; + + /** The number of branch instructions that were fetched. */ + uint64_t branchesFetched_ = 0; + + /** Let the following PipelineFetchUnitTest derived classes be a friend of + * this class to allow proper testing of 'tick' function. */ + friend class PipelineFetchUnitTest_invalidMinBytesAtEndOfBuffer_Test; + friend class PipelineFetchUnitTest_minSizeInstructionAtEndOfBuffer_Test; + friend class PipelineFetchUnitTest_validMinSizeReadsDontComplete_Test; + friend class PipelineFetchUnitTest_invalidMinBytesreadsDontComplete_Test; }; } // namespace pipeline diff --git a/src/include/simeng/pipeline/LoadStoreQueue.hh b/src/include/simeng/pipeline/LoadStoreQueue.hh index a12ec9eb09..78a1fe48ab 100644 --- a/src/include/simeng/pipeline/LoadStoreQueue.hh +++ b/src/include/simeng/pipeline/LoadStoreQueue.hh @@ -7,8 +7,8 @@ #include #include "simeng/Instruction.hh" -#include "simeng/MemoryInterface.hh" #include "simeng/control.hh" +#include "simeng/memory/MemoryInterface.hh" #include "simeng/pipeline/PipelineBuffer.hh" #include "simeng/trace.hh" @@ -21,7 +21,7 @@ enum accessType { LOAD = 0, STORE }; /** A requestQueue_ entry. */ struct requestEntry { /** The memory address(es) to be accessed. */ - std::queue reqAddresses; + std::queue reqAddresses; /** The instruction sending the request(s). */ std::shared_ptr insn; }; @@ -34,9 +34,10 @@ class LoadStoreQueue { * for both load and store instructions, supplying completion slots for loads * and an operand forwarding handler. */ LoadStoreQueue( - unsigned int maxCombinedSpace, MemoryInterface& memory, + unsigned int maxCombinedSpace, memory::MemoryInterface& memory, span>> completionSlots, std::function, span)> forwardOperands, + std::function&)> raiseException, bool exclusive = false, uint16_t loadBandwidth = UINT16_MAX, uint16_t storeBandwidth = UINT16_MAX, uint16_t permittedRequests = UINT16_MAX, @@ -48,9 +49,10 @@ class LoadStoreQueue { * operand forwarding handler. */ LoadStoreQueue( unsigned int maxLoadQueueSpace, unsigned int maxStoreQueueSpace, - MemoryInterface& memory, + memory::MemoryInterface& memory, span>> completionSlots, std::function, span)> forwardOperands, + std::function&)> raiseException, bool exclusive = false, uint16_t loadBandwidth = UINT16_MAX, uint16_t storeBandwidth = UINT16_MAX, uint16_t permittedRequests = UINT16_MAX, @@ -121,6 +123,9 @@ class LoadStoreQueue { /** A function handler to call to forward the results of a completed load. */ std::function, span)> forwardOperands_; + /** A function handle called upon exception generation. */ + std::function&)> raiseException_; + /** The maximum number of loads that can be in-flight. Undefined if this * is a combined queue. */ unsigned int maxLoadQueueSpace_; @@ -146,7 +151,7 @@ class LoadStoreQueue { unsigned int getCombinedSpace() const; /** A pointer to process memory. */ - MemoryInterface& memory_; + memory::MemoryInterface& memory_; /** The load instruction associated with the most recently discovered memory * order violation. */ diff --git a/src/include/simeng/pipeline/M1PortAllocator.hh b/src/include/simeng/pipeline/M1PortAllocator.hh index 1139f9bc8a..7bfaa94817 100644 --- a/src/include/simeng/pipeline/M1PortAllocator.hh +++ b/src/include/simeng/pipeline/M1PortAllocator.hh @@ -19,7 +19,7 @@ class M1PortAllocator : public PortAllocator { * a port type which denotes the matching requirements of said instruction * groups. 
*/ M1PortAllocator(const std::vector>& portArrangement, - std::vector> rsArrangement); + std::vector> rsArrangement); /** Allocate the lowest weighted port available for the specified instruction * group. Returns the allocated port, and increases the weight of the port. @@ -35,7 +35,7 @@ class M1PortAllocator : public PortAllocator { /** Set function from DispatchIssueUnit to retrieve reservation * station sizes during execution. */ void setRSSizeGetter( - std::function&)> rsSizes) override; + std::function&)> rsSizes) override; /** Tick the port allocator to allow it to process internal tasks. */ void tick() override; @@ -50,13 +50,13 @@ class M1PortAllocator : public PortAllocator { * that port. */ std::vector weights; - std::vector rsFreeSpaces; + std::vector rsFreeSpaces; /** Get the current capacity of the reservation stations */ - std::function&)> rsSizes_; + std::function&)> rsSizes_; /** Mapping from port index to reservation station */ - std::vector> rsArrangement_; + std::vector> rsArrangement_; }; } // namespace pipeline diff --git a/src/include/simeng/pipeline/PipelineBuffer.hh b/src/include/simeng/pipeline/PipelineBuffer.hh index 3d1e0df1a8..6e128ae684 100644 --- a/src/include/simeng/pipeline/PipelineBuffer.hh +++ b/src/include/simeng/pipeline/PipelineBuffer.hh @@ -18,7 +18,7 @@ class PipelineBuffer { public: /** Construct a pipeline buffer of width `width`, and fill all slots with * `initialValue`. */ - PipelineBuffer(int width, const T& initialValue) + PipelineBuffer(uint16_t width, const T& initialValue) : width(width), buffer(width * length, initialValue) {} /** Tick the buffer and move head/tail pointers, or do nothing if it's @@ -71,11 +71,11 @@ class PipelineBuffer { void fill(const T& value) { std::fill(buffer.begin(), buffer.end(), value); } /** Get the width of the buffer slots. */ - unsigned short getWidth() const { return width; } + uint16_t getWidth() const { return width; } private: /** The width of each row of slots. */ - unsigned short width; + uint16_t width; /** The buffer. */ std::vector buffer; diff --git a/src/include/simeng/pipeline/PortAllocator.hh b/src/include/simeng/pipeline/PortAllocator.hh index 8d6f79a5f8..78e3a0c5c9 100644 --- a/src/include/simeng/pipeline/PortAllocator.hh +++ b/src/include/simeng/pipeline/PortAllocator.hh @@ -33,7 +33,7 @@ class PortAllocator { /** Set function from DispatchIssueUnit to retrieve reservation * station sizes during execution. */ virtual void setRSSizeGetter( - std::function&)> rsSizes) = 0; + std::function&)> rsSizes) = 0; /** Tick the port allocator to allow it to process internal tasks. */ virtual void tick() = 0; diff --git a/src/include/simeng/pipeline/RegisterAliasTable.hh b/src/include/simeng/pipeline/RegisterAliasTable.hh index e3a30ea7b1..43b8e0db4c 100644 --- a/src/include/simeng/pipeline/RegisterAliasTable.hh +++ b/src/include/simeng/pipeline/RegisterAliasTable.hh @@ -15,7 +15,7 @@ class RegisterAliasTable { * structure, and the corresponding numbers of physical registers that should * be available. */ RegisterAliasTable(std::vector architecturalStructure, - std::vector physicalStructure); + std::vector physicalRegisterCounts); /** Retrieve the current physical register assigned to the provided * architectural register. */ @@ -43,9 +43,6 @@ class RegisterAliasTable { * is reinstated to the mapping table, and the provided register is freed. */ void rewind(Register physical); - /** Free the provided physical register. */ - void free(Register physical); - private: /** The register mapping tables. 
Holds a map of architectural -> physical * register mappings for each register type. */ diff --git a/src/include/simeng/pipeline/ReorderBuffer.hh b/src/include/simeng/pipeline/ReorderBuffer.hh index 965032b0d3..1126a1045b 100644 --- a/src/include/simeng/pipeline/ReorderBuffer.hh +++ b/src/include/simeng/pipeline/ReorderBuffer.hh @@ -4,6 +4,7 @@ #include #include "simeng/Instruction.hh" +#include "simeng/branchpredictors/BranchPredictor.hh" #include "simeng/control.hh" #include "simeng/pipeline/LoadStoreQueue.hh" #include "simeng/pipeline/RegisterAliasTable.hh" @@ -46,7 +47,7 @@ class ReorderBuffer { /** Constructs a reorder buffer of maximum size `maxSize`, supplying a * reference to the register alias table. */ ReorderBuffer( - unsigned int maxSize, RegisterAliasTable& rat, LoadStoreQueue& lsq, + uint32_t maxSize, RegisterAliasTable& rat, LoadStoreQueue& lsq, std::function&)> raiseException, std::function sendLoopBoundary, BranchPredictor& predictor, uint16_t loopBufSize, @@ -58,10 +59,10 @@ class ReorderBuffer { void commitMicroOps(uint64_t insnId); /** Commit and remove up to `maxCommitSize` instructions. */ - unsigned int commit(unsigned int maxCommitSize); + unsigned int commit(uint64_t maxCommitSize); /** Flush all instructions with a sequence ID greater than `afterSeqId`. */ - void flush(uint64_t afterSeqId); + void flush(uint64_t afterInsnId); /** Retrieve the current size of the ROB. */ unsigned int size() const; @@ -77,9 +78,9 @@ * discovered memory order violation. */ uint64_t getFlushAddress() const; - /** Retrieve the sequence ID associated with the most recently discovered + /** Retrieve the instruction ID associated with the most recently discovered * memory order violation. */ - uint64_t getFlushSeqId() const; + uint64_t getFlushInsnId() const; /** Get the number of instructions the ROB has committed. */ uint64_t getInstructionsCommittedCount() const; @@ -87,6 +88,12 @@ /** Get the number of speculated loads which violated load-store ordering. */ uint64_t getViolatingLoadsCount() const; + /** Retrieve the number of branch mispredictions. */ + uint64_t getBranchMispredictedCount() const; + + /** Retrieve the number of retired branches. */ + uint64_t getRetiredBranchesCount() const; + private: /** A reference to the register alias table. */ RegisterAliasTable& rat_; @@ -95,7 +102,7 @@ LoadStoreQueue& lsq_; /** The maximum size of the ROB. */ - unsigned int maxSize_; + uint32_t maxSize_; /** A function to call upon exception generation. */ std::function)> raiseException_; @@ -144,8 +151,14 @@ /** The number of instructions committed. */ uint64_t instructionsCommitted_ = 0; - /** The number of speculatived loads which violated load-store ordering. */ + /** The number of speculative loads which violated load-store ordering. */ uint64_t loadViolations_ = 0; + + /** The number of branch mispredictions that were observed. 
*/ + uint64_t branchMispredicts_ = 0; + + /** The number of retired branch instructions */ + uint64_t retiredBranches_ = 0; }; } // namespace pipeline diff --git a/src/include/simeng/pipeline/WritebackUnit.hh b/src/include/simeng/pipeline/WritebackUnit.hh index 05da7e349d..bd6d311684 100644 --- a/src/include/simeng/pipeline/WritebackUnit.hh +++ b/src/include/simeng/pipeline/WritebackUnit.hh @@ -3,6 +3,7 @@ #include #include "simeng/Instruction.hh" +#include "simeng/RegisterFileSet.hh" #include "simeng/control.hh" #include "simeng/pipeline/PipelineBuffer.hh" #include "simeng/trace.hh" diff --git a/src/lib/AlwaysNotTakenPredictor.cc b/src/lib/AlwaysNotTakenPredictor.cc deleted file mode 100644 index 9ad8d1e2e4..0000000000 --- a/src/lib/AlwaysNotTakenPredictor.cc +++ /dev/null @@ -1,16 +0,0 @@ -#include "simeng/AlwaysNotTakenPredictor.hh" - -namespace simeng { - -BranchPrediction AlwaysNotTakenPredictor::predict(uint64_t address, - BranchType type, - int64_t knownOffset) { - return {false, 0}; -} - -void AlwaysNotTakenPredictor::update(uint64_t address, bool taken, - uint64_t targetAddress, BranchType type) {} - -void AlwaysNotTakenPredictor::flush(uint64_t address) {} - -} // namespace simeng diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt index 4206054f6d..40e08c9407 100644 --- a/src/lib/CMakeLists.txt +++ b/src/lib/CMakeLists.txt @@ -14,8 +14,15 @@ set(SIMENG_SOURCES arch/riscv/Instruction_decode.cc arch/riscv/Instruction_execute.cc arch/riscv/InstructionMetadata.cc + branchpredictors/AlwaysNotTakenPredictor.cc + branchpredictors/GenericPredictor.cc + branchpredictors/PerceptronPredictor.cc + config/ModelConfig.cc + config/SimInfo.cc kernel/Linux.cc kernel/LinuxProcess.cc + memory/FixedLatencyMemoryInterface.cc + memory/FlatMemoryInterface.cc models/emulation/Core.cc models/inorder/Core.cc models/outoforder/Core.cc @@ -32,17 +39,11 @@ set(SIMENG_SOURCES pipeline/RenameUnit.cc pipeline/ReorderBuffer.cc pipeline/WritebackUnit.cc - AlwaysNotTakenPredictor.cc ArchitecturalRegisterFileSet.cc CMakeLists.txt CoreInstance.cc control.cc Elf.cc - FixedLatencyMemoryInterface.cc - FlatMemoryInterface.cc - GenericPredictor.cc - Instruction.cc - ModelConfig.cc RegisterFileSet.cc RegisterValue.cc SpecialFileDirGen.cc @@ -56,8 +57,10 @@ add_library(libsimeng SHARED ${SIMENG_SOURCES} ${SIMENG_HEADERS}) set_target_properties(libsimeng PROPERTIES OUTPUT_NAME simeng) target_include_directories(libsimeng PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) -target_include_directories(libsimeng PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) -target_link_libraries(libsimeng capstone yaml-cpp) +target_include_directories(libsimeng PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) +target_link_libraries(libsimeng capstone) +# Only enable compiler warnings for our code +target_compile_options(libsimeng PRIVATE ${SIMENG_COMPILE_OPTIONS}) set_target_properties(libsimeng PROPERTIES VERSION ${SimEng_VERSION}) set_target_properties(libsimeng PROPERTIES SOVERSION ${SimEng_VERSION_MAJOR}) @@ -68,4 +71,4 @@ install(TARGETS libsimeng DESTINATION lib) get_target_property(SIMENG_COMPILE_OPTIONS libsimeng COMPILE_OPTIONS) get_target_property(SIMENG_COMPILE_DEFINITIONS libsimeng COMPILE_DEFINITIONS) get_target_property(SIMENG_VERSION libsimeng VERSION) -configure_file(${PROJECT_SOURCE_DIR}/src/include/simeng/version.hh.in ${PROJECT_SOURCE_DIR}/src/include/simeng/version.hh) +configure_file(${PROJECT_SOURCE_DIR}/src/include/simeng/version.hh.in ${CMAKE_CURRENT_BINARY_DIR}/simeng/version.hh) diff --git a/src/lib/CoreInstance.cc b/src/lib/CoreInstance.cc 
index ddf53b20bf..46f8638286 100644 --- a/src/lib/CoreInstance.cc +++ b/src/lib/CoreInstance.cc @@ -3,23 +3,22 @@ namespace simeng { CoreInstance::CoreInstance(std::string executablePath, - std::vector executableArgs) { - config_ = YAML::Load(DEFAULT_CONFIG); + std::vector executableArgs, + ryml::ConstNodeRef config) + : config_(config), + kernel_(kernel::Linux( + config_["CPU-Info"]["Special-File-Dir-Path"].as())) { generateCoreModel(executablePath, executableArgs); } -CoreInstance::CoreInstance(std::string configPath, std::string executablePath, - std::vector executableArgs) { - config_ = simeng::ModelConfig(configPath).getConfigFile(); - generateCoreModel(executablePath, executableArgs); -} - -CoreInstance::CoreInstance(char* assembledSource, size_t sourceSize, - std::string configPath) { - config_ = simeng::ModelConfig(configPath).getConfigFile(); - source_ = assembledSource; - sourceSize_ = sourceSize; - assembledSource_ = true; +CoreInstance::CoreInstance(uint8_t* assembledSource, size_t sourceSize, + ryml::ConstNodeRef config) + : config_(config), + kernel_(kernel::Linux( + config_["CPU-Info"]["Special-File-Dir-Path"].as())), + source_(assembledSource), + sourceSize_(sourceSize), + assembledSource_(true) { // Pass an empty string for executablePath and empty vector of strings for // executableArgs. generateCoreModel("", std::vector{}); @@ -33,41 +32,40 @@ CoreInstance::~CoreInstance() { void CoreInstance::generateCoreModel(std::string executablePath, std::vector executableArgs) { - setSimulationMode(); createProcess(executablePath, executableArgs); // Check to see if either of the instruction or data memory interfaces should // be created. Don't create the core if either interface is marked as External // as they must be set manually prior to the core's creation. 
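Editor's note (not part of the patch): the comment above says that when an interface is configured as `External` the core is not built by the constructor, and the interfaces must be supplied before core creation. A minimal sketch of that driving pattern, using only the `CoreInstance` members visible elsewhere in this diff (`setL1InstructionMemory`, `setL1DataMemory`, `createCore`, `getCore`); the binary path is hypothetical and the `createCore()`/`hasHalted()` usage is assumed from the wider SimEng API rather than shown here:

```cpp
// Sketch only: assumes both L1 interfaces are set to "External" in the config.
#include <memory>

#include "simeng/CoreInstance.hh"
#include "simeng/memory/MemoryInterface.hh"

void runWithExternalMemory(
    std::shared_ptr<simeng::memory::MemoryInterface> iMem,
    std::shared_ptr<simeng::memory::MemoryInterface> dMem) {
  // Hypothetical workload path; config comes from the global SimInfo store.
  simeng::CoreInstance inst("/path/to/binary.elf", {},
                            simeng::config::SimInfo::getConfig());
  // With "External" interfaces the constructor defers core creation, so the
  // caller supplies the interfaces and then builds and runs the core.
  inst.setL1InstructionMemory(iMem);
  inst.setL1DataMemory(dMem);
  inst.createCore();
  auto core = inst.getCore();
  while (!core->hasHalted()) core->tick();
}
```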
// Convert Data-Memory's Interface-Type value from a string to - // simeng::MemInterfaceType + // memory::MemInterfaceType std::string dType_string = config_["L1-Data-Memory"]["Interface-Type"].as(); - simeng::MemInterfaceType dType = simeng::MemInterfaceType::Flat; + memory::MemInterfaceType dType = memory::MemInterfaceType::Flat; if (dType_string == "Fixed") { - dType = simeng::MemInterfaceType::Fixed; + dType = memory::MemInterfaceType::Fixed; } else if (dType_string == "External") { - dType = simeng::MemInterfaceType::External; + dType = memory::MemInterfaceType::External; } // Create data memory if appropriate - if (dType == simeng::MemInterfaceType::External) { + if (dType == memory::MemInterfaceType::External) { setDataMemory_ = true; } else { createL1DataMemory(dType); } // Convert Instruction-Memory's Interface-Type value from a string to - // simeng::MemInterfaceType + // memory::MemInterfaceType std::string iType_string = config_["L1-Instruction-Memory"]["Interface-Type"].as(); - simeng::MemInterfaceType iType = simeng::MemInterfaceType::Flat; + memory::MemInterfaceType iType = memory::MemInterfaceType::Flat; if (iType_string == "Fixed") { - iType = simeng::MemInterfaceType::Fixed; + iType = memory::MemInterfaceType::Fixed; } else if (iType_string == "External") { - iType = simeng::MemInterfaceType::External; + iType = memory::MemInterfaceType::External; } // Create instruction memory if appropriate - if (iType == simeng::MemInterfaceType::External) { + if (iType == memory::MemInterfaceType::External) { setInstructionMemory_ = true; } else { createL1InstructionMemory(iType); @@ -79,22 +77,6 @@ void CoreInstance::generateCoreModel(std::string executablePath, return; } -void CoreInstance::setSimulationMode() { - // Get the simualtion mode as defined by the set configuration, defaulting to - // emulation - if (config_["Core"]["Simulation-Mode"].as() == - "inorderpipelined") { - mode_ = SimulationMode::InOrderPipelined; - modeString_ = "In-Order Pipelined"; - } else if (config_["Core"]["Simulation-Mode"].as() == - "outoforder") { - mode_ = SimulationMode::OutOfOrder; - modeString_ = "Out-of-Order"; - } - - return; -} - void CoreInstance::createProcess(std::string executablePath, std::vector executableArgs) { if (executablePath.length() > 0) { @@ -103,8 +85,7 @@ void CoreInstance::createProcess(std::string executablePath, std::vector commandLine = {executablePath}; commandLine.insert(commandLine.end(), executableArgs.begin(), executableArgs.end()); - process_ = - std::make_unique(commandLine, config_); + process_ = std::make_unique(commandLine, config_); // Raise error if created process is not valid if (!process_->isValid()) { @@ -114,8 +95,8 @@ void CoreInstance::createProcess(std::string executablePath, } } else if (assembledSource_) { // Create a process image from the source code assembled by LLVM. 
- process_ = std::make_unique( - simeng::span(source_, sourceSize_), config_); + process_ = std::make_unique( + span(source_, sourceSize_), config_); // Raise error if created process is not valid if (!process_->isValid()) { std::cerr << "[SimEng:CoreInstance] Could not create process based on " @@ -124,18 +105,12 @@ void CoreInstance::createProcess(std::string executablePath, exit(1); } } else { - // Create a process image from the set of instructions held in hex_ - process_ = std::make_unique( - simeng::span(reinterpret_cast(hex_), sizeof(hex_)), - config_); - - // Raise error if created process is not valid - if (!process_->isValid()) { - std::cerr << "[SimEng:CoreInstance] Could not create process based on " - "supplied instruction span" - << std::endl; - exit(1); - } + // This case shouldn't be reached as the default program should always be + // provided + std::cerr << "[SimEng:CoreInstance] Unexpected parameters given to core " + "instance. No default program and no assembled source" + << std::endl; + exit(1); } // Create the process memory space from the generated process image @@ -156,15 +131,16 @@ void CoreInstance::createProcessMemory() { } void CoreInstance::createL1InstructionMemory( - const simeng::MemInterfaceType type) { + const memory::MemInterfaceType type) { // Create a L1I cache instance based on type supplied - if (type == simeng::MemInterfaceType::Flat) { - instructionMemory_ = std::make_shared( + if (type == memory::MemInterfaceType::Flat) { + instructionMemory_ = std::make_shared( processMemory_.get(), processMemorySize_); - } else if (type == simeng::MemInterfaceType::Fixed) { - instructionMemory_ = std::make_shared( - processMemory_.get(), processMemorySize_, - config_["LSQ-L1-Interface"]["Access-Latency"].as()); + } else if (type == memory::MemInterfaceType::Fixed) { + uint16_t accessLat = + config_["LSQ-L1-Interface"]["Access-Latency"].as(); + instructionMemory_ = std::make_shared( + processMemory_.get(), processMemorySize_, accessLat); } else { std::cerr << "[SimEng:CoreInstance] Unsupported memory interface type used in " @@ -177,7 +153,7 @@ void CoreInstance::createL1InstructionMemory( } void CoreInstance::setL1InstructionMemory( - std::shared_ptr memRef) { + std::shared_ptr memRef) { assert(setInstructionMemory_ && "setL1InstructionMemory(...) called but the interface was created by " "the CoreInstance class."); @@ -186,15 +162,16 @@ void CoreInstance::setL1InstructionMemory( return; } -void CoreInstance::createL1DataMemory(const simeng::MemInterfaceType type) { +void CoreInstance::createL1DataMemory(const memory::MemInterfaceType type) { // Create a L1D cache instance based on type supplied - if (type == simeng::MemInterfaceType::Flat) { - dataMemory_ = std::make_shared( + if (type == memory::MemInterfaceType::Flat) { + dataMemory_ = std::make_shared( processMemory_.get(), processMemorySize_); - } else if (type == simeng::MemInterfaceType::Fixed) { - dataMemory_ = std::make_shared( - processMemory_.get(), processMemorySize_, - config_["LSQ-L1-Interface"]["Access-Latency"].as()); + } else if (type == memory::MemInterfaceType::Fixed) { + uint16_t accessLat = + config_["LSQ-L1-Interface"]["Access-Latency"].as(); + dataMemory_ = std::make_shared( + processMemory_.get(), processMemorySize_, accessLat); } else { std::cerr << "[SimEng:CoreInstance] Unsupported memory interface type used " "in createL1DataMemory()." 
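Editor's note (not part of the patch): the hunk above builds the relocated `memory::FlatMemoryInterface`/`memory::FixedLatencyMemoryInterface` classes with an access latency read from `LSQ-L1-Interface: Access-Latency`. A small stand-alone sketch of exercising one of these interfaces, tying it to the `MemoryAccessTarget` and `MemoryReadResult` structs introduced earlier in this diff; the `requestRead`/`getCompletedReads` signatures and the latency value are assumptions, not verbatim from the patch:

```cpp
// Sketch only: issue one read through a fixed-latency interface and drain it.
#include <cstdint>
#include <vector>

#include "simeng/memory/FixedLatencyMemoryInterface.hh"

int main() {
  std::vector<char> backing(1024, 0);
  // Latency value mirrors what config_["LSQ-L1-Interface"]["Access-Latency"]
  // would normally supply (hypothetical value here).
  simeng::memory::FixedLatencyMemoryInterface mem(backing.data(),
                                                  backing.size(), 4);
  simeng::memory::MemoryAccessTarget target = {0x40, 8};  // {address, size}
  mem.requestRead(target, /*requestId=*/1);
  // Tick the interface until the fixed latency has elapsed.
  while (mem.hasPendingRequests()) mem.tick();
  for (const auto& result : mem.getCompletedReads()) {
    // result is a memory::MemoryReadResult: {target, data, requestId}.
    (void)result;
  }
  mem.clearCompletedReads();
  return 0;
}
```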
@@ -206,7 +183,7 @@ void CoreInstance::createL1DataMemory(const simeng::MemInterfaceType type) { } void CoreInstance::setL1DataMemory( - std::shared_ptr memRef) { + std::shared_ptr memRef) { assert(setDataMemory_ && "setL1DataMemory(...) called but the interface was created by the " "CoreInstance class."); @@ -232,43 +209,79 @@ void CoreInstance::createCore() { exit(1); } - // Create the architecture, with knowledge of the kernel - if (config_["Core"]["ISA"].as() == "rv64") { - arch_ = - std::make_unique(kernel_, config_); - } else if (config_["Core"]["ISA"].as() == "AArch64") { - arch_ = - std::make_unique(kernel_, config_); + // Create the architecture, with knowledge of the OS + if (config::SimInfo::getISA() == config::ISA::RV64) { + arch_ = std::make_unique(kernel_); + } else if (config::SimInfo::getISA() == config::ISA::AArch64) { + arch_ = std::make_unique(kernel_); } - // Construct branch predictor object - predictor_ = std::make_unique(config_); + std::string predictorType = + config_["Branch-Predictor"]["Type"].as(); + if (predictorType == "Generic") { + predictor_ = std::make_unique(); + } else if (predictorType == "Perceptron") { + predictor_ = std::make_unique(); + } - // Extract port arrangement from config file + // Extract the port arrangement from the config file auto config_ports = config_["Ports"]; - std::vector> portArrangement(config_ports.size()); - for (size_t i = 0; i < config_ports.size(); i++) { - auto config_groups = config_ports[i]["Instruction-Group-Support"]; + std::vector> portArrangement( + config_ports.num_children()); + for (size_t i = 0; i < config_ports.num_children(); i++) { + auto config_groups = config_ports[i]["Instruction-Group-Support-Nums"]; // Read groups in associated port - for (size_t j = 0; j < config_groups.size(); j++) { - portArrangement[i].push_back(config_groups[j].as()); + for (size_t j = 0; j < config_groups.num_children(); j++) { + uint16_t grp = config_groups[j].as(); + portArrangement[i].push_back(grp); } } - portAllocator_ = std::make_unique( - portArrangement); + + // Initialise the desired port allocator + std::string portAllocatorType = + config_["Port-Allocator"]["Type"].as(); + if (portAllocatorType == "Balanced") { + portAllocator_ = + std::make_unique(portArrangement); + } else if (portAllocatorType == "A64FX") { + portAllocator_ = + std::make_unique(portArrangement); + } else if (portAllocatorType == "M1") { + // Extract the reservation station arrangement from the config file + auto config_rs = config_["Reservation-Stations"]; + std::vector> rsArrangement; + for (size_t i = 0; i < config_rs.num_children(); i++) { + auto config_rs_ports = config_rs[i]["Port-Nums"]; + for (size_t j = 0; j < config_rs_ports.num_children(); j++) { + uint16_t port = config_rs_ports[j].as(); + if (static_cast(rsArrangement.size()) < port + 1) { + rsArrangement.resize(port + 1); + } + rsArrangement[port] = {i, config_rs[i]["Size"].as()}; + } + } + portAllocator_ = std::make_unique( + portArrangement, rsArrangement); + } else { + std::cout << "[SimEng:CoreInstnce] Invalid Port Allocator type selected." 
+ << std::endl; + exit(EXIT_FAILURE); + } // Construct the core object based on the defined simulation mode uint64_t entryPoint = process_->getEntryPoint(); - if (mode_ == SimulationMode::Emulation) { - core_ = std::make_shared( + if (config::SimInfo::getSimMode() == config::SimulationMode::Emulation) { + core_ = std::make_shared( *instructionMemory_, *dataMemory_, entryPoint, processMemorySize_, *arch_); - } else if (mode_ == SimulationMode::InOrderPipelined) { - core_ = std::make_shared( + } else if (config::SimInfo::getSimMode() == + config::SimulationMode::InOrderPipelined) { + core_ = std::make_shared( *instructionMemory_, *dataMemory_, processMemorySize_, entryPoint, *arch_, *predictor_); - } else if (mode_ == SimulationMode::OutOfOrder) { - core_ = std::make_shared( + } else if (config::SimInfo::getSimMode() == + config::SimulationMode::Outoforder) { + core_ = std::make_shared( *instructionMemory_, *dataMemory_, processMemorySize_, entryPoint, *arch_, *predictor_, *portAllocator_, config_); } @@ -280,8 +293,8 @@ void CoreInstance::createCore() { void CoreInstance::createSpecialFileDirectory() { // Create the Special Files directory if indicated to do so in Config - if (config_["CPU-Info"]["Generate-Special-Dir"].as() == true) { - simeng::SpecialFileDirGen SFdir = simeng::SpecialFileDirGen(config_); + if (config::SimInfo::getGenSpecFiles()) { + SpecialFileDirGen SFdir = SpecialFileDirGen(); // Remove any current special files dir SFdir.RemoveExistingSFDir(); // Create new special files dir @@ -291,13 +304,7 @@ void CoreInstance::createSpecialFileDirectory() { return; } -const SimulationMode CoreInstance::getSimulationMode() const { return mode_; } - -const std::string CoreInstance::getSimulationModeString() const { - return modeString_; -} - -std::shared_ptr CoreInstance::getCore() const { +std::shared_ptr CoreInstance::getCore() const { if (core_ == nullptr) { std::cerr << "[SimEng:CoreInstance] Core object not constructed. If either data " @@ -310,7 +317,7 @@ std::shared_ptr CoreInstance::getCore() const { return core_; } -std::shared_ptr CoreInstance::getDataMemory() const { +std::shared_ptr CoreInstance::getDataMemory() const { if (setDataMemory_ && (dataMemory_ == nullptr)) { std::cerr << "[SimEng:CoreInstance] `External` data memory object not set." 
<< std::endl; @@ -319,7 +326,7 @@ std::shared_ptr CoreInstance::getDataMemory() const { return dataMemory_; } -std::shared_ptr CoreInstance::getInstructionMemory() +std::shared_ptr CoreInstance::getInstructionMemory() const { if (setInstructionMemory_ && (instructionMemory_ == nullptr)) { std::cerr @@ -334,12 +341,10 @@ std::shared_ptr CoreInstance::getProcessImage() const { return processMemory_; } -const uint64_t CoreInstance::getProcessImageSize() const { +uint64_t CoreInstance::getProcessImageSize() const { return processMemorySize_; } -const uint64_t CoreInstance::getHeapStart() const { - return process_->getHeapStart(); -}; +uint64_t CoreInstance::getHeapStart() const { return process_->getHeapStart(); } } // namespace simeng diff --git a/src/lib/GenericPredictor.cc b/src/lib/GenericPredictor.cc deleted file mode 100644 index d9188a9e4a..0000000000 --- a/src/lib/GenericPredictor.cc +++ /dev/null @@ -1,113 +0,0 @@ -#include "simeng/GenericPredictor.hh" - -#include - -namespace simeng { - -GenericPredictor::GenericPredictor(YAML::Node config) - : btbBits_(config["Branch-Predictor"]["BTB-Tag-Bits"].as()), - btb_(1 << btbBits_, - {config["Branch-Predictor"]["Fallback-Static-Predictor"] - .as(), - 0}), - satCntBits_( - config["Branch-Predictor"]["Saturating-Count-Bits"].as()), - globalHistoryLength_( - config["Branch-Predictor"]["Global-History-Length"].as()), - rasSize_(config["Branch-Predictor"]["RAS-entries"].as()) { - // Alter globalHistoryLength_ value to better suit required format in update() - globalHistoryLength_ = (1 << globalHistoryLength_) - 1; -} - -GenericPredictor::~GenericPredictor() { - btb_.clear(); - ras_.clear(); - rasHistory_.clear(); -} - -BranchPrediction GenericPredictor::predict(uint64_t address, BranchType type, - int64_t knownOffset) { - // Get index via an XOR hash between the global history and the lower btbBits_ - // bits of the instruction address - uint64_t hashedIndex = (address & ((1 << btbBits_) - 1)) ^ globalHistory_; - btbHistory_[address] = hashedIndex; - - // Get prediction from BTB - bool direction = - btb_[hashedIndex].first < (1 << (satCntBits_ - 1)) ? false : true; - uint64_t target = - (knownOffset != 0) ? 
address + knownOffset : btb_[hashedIndex].second; - BranchPrediction prediction = {direction, target}; - - // Ammend prediction based on branch type - if (type == BranchType::Unconditional) { - prediction.taken = true; - } else if (type == BranchType::Return) { - prediction.taken = true; - // Return branches can use the RAS if an entry is available - if (ras_.size() > 0) { - prediction.target = ras_.back(); - // Record top of RAS used for target prediction - rasHistory_[address] = ras_.back(); - ras_.pop_back(); - } - } else if (type == BranchType::SubroutineCall) { - prediction.taken = true; - // Subroutine call branches must push their associated return address to RAS - if (ras_.size() >= rasSize_) { - ras_.pop_front(); - } - ras_.push_back(address + 4); - // Record that this address is a branch-and-link instruction - rasHistory_[address] = 0; - } else if (type == BranchType::Conditional) { - if (!prediction.taken) prediction.target = address + 4; - } - return prediction; -} - -void GenericPredictor::update(uint64_t address, bool taken, - uint64_t targetAddress, BranchType type) { - // Get previous index calculated for the instruction address supplied - uint64_t hashedIndex = btbHistory_[address]; - - // Calculate 2-bit saturating counter value - uint8_t satCntVal = btb_[hashedIndex].first; - // Only alter value if it would transition to a valid state - if (!((satCntVal == (1 << satCntBits_) - 1) && taken) && - !(satCntVal == 0 && !taken)) { - satCntVal += taken ? 1 : -1; - } - - // Update BTB entry - btb_[hashedIndex] = {satCntVal, targetAddress}; - - // Update global history value with new direction - globalHistory_ = ((globalHistory_ << 1) | taken) & globalHistoryLength_; - return; -} - -void GenericPredictor::flush(uint64_t address) { - // If address interacted with RAS, rewind entry - auto it = rasHistory_.find(address); - if (it != rasHistory_.end()) { - uint64_t target = it->second; - if (target != 0) { - // If history entry belongs to a return instruction, push target back onto - // stack - if (ras_.size() >= rasSize_) { - ras_.pop_front(); - } - ras_.push_back(target); - } else { - // If history entry belongs to a branch-and-link instruction, pop target - // off of stack - if (ras_.size()) { - ras_.pop_back(); - } - } - rasHistory_.erase(it); - } -} - -} // namespace simeng diff --git a/src/lib/ModelConfig.cc b/src/lib/ModelConfig.cc deleted file mode 100644 index 84e71ced5b..0000000000 --- a/src/lib/ModelConfig.cc +++ /dev/null @@ -1,782 +0,0 @@ -#include "simeng/ModelConfig.hh" - -#include - -namespace simeng { - -/** RISC-V opcodes. Each opcode represents a unique RISC-V operation. */ -namespace RISCVOpcode { -#define GET_INSTRINFO_ENUM -#include "RISCVGenInstrInfo.inc" -} // namespace RISCVOpcode - -/** AArch64 opcodes. Each opcode represents a unique AArch64 operation. 
*/ -namespace AARCH64Opcode { -#define GET_INSTRINFO_ENUM -#include "AArch64GenInstrInfo.inc" -} // namespace AARCH64Opcode - -ModelConfig::ModelConfig(std::string path) { - // Ensure the file exists - std::ifstream file(path); - if (!file.is_open()) { - std::cerr << "[SimEng:ModelConfig] Could not read " << path << std::endl; - exit(1); - } - file.close(); - - // Read in the config file - configFile_ = YAML::LoadFile(path); - - // Check if the config file inherits values from a base config - inherit(); - - // Validate the inputted config file - validate(); -} - -YAML::Node ModelConfig::getConfigFile() { return configFile_; } - -void ModelConfig::inherit() { - // Check if the config file includes a inheritted file - if (!configFile_["Inherit-From"]) { - return; - } else { - std::cerr << "[SimEng:ModelConfig] Config inheritance not yet supported" - << std::endl; - exit(1); - // TODO: Merge files - } - return; -} - -void ModelConfig::validate() { - // Loop through expected fields and ensure a valid value exists - std::vector subFields; - std::string root = ""; - int validISA; - - // Core - root = "Core"; - subFields = {"ISA", - "Simulation-Mode", - "Clock-Frequency", - "Timer-Frequency", - "Micro-Operations", - "Vector-Length", - "Streaming-Vector-Length"}; - validISA = nodeChecker( - configFile_[root][subFields[0]], subFields[0], - std::vector({"AArch64", "rv64"}), ExpectedValue::String); - nodeChecker(configFile_[root][subFields[1]], subFields[1], - {"emulation", "inorderpipelined", "outoforder"}, - ExpectedValue::String); - nodeChecker(configFile_[root][subFields[2]], subFields[2], - std::make_pair(0.f, 10.f), ExpectedValue::Float); - nodeChecker(configFile_[root][subFields[3]], subFields[3], - std::make_pair(1, UINT32_MAX), ExpectedValue::UInteger, - 100); - nodeChecker(configFile_[root][subFields[4]], subFields[4], - std::make_pair(false, true), ExpectedValue::Bool, false); - nodeChecker(configFile_[root][subFields[5]], subFields[5], - {128, 256, 384, 512, 640, 768, 896, 1024, 1152, 1280, - 1408, 1536, 1664, 1792, 1920, 2048}, - ExpectedValue::UInteger, 512); - nodeChecker(configFile_[root][subFields[6]], subFields[6], - {128, 256, 384, 512, 640, 768, 896, 1024, 1152, 1280, - 1408, 1536, 1664, 1792, 1920, 2048}, - ExpectedValue::UInteger, 512); - subFields.clear(); - - // First check that the ISA config option is valid, this protects reads from - // the ISA config option as well as everything that depends on them. 
This - // includes uses of groupOptions_ and groupMapping_ as these are dependent on - // the ISA - if (validISA == 1) { - // Generate groupOptions_ and groupMapping_ - createGroupMapping(); - - // Ports - std::vector portNames; - std::map portLinked; - root = "Ports"; - size_t num_ports = configFile_[root].size(); - if (!num_ports) { - missing_ << "\t- " << root << "\n"; - } - for (size_t i = 0; i < num_ports; i++) { - YAML::Node port_node = configFile_[root][i]; - // Get port number into a string format - char port_msg[10]; - sprintf(port_msg, "Port %zu ", i); - std::string port_num = std::string(port_msg); - // Check for existence of Portname field and record name - if (nodeChecker(port_node["Portname"], port_num + "Portname", - std::vector{}, - ExpectedValue::String)) { - std::string name = port_node["Portname"].as(); - // Ensure port name is unique - if (std::find(portNames.begin(), portNames.end(), name) == - portNames.end()) { - portNames.push_back(name); - portLinked.insert({name, false}); - } else { - invalid_ << "\t- " << port_num << "name \"" << name - << "\" already used\n"; - } - } - // Check for existence of Instruction-Support field - if (!(port_node["Instruction-Support"].IsDefined()) || - port_node["Instruction-Support"].IsNull()) { - missing_ << "\t- " << port_num << "Instruction-Support\n"; - continue; - } - uint16_t groupIndex = 0; - uint16_t opcodeIndex = 0; - for (size_t j = 0; j < port_node["Instruction-Support"].size(); j++) { - YAML::Node group = port_node["Instruction-Support"][j]; - // Get group number into a string format - char group_msg[10]; - sprintf(group_msg, "Group %zu ", j); - std::string group_num = std::string(group_msg); - // Check for existence of instruction group - if (group.as()[0] == '~') { - // Extract opcode and store in config option - uint16_t opcode = std::stoi(group.as().substr( - 1, group.as().size())); - configFile_["Ports"][i]["Instruction-Opcode-Support"][opcodeIndex] = - opcode; - if (configFile_["Core"]["ISA"].as() == "rv64") { - // Ensure opcode is between the bounds of 0 and Capstones' - // RISCV_INSTRUCTION_LIST_END - boundChecker( - configFile_["Ports"][i]["Instruction-Opcode-Support"] - [opcodeIndex], - port_num + group_num, - std::make_pair(0, static_cast( - RISCVOpcode::RISCV_INSTRUCTION_LIST_END)), - ExpectedValue::UInteger); - } else if (configFile_["Core"]["ISA"].as() == - "AArch64") { - // Ensure opcode is between the bounds of 0 and Capstones' - // AArch64_INSTRUCTION_LIST_END - boundChecker( - configFile_["Ports"][i]["Instruction-Opcode-Support"] - [opcodeIndex], - port_num + group_num, - std::make_pair( - 0, static_cast( - AARCH64Opcode::AArch64_INSTRUCTION_LIST_END)), - ExpectedValue::UInteger); - } - - opcodeIndex++; - } else if (nodeChecker(group, port_num + group_num, - groupOptions_, - ExpectedValue::String)) { - configFile_["Ports"][i]["Instruction-Group-Support"][groupIndex] = - unsigned(groupMapping_[group.as()]); - groupIndex++; - } - } - } - - // Reservation-Stations - root = "Reservation-Stations"; - size_t num_rs = configFile_[root].size(); - if (!num_rs) { - missing_ << "\t- " << root << "\n"; - } - for (size_t i = 0; i < num_rs; i++) { - YAML::Node rs = configFile_[root][i]; - // Get rs number into a string format - char rs_msg[25]; - sprintf(rs_msg, "Reservation Station %zu ", i); - std::string rs_num = std::string(rs_msg); - nodeChecker(rs["Size"], rs_num + "Size", - std::make_pair(1, UINT16_MAX), - ExpectedValue::UInteger); - nodeChecker(rs["Dispatch-Rate"], rs_num + "Dispatch-Rate", - std::make_pair(1, 
UINT16_MAX), - ExpectedValue::UInteger); - // Check for existence of Ports field - if (!(rs["Ports"].IsDefined()) || rs["Ports"].IsNull()) { - missing_ << "\t- " << rs_num << "Ports\n"; - continue; - } - for (size_t j = 0; j < rs["Ports"].size(); j++) { - YAML::Node port_node = rs["Ports"][j]; - // Get port index into a string format - char port_msg[25]; - sprintf(port_msg, "Port %zu ", j); - std::string port_num = std::string(port_msg); - if (nodeChecker(port_node, rs_num + port_num + "Portname", - portNames, ExpectedValue::String)) { - // Change port name to port index - for (size_t k = 0; k < portNames.size(); k++) { - if (port_node.as() == portNames[k]) { - configFile_["Reservation-Stations"][i]["Ports"][j] = unsigned(k); - portLinked[portNames[k]] = true; - break; - } - } - } - } - } - // Ensure all ports have an associated reservation station - for (auto& port : portLinked) { - if (!port.second) { - missing_ << "\t- " << port.first - << " has no associated reservation station\n"; - } - } - - // TODO make as many subfields as possible generic to avoid repeated code - // e.g. AArch64 FloatingPoint/SVE-Count -> FloatingPoint-Count - if (configFile_["Core"]["ISA"].as() == "rv64") { - // Register-Set - root = "Register-Set"; - subFields = {"GeneralPurpose-Count", "FloatingPoint-Count"}; - nodeChecker(configFile_[root][subFields[0]], subFields[0], - std::make_pair(32, UINT16_MAX), - ExpectedValue::UInteger); - nodeChecker(configFile_[root][subFields[1]], subFields[1], - std::make_pair(32, UINT16_MAX), - ExpectedValue::UInteger); - } else if (configFile_["Core"]["ISA"].as() == "AArch64") { - // Register-Set - root = "Register-Set"; - subFields = {"GeneralPurpose-Count", "FloatingPoint/SVE-Count", - "Predicate-Count", "Conditional-Count", "Matrix-Count"}; - nodeChecker(configFile_[root][subFields[0]], subFields[0], - std::make_pair(32, UINT16_MAX), - ExpectedValue::UInteger); - nodeChecker(configFile_[root][subFields[1]], subFields[1], - std::make_pair(32, UINT16_MAX), - ExpectedValue::UInteger); - nodeChecker(configFile_[root][subFields[2]], subFields[2], - std::make_pair(17, UINT16_MAX), - ExpectedValue::UInteger, 17); - nodeChecker(configFile_[root][subFields[3]], subFields[3], - std::make_pair(1, UINT16_MAX), - ExpectedValue::UInteger); - nodeChecker(configFile_[root][subFields[4]], subFields[4], - std::make_pair(1, UINT16_MAX), - ExpectedValue::UInteger, 1); - } - - subFields.clear(); - - // Execution-Units - root = "Execution-Units"; - subFields = {"Pipelined", "Blocking-Groups"}; - size_t num_units = configFile_[root].size(); - if (!num_units) { - missing_ << "\t- " << root << "\n"; - } else if (num_ports != num_units) { - invalid_ - << "\t- Number of issue ports and execution units should be equal\n"; - } - for (size_t i = 0; i < num_units; i++) { - char euNum[50]; - sprintf(euNum, "Execution Unit %zu ", i); - YAML::Node euNode = configFile_[root][i]; - nodeChecker(configFile_[root][i][subFields[0]], - (std::string(euNum) + subFields[0]), - std::vector{false, true}, ExpectedValue::Bool); - if (euNode[subFields[1]].IsDefined() && - !(euNode[subFields[1]].IsNull())) { - // Compile set of blocking groups into a queue - std::queue blockingGroups; - for (size_t j = 0; j < euNode[subFields[1]].size(); j++) { - char bgNum[50]; - sprintf(bgNum, "Blocking group %zu", j); - if (nodeChecker( - configFile_[root][i][subFields[1]][j], - (std::string(euNum) + std::string(bgNum)), groupOptions_, - ExpectedValue::String)) { - uint16_t mappedGroup = - groupMapping_[euNode[subFields[1]][j].as()]; - 
blockingGroups.push(mappedGroup); - configFile_["Execution-Units"][i]["Blocking-Groups"][j] = - mappedGroup; - } - } - // Expand set of blocking groups to include those that inherit from the - // user defined set - uint16_t config_index = - configFile_["Execution-Units"][i]["Blocking-Groups"].size(); - while (blockingGroups.size()) { - // Determine if there's any inheritance - if (arch::aarch64::groupInheritance.find(blockingGroups.front()) != - arch::aarch64::groupInheritance.end()) { - std::vector inheritedGroups = - arch::aarch64::groupInheritance.at(blockingGroups.front()); - for (int k = 0; k < inheritedGroups.size(); k++) { - blockingGroups.push(inheritedGroups[k]); - configFile_["Execution-Units"][i]["Blocking-Groups"] - [config_index] = inheritedGroups[k]; - config_index++; - } - } - blockingGroups.pop(); - } - } - } - subFields.clear(); - } - - // Fetch - root = "Fetch"; - subFields = {"Fetch-Block-Size", "Loop-Buffer-Size", - "Loop-Detection-Threshold"}; - if (nodeChecker(configFile_[root][subFields[0]], subFields[0], - std::make_pair(4, UINT16_MAX), - ExpectedValue::UInteger)) { - uint16_t block_size = configFile_[root][subFields[0]].as(); - // Ensure fetch block size is a power of 2 - if ((block_size & (block_size - 1)) == 0) { - uint8_t alignment_bits = log2(block_size); - configFile_[root]["Fetch-Block-Alignment-Bits"] = - unsigned(alignment_bits); - } else { - invalid_ << "\t- Fetch-Block-Size must be a power of 2\n"; - } - } - nodeChecker(configFile_[root][subFields[1]], subFields[1], - std::make_pair(0, UINT16_MAX), ExpectedValue::UInteger); - nodeChecker(configFile_[root][subFields[2]], subFields[2], - std::make_pair(0, UINT16_MAX), ExpectedValue::UInteger); - subFields.clear(); - - // Process-Image - root = "Process-Image"; - subFields = {"Heap-Size", "Stack-Size"}; - // Default heap size is 1024 * 1024 * 10 = 10MiB - nodeChecker(configFile_[root][subFields[0]], subFields[0], - std::make_pair(1, UINT64_MAX), ExpectedValue::UInteger, - 10485760); - // Default stack size is 1024 * 1024 = 1MiB - nodeChecker(configFile_[root][subFields[1]], subFields[1], - std::make_pair(1, UINT64_MAX), ExpectedValue::UInteger, - 1048576); - subFields.clear(); - - // Branch-Predictor - root = "Branch-Predictor"; - subFields = {"BTB-Tag-Bits", "Saturating-Count-Bits", "Global-History-Length", - "RAS-entries", "Fallback-Static-Predictor"}; - nodeChecker(configFile_[root][subFields[0]], subFields[0], - std::make_pair(1, UINT64_MAX), ExpectedValue::UInteger); - nodeChecker(configFile_[root][subFields[2]], subFields[2], - std::make_pair(0, 64), ExpectedValue::UInteger); - nodeChecker(configFile_[root][subFields[3]], subFields[3], - std::make_pair(1, UINT64_MAX), ExpectedValue::UInteger); - if (nodeChecker( - configFile_[root][subFields[4]], subFields[4], - std::vector{"Always-Taken", "Always-Not-Taken"}, - ExpectedValue::String)) { - // If the Saturating-Count-Bits option is valid, set fallback static - // prediction to weakest value of the specific direction (i.e weakly taken - // or weakly not-taken) - if (nodeChecker(configFile_[root][subFields[1]], subFields[1], - std::make_pair(1, UINT64_MAX), - ExpectedValue::UInteger)) { - // Calculate saturation counter boundary between weakly taken and - // not-taken. 
`(2 ^ num_sat_cnt_bits) / 2` gives the weakly taken state - // value - uint16_t weaklyTaken = - std::pow(2, (configFile_[root][subFields[1]].as() - 1)); - // Swap Fallback-Static-Predictor scheme out for equivalent saturating - // counter value - configFile_[root][subFields[4]] = - (configFile_[root][subFields[4]].as() == "Always-Taken") - ? weaklyTaken - : (weaklyTaken - 1); - } - } - subFields.clear(); - - // Data Memory - root = "L1-Data-Memory"; - subFields = {"Interface-Type"}; - nodeChecker( - configFile_[root][subFields[0]], root + " " + subFields[0], - std::vector{"Flat", "Fixed", "External"}, - ExpectedValue::String); - // Currently, fixed instruction memory interfaces are unsupported for - // emulation and inorder simulation modes - if (configFile_[root][subFields[0]].as() != "Flat") { - std::string mode = configFile_["Core"]["Simulation-Mode"].as(); - if (mode == "emulation" || mode == "inorderpipelined") { - invalid_ << "\t- Non-Flat data memory interface types are " - "currently unsupported for 'emulation' and " - "'inorderpipelined' simulation modes\n"; - } - } - - // Instruction Memory - root = "L1-Instruction-Memory"; - subFields = {"Interface-Type"}; - nodeChecker( - configFile_[root][subFields[0]], root + " " + subFields[0], - std::vector{"Flat", "Fixed", "External"}, - ExpectedValue::String); - // Currently, fixed instruction memory interfaces are unsupported - if (configFile_[root][subFields[0]].as() != "Flat") { - invalid_ << "\t- Non-Flat instruction memory interface types are currently " - "unsupported\n"; - } - - // LSQ-L1-Interface - root = "LSQ-L1-Interface"; - subFields = {"Access-Latency", - "Exclusive", - "Load-Bandwidth", - "Store-Bandwidth", - "Permitted-Requests-Per-Cycle", - "Permitted-Loads-Per-Cycle", - "Permitted-Stores-Per-Cycle"}; - nodeChecker(configFile_[root][subFields[0]], subFields[0], - std::make_pair(1, UINT16_MAX), ExpectedValue::UInteger, - 1); - nodeChecker(configFile_[root][subFields[1]], subFields[1], - std::vector{true, false}, ExpectedValue::Bool, false); - nodeChecker(configFile_[root][subFields[2]], subFields[2], - std::make_pair(1, UINT16_MAX), ExpectedValue::UInteger, - UINT16_MAX); - nodeChecker(configFile_[root][subFields[3]], subFields[3], - std::make_pair(1, UINT16_MAX), ExpectedValue::UInteger, - UINT16_MAX); - nodeChecker(configFile_[root][subFields[4]], subFields[4], - std::make_pair(1, UINT16_MAX), ExpectedValue::UInteger, - UINT16_MAX); - nodeChecker(configFile_[root][subFields[5]], subFields[5], - std::make_pair(1, UINT16_MAX), ExpectedValue::UInteger, - UINT16_MAX); - nodeChecker(configFile_[root][subFields[6]], subFields[6], - std::make_pair(1, UINT16_MAX), ExpectedValue::UInteger, - UINT16_MAX); - subFields.clear(); - - // Queue-Sizes - root = "Queue-Sizes"; - subFields = {"ROB", "Load", "Store"}; - nodeChecker(configFile_[root][subFields[0]], subFields[0], - std::make_pair(1, UINT_MAX), - ExpectedValue::UInteger); - nodeChecker(configFile_[root][subFields[1]], subFields[1], - std::make_pair(1, UINT_MAX), - ExpectedValue::UInteger); - nodeChecker(configFile_[root][subFields[2]], subFields[2], - std::make_pair(1, UINT_MAX), - ExpectedValue::UInteger); - subFields.clear(); - - // Pipeline-Widths - root = "Pipeline-Widths"; - subFields = {"Commit", "FrontEnd", "LSQ-Completion"}; - nodeChecker(configFile_[root][subFields[0]], subFields[0], - std::make_pair(1, UINT_MAX), - ExpectedValue::UInteger); - nodeChecker(configFile_[root][subFields[1]], subFields[1], - std::make_pair(1, UINT_MAX), - ExpectedValue::UInteger); - 
nodeChecker(configFile_[root][subFields[2]], subFields[2], - std::make_pair(1, UINT_MAX), - ExpectedValue::UInteger); - subFields.clear(); - - // Latencies - root = "Latencies"; - subFields = {"Instruction-Groups", "Execution-Latency", - "Execution-Throughput"}; - for (size_t i = 0; i < configFile_[root].size(); i++) { - char latNum[50]; - sprintf(latNum, "Latency group %zu ", i); - YAML::Node latNode = configFile_[root][i]; - YAML::Node grpNode = latNode[subFields[0]]; - if (grpNode.IsDefined() && !(grpNode.IsNull())) { - uint16_t groupIndex = 0; - uint16_t opcodeIndex = 0; - for (size_t j = 0; j < grpNode.size(); j++) { - char grpNum[50]; - sprintf(grpNum, "Instruction group %zu ", j); - // Determine whether the value is an opcode or an instruction-group - // value - if (grpNode[j].as()[0] == '~') { - // Extract opcode and store in config option - uint16_t opcode = std::stoi(grpNode[j].as().substr( - 1, grpNode[j].as().size())); - configFile_[root][i]["Instruction-Opcode"][opcodeIndex] = opcode; - // Ensure opcode is between the bounds of 0 and Capstones' - // AArch64_INSTRUCTION_LIST_END - boundChecker(configFile_[root][i]["Instruction-Opcode"][opcodeIndex], - (std::string(latNum) + std::string(grpNum)), - std::make_pair(0, 4516), ExpectedValue::UInteger); - opcodeIndex++; - } else if (nodeChecker( - grpNode[j], (std::string(latNum) + std::string(grpNum)), - groupOptions_, ExpectedValue::String)) { - // Map latency Instruction-Group to integer value - configFile_[root][i]["Instruction-Group"][groupIndex] = - groupMapping_[grpNode[j].as()]; - groupIndex++; - } - } - } else { - missing_ << "\t- " << (std::string(latNum) + subFields[0]) << "\n"; - } - nodeChecker( - latNode[subFields[1]], (std::string(latNum) + subFields[1]), - std::make_pair(1, UINT16_MAX), ExpectedValue::UInteger); - nodeChecker( - latNode[subFields[2]], (std::string(latNum) + subFields[2]), - std::make_pair(1, UINT16_MAX), ExpectedValue::UInteger); - } - subFields.clear(); - - // CPU-Info - root = "CPU-Info"; - subFields = {"Generate-Special-Dir", - "Core-Count", - "Socket-Count", - "SMT", - "BogoMIPS", - "Features", - "CPU-Implementer", - "CPU-Architecture", - "CPU-Variant", - "CPU-Part", - "CPU-Revision", - "Package-Count"}; - nodeChecker(configFile_[root][subFields[0]], subFields[0], - std::vector{false, true}, ExpectedValue::Bool, false); - nodeChecker(configFile_[root][subFields[1]], subFields[1], - std::make_pair(1, UINT_MAX), - ExpectedValue::UInteger, 1); - nodeChecker(configFile_[root][subFields[2]], subFields[2], - std::make_pair(1, UINT_MAX), - ExpectedValue::UInteger, 1); - nodeChecker(configFile_[root][subFields[3]], subFields[3], - std::make_pair(1, UINT_MAX), - ExpectedValue::UInteger, 1); - nodeChecker(configFile_[root][subFields[4]], subFields[4], - std::make_pair(0.0f, std::numeric_limits::max()), - ExpectedValue::Float, 0.0f); - nodeChecker(configFile_[root][subFields[5]], subFields[5], - std::vector(), ExpectedValue::String, - ""); - nodeChecker(configFile_[root][subFields[6]], subFields[6], // - std::vector(), ExpectedValue::String, - "0x0"); - nodeChecker(configFile_[root][subFields[7]], subFields[7], - std::make_pair(0, UINT_MAX), - ExpectedValue::UInteger, 0); - nodeChecker(configFile_[root][subFields[8]], subFields[8], // - std::vector(), ExpectedValue::String, - "0x0"); - nodeChecker(configFile_[root][subFields[9]], subFields[9], // - std::vector(), ExpectedValue::String, - "0x0"); - nodeChecker(configFile_[root][subFields[10]], subFields[10], - std::make_pair(0, UINT_MAX), - 
ExpectedValue::UInteger, 0x0); - if (nodeChecker(configFile_[root][subFields[11]], subFields[11], - std::make_pair(1, UINT_MAX), - ExpectedValue::UInteger, 1)) { - uint64_t package_count = configFile_[root][subFields[11]].as(); - uint64_t core_count = configFile_[root][subFields[1]].as(); - // Ensure package_count size is a less than or equal to the core count, and - // that the core count can be divided by the package count - if (!((package_count <= core_count) && (core_count % package_count == 0))) { - invalid_ - << "\t- Package-Count must be a Less-than or equal to Core-Count, " - "and Core-Count must be divisible by Package-Count."; - } - } - subFields.clear(); - - std::string missingStr = missing_.str(); - std::string invalidStr = invalid_.str(); - // Print all missing fields - if (missingStr.length()) { - std::cerr << "[SimEng:ModelConfig] The following fields are missing from " - "the provided " - "configuration file:\n" - << missingStr << std::endl; - } - // Print all invalid values - if (invalidStr.length()) { - std::cerr << "[SimEng:ModelConfig] The following values are invalid for " - "their associated field:\n" - << invalidStr << std::endl; - } - if (missingStr.length() || invalidStr.length()) exit(1); - return; -} - -void ModelConfig::createGroupMapping() { - if (configFile_["Core"]["ISA"].as() == "AArch64") { - groupOptions_ = {"INT", - "INT_SIMPLE", - "INT_SIMPLE_ARTH", - "INT_SIMPLE_ARTH_NOSHIFT", - "INT_SIMPLE_LOGICAL", - "INT_SIMPLE_LOGICAL_NOSHIFT", - "INT_SIMPLE_CMP", - "INT_SIMPLE_CVT", - "INT_MUL", - "INT_DIV_OR_SQRT", - "LOAD_INT", - "STORE_ADDRESS_INT", - "STORE_DATA_INT", - "STORE_INT", - "FP", - "FP_SIMPLE", - "FP_SIMPLE_ARTH", - "FP_SIMPLE_ARTH_NOSHIFT", - "FP_SIMPLE_LOGICAL", - "FP_SIMPLE_LOGICAL_NOSHIFT", - "FP_SIMPLE_CMP", - "FP_SIMPLE_CVT", - "FP_MUL", - "FP_DIV_OR_SQRT", - "SCALAR", - "SCALAR_SIMPLE", - "SCALAR_SIMPLE_ARTH", - "SCALAR_SIMPLE_ARTH_NOSHIFT", - "SCALAR_SIMPLE_LOGICAL", - "SCALAR_SIMPLE_LOGICAL_NOSHIFT", - "SCALAR_SIMPLE_CMP", - "SCALAR_SIMPLE_CVT", - "SCALAR_MUL", - "SCALAR_DIV_OR_SQRT", - "LOAD_SCALAR", - "STORE_ADDRESS_SCALAR", - "STORE_DATA_SCALAR", - "STORE_SCALAR", - "VECTOR", - "VECTOR_SIMPLE", - "VECTOR_SIMPLE_ARTH", - "VECTOR_SIMPLE_ARTH_NOSHIFT", - "VECTOR_SIMPLE_LOGICAL", - "VECTOR_SIMPLE_LOGICAL_NOSHIFT", - "VECTOR_SIMPLE_CMP", - "VECTOR_SIMPLE_CVT", - "VECTOR_MUL", - "VECTOR_DIV_OR_SQRT", - "LOAD_VECTOR", - "STORE_ADDRESS_VECTOR", - "STORE_DATA_VECTOR", - "STORE_VECTOR", - "SVE", - "SVE_SIMPLE", - "SVE_SIMPLE_ARTH", - "SVE_SIMPLE_ARTH_NOSHIFT", - "SVE_SIMPLE_LOGICAL", - "SVE_SIMPLE_LOGICAL_NOSHIFT", - "SVE_SIMPLE_CMP", - "SVE_SIMPLE_CVT", - "SVE_MUL", - "SVE_DIV_OR_SQRT", - "LOAD_SVE", - "STORE_ADDRESS_SVE", - "STORE_DATA_SVE", - "STORE_SVE", - "PREDICATE", - "LOAD", - "STORE_ADDRESS", - "STORE_DATA", - "STORE", - "BRANCH", - "SME", - "SME_SIMPLE", - "SME_SIMPLE_ARTH", - "SME_SIMPLE_ARTH_NOSHIFT", - "SME_SIMPLE_LOGICAL", - "SME_SIMPLE_LOGICAL_NOSHIFT", - "SME_SIMPLE_CMP", - "SME_SIMPLE_CVT", - "SME_MUL", - "SME_DIV_OR_SQRT", - "LOAD_SME", - "STORE_ADDRESS_SME", - "STORE_DATA_SME", - "STORE_SME"}; - } else if (configFile_["Core"]["ISA"].as() == "rv64") { - groupOptions_ = {"INT", - "INT_SIMPLE", - "INT_SIMPLE_ARTH", - "INT_SIMPLE_CMP", - "INT_SIMPLE_LOGICAL", - "INT_SIMPLE_SHIFT", - "INT_MUL", - "INT_DIV", - "LOAD_INT", - "STORE_INT", - "LOAD", - "STORE", - "BRANCH"}; - } - // ISA instruction group namespaces contain a set of contiguous assigned - // uint16_t starting from 0. 
Therefore, the index of each groupOptions_ entry - // is also its ::InstructionGroups value (assuming groupOptions_ is - // ordered exactly as ::InstructionGroups is). - for (int grp = 0; grp < groupOptions_.size(); grp++) { - groupMapping_[groupOptions_[grp]] = grp; - } -} - -template -int ModelConfig::nodeChecker(const YAML::Node& node, const std::string& field, - const std::vector& value_set, - uint8_t expected) { - // Check for the existence of the given node - if (!(node.IsDefined()) || node.IsNull()) { - missing_ << "\t- " << field << "\n"; - return 0; - } - - return setChecker(node, field, value_set, expected); -} - -template -int ModelConfig::nodeChecker(YAML::Node node, const std::string& field, - const std::vector& value_set, uint8_t expected, - T default_value) { - // Check for the existence of the given node - if (!(node.IsDefined()) || node.IsNull()) { - node = default_value; - return 1; - } - - return setChecker(node, field, value_set, expected); -} - -template -int ModelConfig::nodeChecker(const YAML::Node& node, const std::string& field, - const std::pair& bounds, uint8_t expected) { - // Check for the existence of the given node - if (!(node.IsDefined()) || node.IsNull()) { - missing_ << "\t- " << field << "\n"; - return 0; - } - - return boundChecker(node, field, bounds, expected); -} - -template -int ModelConfig::nodeChecker(YAML::Node node, const std::string& field, - const std::pair& bounds, uint8_t expected, - const T& default_value) { - // Check for the existence of the given node - if (!(node.IsDefined()) || node.IsNull()) { - node = default_value; - return 1; - } - - return boundChecker(node, field, bounds, expected); -} - -} // namespace simeng diff --git a/src/lib/RegisterFileSet.cc b/src/lib/RegisterFileSet.cc index c48e3c3b75..a195af1bca 100644 --- a/src/lib/RegisterFileSet.cc +++ b/src/lib/RegisterFileSet.cc @@ -4,17 +4,6 @@ namespace simeng { -std::ostream& operator<<(std::ostream& os, Register const& reg) { - return os << reg.tag; -} - -bool Register::operator==(const Register& other) const { - return (other.type == type && other.tag == tag); -} -bool Register::operator!=(const Register& other) const { - return !(other == *this); -} - RegisterFileSet::RegisterFileSet( std::vector registerFileStructures) : registerFiles(registerFileStructures.size()) { diff --git a/src/lib/SpecialFileDirGen.cc b/src/lib/SpecialFileDirGen.cc index 0a3d937117..6360eb536b 100644 --- a/src/lib/SpecialFileDirGen.cc +++ b/src/lib/SpecialFileDirGen.cc @@ -4,58 +4,110 @@ namespace simeng { -SpecialFileDirGen::SpecialFileDirGen(YAML::Node config) { - // Import all values from config file - core_count = config["CPU-Info"]["Core-Count"].as(); - socket_count = config["CPU-Info"]["Socket-Count"].as(); - smt = config["CPU-Info"]["SMT"].as(); - bogoMIPS = config["CPU-Info"]["BogoMIPS"].as(); - features = config["CPU-Info"]["Features"].as(); - cpu_implementer = config["CPU-Info"]["CPU-Implementer"].as(); - cpu_architecture = config["CPU-Info"]["CPU-Architecture"].as(); - cpu_variant = config["CPU-Info"]["CPU-Variant"].as(); - cpu_part = config["CPU-Info"]["CPU-Part"].as(); - cpu_revision = config["CPU-Info"]["CPU-Revision"].as(); - package_count = config["CPU-Info"]["Package-Count"].as(); +// Wrapper around calls to "system(command)". Checks that a shell is available +// before calling "system" and checking the output for any issues. 
+// ensureExitSuccess is used to check for a successful termination status (0) +// from the child shell, defaults to true +int systemWrapper(const std::string& command, + const bool ensureExitSuccess = true) { + // Check that there is a shell available + if (!system(NULL)) { + std::cerr + << "[SimEng:SpecialFileDirGen] Shell unavailable, can't call system" + << std::endl; + exit(EXIT_FAILURE); + } + + int output = system(command.c_str()); + + if (output == -1) { + std::cerr << "[SimEng:SpecialFileDirGen] Child process could not be " + "created, or its status could " + "not be retrieved. errno = " + << errno << std::endl; + exit(EXIT_FAILURE); + } else if (WIFEXITED(output) && WEXITSTATUS(output) == 127) { + std::cerr << "[SimEng:SpecialFileDirGen] Shell command could not be " + "executed in child shell" + << std::endl; + exit(EXIT_FAILURE); + } else { + if (ensureExitSuccess) { + if (WIFEXITED(output) && WEXITSTATUS(output) == 0) { + // Success + return output; + } else if (WIFSIGNALED(output)) { + std::cerr << "[SimEng:SpecialFileDirGen] Child process terminated by " + "signal: " + << WTERMSIG(output) << " when running command: " << command + << std::endl; + } else { + // Macros providing more information can be found in "man 2 waitpid" + std::cerr << "[SimEng:SpecialFileDirGen] Call to system(" << command + << ") returned failure. Return value: " << output + << ", if exited: " << WIFEXITED(output) + << " , exit status: " << WEXITSTATUS(output) << std::endl; + } + exit(EXIT_FAILURE); + } + + // Success + return output; + } } +SpecialFileDirGen::SpecialFileDirGen(ryml::ConstNodeRef config) + : specialFilesDir_( + config["CPU-Info"]["Special-File-Dir-Path"].as()), + coreCount_(config["CPU-Info"]["Core-Count"].as()), + socketCount_(config["CPU-Info"]["Socket-Count"].as()), + smt_(config["CPU-Info"]["SMT"].as()), + bogoMIPS_(config["CPU-Info"]["BogoMIPS"].as()), + features_(config["CPU-Info"]["Features"].as()), + cpuImplementer_(config["CPU-Info"]["CPU-Implementer"].as()), + cpuArchitecture_(config["CPU-Info"]["CPU-Architecture"].as()), + cpuVariant_(config["CPU-Info"]["CPU-Variant"].as()), + cpuPart_(config["CPU-Info"]["CPU-Part"].as()), + cpuRevision_(config["CPU-Info"]["CPU-Revision"].as()), + packageCount_(config["CPU-Info"]["Package-Count"].as()) {} + void SpecialFileDirGen::RemoveExistingSFDir() { const std::string exist_input = "[ ! 
-d " + specialFilesDir_ + " ]"; - if (system(exist_input.c_str())) { + if (systemWrapper(exist_input, false)) { const std::string rm_input = "rm -r " + specialFilesDir_; - system(rm_input.c_str()); + systemWrapper(rm_input); } - const std::string mk_input = "mkdir " + specialFilesDir_; - system(mk_input.c_str()); return; } void SpecialFileDirGen::GenerateSFDir() { + // Create root special files directory + systemWrapper("mkdir -p " + specialFilesDir_); // Define frequently accessed root directories in special file tree const std::string proc_dir = specialFilesDir_ + "/proc/"; const std::string online_dir = specialFilesDir_ + "/sys/devices/system/cpu/"; const std::string cpu_base_dir = specialFilesDir_ + "/sys/devices/system/cpu/cpu"; - system(("mkdir " + proc_dir).c_str()); - system(("mkdir " + specialFilesDir_ + "/sys/").c_str()); - system(("mkdir " + specialFilesDir_ + "/sys/devices/").c_str()); - system(("mkdir " + specialFilesDir_ + "/sys/devices/system/").c_str()); - system(("mkdir " + online_dir).c_str()); + systemWrapper("mkdir " + proc_dir); + systemWrapper("mkdir " + specialFilesDir_ + "/sys/"); + systemWrapper("mkdir " + specialFilesDir_ + "/sys/devices/"); + systemWrapper("mkdir " + specialFilesDir_ + "/sys/devices/system/"); + systemWrapper("mkdir " + online_dir); // Create '/proc/cpuinfo' file. std::ofstream cpuinfo_File(proc_dir + "cpuinfo"); - for (int i = 0; i < core_count * socket_count * smt; i++) { + for (uint64_t i = 0; i < coreCount_ * socketCount_ * smt_; i++) { cpuinfo_File << "processor\t: " + std::to_string(i) + "\nBogoMIPS\t: " + - std::to_string(bogoMIPS).erase( - std::to_string(bogoMIPS).length() - 4) + - "\nFeatures\t: " + features + - "\nCPU implementer\t: " + cpu_implementer + + std::to_string(bogoMIPS_).erase( + std::to_string(bogoMIPS_).length() - 4) + + "\nFeatures\t: " + features_ + + "\nCPU implementer\t: " + cpuImplementer_ + "\nCPU architecture: " + - std::to_string(cpu_architecture) + - "\nCPU variant\t: " + cpu_variant + - "\nCPU part\t: " + cpu_part + - "\nCPU revision\t: " + std::to_string(cpu_revision) + + std::to_string(cpuArchitecture_) + + "\nCPU variant\t: " + cpuVariant_ + + "\nCPU part\t: " + cpuPart_ + + "\nCPU revision\t: " + std::to_string(cpuRevision_) + "\n\n"; } cpuinfo_File.close(); @@ -63,7 +115,7 @@ void SpecialFileDirGen::GenerateSFDir() { // Create '/proc/stat' file. std::ofstream stat_File(proc_dir + "stat"); stat_File << "cpu 0 0 0 0 0 0 0 0 0 0\n"; - for (int i = 0; i < core_count * socket_count * smt; i++) { + for (uint64_t i = 0; i < coreCount_ * socketCount_ * smt_; i++) { stat_File << "cpu" + std::to_string(i) + " 0 0 0 0 0 0 0 0 0 0\n"; } stat_File << "intr 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 " @@ -83,40 +135,39 @@ void SpecialFileDirGen::GenerateSFDir() { // Create '/sys/devices/system/cpu/online' file. std::ofstream online_File(online_dir + "online"); - online_File << "0-" + std::to_string(core_count * socket_count * smt - 1) + + online_File << "0-" + std::to_string(coreCount_ * socketCount_ * smt_ - 1) + "\n"; online_File.close(); // Create sub directory for each CPU core and required files. 
- for (int i = 0; i < core_count * socket_count * smt; i++) { - system(("mkdir " + cpu_base_dir + std::to_string(i) + "/").c_str()); - system( - ("mkdir " + cpu_base_dir + std::to_string(i) + "/topology/").c_str()); + for (uint64_t i = 0; i < coreCount_ * socketCount_ * smt_; i++) { + systemWrapper("mkdir " + cpu_base_dir + std::to_string(i) + "/"); + systemWrapper("mkdir " + cpu_base_dir + std::to_string(i) + "/topology/"); } // Create '/sys/devices/system/cpu/cpuX/topology/{core_id, // physical_package_id}' files - uint64_t cores_per_package = core_count / package_count; + uint64_t cores_per_package = coreCount_ / packageCount_; uint64_t current_package_id = 0; - for (int s = 0; s < socket_count; s++) { - for (int c = 0; c < core_count; c++) { + for (uint64_t s = 0; s < socketCount_; s++) { + for (uint64_t c = 0; c < coreCount_; c++) { if (c % cores_per_package == 0 && c != 0) { current_package_id += 1; } - for (int t = 0; t < smt; t++) { + for (uint64_t t = 0; t < smt_; t++) { // core_id File generation std::ofstream core_id_file( cpu_base_dir + - std::to_string(c + (t * core_count) + (s * smt * core_count)) + + std::to_string(c + (t * coreCount_) + (s * smt_ * coreCount_)) + "/topology/core_id"); core_id_file << (c % cores_per_package) + - (s * core_count * socket_count * smt); + (s * coreCount_ * socketCount_ * smt_); core_id_file.close(); // physical_package_id File generation std::ofstream phys_package_id_file( cpu_base_dir + - std::to_string(c + (t * core_count) + (s * smt * core_count)) + + std::to_string(c + (t * coreCount_) + (s * smt_ * coreCount_)) + "/topology/physical_package_id"); phys_package_id_file << current_package_id; phys_package_id_file.close(); diff --git a/src/lib/arch/aarch64/Architecture.cc b/src/lib/arch/aarch64/Architecture.cc index 469e45d932..df3227269d 100644 --- a/src/lib/arch/aarch64/Architecture.cc +++ b/src/lib/arch/aarch64/Architecture.cc @@ -7,60 +7,56 @@ namespace simeng { namespace arch { namespace aarch64 { -std::unordered_map Architecture::decodeCache; -std::unordered_map disasmCache; -std::forward_list Architecture::metadataCache; -uint64_t Architecture::SVCRval_; - -Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) - : linux_(kernel), - microDecoder_(std::make_unique(config)), +Architecture::Architecture(kernel::Linux& kernel, ryml::ConstNodeRef config) + : arch::Architecture(kernel), + microDecoder_(std::make_unique()), VL_(config["Core"]["Vector-Length"].as()), SVL_(config["Core"]["Streaming-Vector-Length"].as()), - vctModulo_((config["Core"]["Clock-Frequency"].as() * 1e9) / - (config["Core"]["Timer-Frequency"].as() * 1e6)) { - if (cs_open(CS_ARCH_ARM64, CS_MODE_ARM, &capstoneHandle) != CS_ERR_OK) { + vctModulo_((config["Core"]["Clock-Frequency-GHz"].as() * 1e9) / + (config["Core"]["Timer-Frequency-MHz"].as() * 1e6)) { + if (cs_open(CS_ARCH_AARCH64, CS_MODE_ARM, &capstoneHandle_) != CS_ERR_OK) { std::cerr << "[SimEng:Architecture] Could not create capstone handle" << std::endl; exit(1); } - cs_option(capstoneHandle, CS_OPT_DETAIL, CS_OPT_ON); + cs_option(capstoneHandle_, CS_OPT_DETAIL, CS_OPT_ON); + // This second Capstone option reverses instruction aliases, and instead + // means all operand information is that of the "real" underlying instruction. 
+ cs_option(capstoneHandle_, CS_OPT_DETAIL, CS_OPT_DETAIL_REAL); // Generate zero-indexed system register map - systemRegisterMap_[ARM64_SYSREG_DCZID_EL0] = systemRegisterMap_.size(); - systemRegisterMap_[ARM64_SYSREG_FPCR] = systemRegisterMap_.size(); - systemRegisterMap_[ARM64_SYSREG_FPSR] = systemRegisterMap_.size(); - systemRegisterMap_[ARM64_SYSREG_TPIDR_EL0] = systemRegisterMap_.size(); - systemRegisterMap_[ARM64_SYSREG_MIDR_EL1] = systemRegisterMap_.size(); - systemRegisterMap_[ARM64_SYSREG_CNTVCT_EL0] = systemRegisterMap_.size(); - systemRegisterMap_[ARM64_SYSREG_PMCCNTR_EL0] = systemRegisterMap_.size(); + std::vector sysRegs = config::SimInfo::getSysRegVec(); + for (size_t i = 0; i < sysRegs.size(); i++) { + systemRegisterMap_[sysRegs[i]] = systemRegisterMap_.size(); + } // Get Virtual Counter Timer and Processor Cycle Counter system registers. VCTreg_ = { RegisterType::SYSTEM, - static_cast(getSystemRegisterTag(ARM64_SYSREG_CNTVCT_EL0))}; + static_cast(getSystemRegisterTag(AARCH64_SYSREG_CNTVCT_EL0))}; PCCreg_ = { RegisterType::SYSTEM, - static_cast(getSystemRegisterTag(ARM64_SYSREG_PMCCNTR_EL0))}; + static_cast(getSystemRegisterTag(AARCH64_SYSREG_PMCCNTR_EL0))}; - // Instantiate an ExecutionInfo entry for each group in the InstructionGroup - // namespace. + // Instantiate an ExecutionInfo entry for each group in the + // InstructionGroup namespace. for (int i = 0; i < NUM_GROUPS; i++) { groupExecutionInfo_[i] = {1, 1, {}}; } // Extract execution latency/throughput for each group std::vector inheritanceDistance(NUM_GROUPS, UINT8_MAX); - for (size_t i = 0; i < config["Latencies"].size(); i++) { - YAML::Node port_node = config["Latencies"][i]; + for (size_t i = 0; i < config["Latencies"].num_children(); i++) { + ryml::ConstNodeRef port_node = config["Latencies"][i]; uint16_t latency = port_node["Execution-Latency"].as(); uint16_t throughput = port_node["Execution-Throughput"].as(); - for (size_t j = 0; j < port_node["Instruction-Group"].size(); j++) { - uint16_t group = port_node["Instruction-Group"][j].as(); + for (size_t j = 0; j < port_node["Instruction-Group-Nums"].num_children(); + j++) { + uint16_t group = port_node["Instruction-Group-Nums"][j].as(); groupExecutionInfo_[group].latency = latency; groupExecutionInfo_[group].stallCycles = throughput; - // Set zero inheritance distance for latency assignment as it's explicitly - // defined + // Set zero inheritance distance for latency assignment as it's + // explicitly defined inheritanceDistance[group] = 0; // Add inherited support for those appropriate groups std::queue groups; @@ -69,10 +65,10 @@ Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) uint8_t distance = 1; while (groups.size()) { // Determine if there's any inheritance - if (groupInheritance.find(groups.front()) != groupInheritance.end()) { + if (groupInheritance_.find(groups.front()) != groupInheritance_.end()) { std::vector inheritedGroups = - groupInheritance.at(groups.front()); - for (int k = 0; k < inheritedGroups.size(); k++) { + groupInheritance_.at(groups.front()); + for (size_t k = 0; k < inheritedGroups.size(); k++) { // Determine if this group has inherited latency values from a // smaller distance if (inheritanceDistance[inheritedGroups[k]] > distance) { @@ -88,8 +84,9 @@ Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) } } // Store any opcode-based latency override - for (size_t j = 0; j < port_node["Instruction-Opcode"].size(); j++) { - uint16_t opcode = port_node["Instruction-Opcode"][j].as(); + for (size_t j = 
0; j < port_node["Instruction-Opcodes"].num_children(); + j++) { + uint16_t opcode = port_node["Instruction-Opcodes"][j].as(); opcodeExecutionInfo_[opcode].latency = latency; opcodeExecutionInfo_[opcode].stallCycles = throughput; } @@ -97,25 +94,27 @@ Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) // ports entries in the groupExecutionInfo_ entries only apply for models // using the outoforder core archetype - if (config["Core"]["Simulation-Mode"].as() == "outoforder") { + if (config::SimInfo::getSimMode() == config::SimulationMode::Outoforder) { // Create mapping between instructions groups and the ports that support // them - for (size_t i = 0; i < config["Ports"].size(); i++) { + for (size_t i = 0; i < config["Ports"].num_children(); i++) { // Store which ports support which groups - YAML::Node group_node = config["Ports"][i]["Instruction-Group-Support"]; - for (size_t j = 0; j < group_node.size(); j++) { + ryml::ConstNodeRef group_node = + config["Ports"][i]["Instruction-Group-Support-Nums"]; + for (size_t j = 0; j < group_node.num_children(); j++) { uint16_t group = group_node[j].as(); - uint8_t newPort = static_cast(i); + uint16_t newPort = static_cast(i); groupExecutionInfo_[group].ports.push_back(newPort); // Add inherited support for those appropriate groups std::queue groups; groups.push(group); while (groups.size()) { // Determine if there's any inheritance - if (groupInheritance.find(groups.front()) != groupInheritance.end()) { + if (groupInheritance_.find(groups.front()) != + groupInheritance_.end()) { std::vector inheritedGroups = - groupInheritance.at(groups.front()); - for (int k = 0; k < inheritedGroups.size(); k++) { + groupInheritance_.at(groups.front()); + for (size_t k = 0; k < inheritedGroups.size(); k++) { groupExecutionInfo_[inheritedGroups[k]].ports.push_back(newPort); groups.push(inheritedGroups[k]); } @@ -124,37 +123,32 @@ Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) } } // Store any opcode-based port support override - YAML::Node opcode_node = config["Ports"][i]["Instruction-Opcode-Support"]; - for (size_t j = 0; j < opcode_node.size(); j++) { - // If latency information hasn't been defined, set to zero as to inform - // later access to use group defined latencies instead + ryml::ConstNodeRef opcode_node = + config["Ports"][i]["Instruction-Opcode-Support"]; + for (size_t j = 0; j < opcode_node.num_children(); j++) { + // If latency information hasn't been defined, set to zero as to + // inform later access to use group defined latencies instead uint16_t opcode = opcode_node[j].as(); - opcodeExecutionInfo_.try_emplace( - opcode, simeng::arch::aarch64::ExecutionInfo{0, 0, {}}); + opcodeExecutionInfo_.try_emplace(opcode, ExecutionInfo{0, 0, {}}); opcodeExecutionInfo_[opcode].ports.push_back(static_cast(i)); } } } } -Architecture::~Architecture() { - cs_close(&capstoneHandle); - decodeCache.clear(); - metadataCache.clear(); - groupExecutionInfo_.clear(); - SVCRval_ = 0; -} -uint8_t Architecture::predecode(const void* ptr, uint8_t bytesAvailable, +Architecture::~Architecture() { cs_close(&capstoneHandle_); } + +uint8_t Architecture::predecode(const uint8_t* ptr, uint16_t bytesAvailable, uint64_t instructionAddress, MacroOp& output, std::string& disasm) const { // Check that instruction address is 4-byte aligned as required by Armv9.2-a if (instructionAddress & 0x3) { // Consume 1-byte and raise a misaligned PC exception auto metadata = InstructionMetadata((uint8_t*)ptr, 1); - metadataCache.emplace_front(metadata); + 
metadataCache_.emplace_front(metadata); output.resize(1); auto& uop = output[0]; - uop = std::make_shared(*this, metadataCache.front(), + uop = std::make_shared(*this, metadataCache_.front(), InstructionException::MisalignedPC); uop->setInstructionAddress(instructionAddress); // Return non-zero value to avoid fatal error @@ -170,8 +164,8 @@ uint8_t Architecture::predecode(const void* ptr, uint8_t bytesAvailable, memcpy(&insn, ptr, 4); // Try to find the decoding in the decode cache - auto iter = decodeCache.find(insn); - if (iter == decodeCache.end()) { + auto iter = decodeCache_.find(insn); + if (iter == decodeCache_.end()) { // No decoding present. Generate a fresh decoding, and add to cache cs_insn rawInsn; cs_detail rawDetail; @@ -183,7 +177,7 @@ uint8_t Architecture::predecode(const void* ptr, uint8_t bytesAvailable, const uint8_t* encoding = reinterpret_cast(ptr); bool success = - cs_disasm_iter(capstoneHandle, &encoding, &size, &address, &rawInsn); + cs_disasm_iter(capstoneHandle_, &encoding, &size, &address, &rawInsn); // Construct a disassembly string std::string m(rawInsn.mnemonic); @@ -200,19 +194,19 @@ uint8_t Architecture::predecode(const void* ptr, uint8_t bytesAvailable, success ? InstructionMetadata(rawInsn) : InstructionMetadata(encoding); // Cache the metadata - metadataCache.push_front(metadata); + metadataCache_.push_front(metadata); // Create an instruction using the metadata - Instruction newInsn(*this, metadataCache.front(), MicroOpInfo()); + Instruction newInsn(*this, metadataCache_.front(), MicroOpInfo()); // Set execution information for this instruction newInsn.setExecutionInfo(getExecutionInfo(newInsn)); // Cache the instruction - iter = decodeCache.insert({insn, newInsn}).first; + iter = decodeCache_.insert({insn, newInsn}).first; } // Split instruction into 1 or more defined micro-ops uint8_t num_ops = microDecoder_->decode(*this, iter->first, iter->second, - output, capstoneHandle); + output, capstoneHandle_); // Set instruction address and branch prediction for each micro-op generated for (int i = 0; i < num_ops; i++) { @@ -223,42 +217,6 @@ uint8_t Architecture::predecode(const void* ptr, uint8_t bytesAvailable, return 4; } -ExecutionInfo Architecture::getExecutionInfo(Instruction& insn) const { - // Asusme no opcode-based override - ExecutionInfo exeInfo = groupExecutionInfo_.at(insn.getGroup()); - if (opcodeExecutionInfo_.find(insn.getMetadata().opcode) != - opcodeExecutionInfo_.end()) { - // Replace with overrided values - ExecutionInfo overrideInfo = - opcodeExecutionInfo_.at(insn.getMetadata().opcode); - if (overrideInfo.latency != 0) exeInfo.latency = overrideInfo.latency; - if (overrideInfo.stallCycles != 0) - exeInfo.stallCycles = overrideInfo.stallCycles; - if (overrideInfo.ports.size()) exeInfo.ports = overrideInfo.ports; - } - return exeInfo; -} - -std::shared_ptr Architecture::handleException( - const std::shared_ptr& instruction, const Core& core, - MemoryInterface& memory) const { - return std::make_shared(instruction, core, memory, linux_); -} - -std::vector Architecture::getRegisterFileStructures() - const { - uint16_t numSysRegs = static_cast(systemRegisterMap_.size()); - const uint16_t ZAsize = static_cast(SVL_ / 8); // Convert to bytes - return { - {8, 32}, // General purpose - {256, 32}, // Vector - {32, 17}, // Predicate - {1, 1}, // NZCV - {8, numSysRegs}, // System - {256, ZAsize}, // Matrix (Each row is a register) - }; -} - int32_t Architecture::getSystemRegisterTag(uint16_t reg) const { // Check below is done for speculative 
instructions that may be passed into // the function but will not be executed. If such invalid speculative @@ -267,8 +225,10 @@ int32_t Architecture::getSystemRegisterTag(uint16_t reg) const { return systemRegisterMap_.at(reg); } -uint16_t Architecture::getNumSystemRegisters() const { - return static_cast(systemRegisterMap_.size()); +std::shared_ptr Architecture::handleException( + const std::shared_ptr& instruction, const Core& core, + memory::MemoryInterface& memory) const { + return std::make_shared(instruction, core, memory, linux_); } ProcessStateChange Architecture::getInitialState() const { @@ -286,7 +246,7 @@ ProcessStateChange Architecture::getInitialState() const { // but is disabled due to bit 4 being set changes.modifiedRegisters.push_back( {RegisterType::SYSTEM, - static_cast(getSystemRegisterTag(ARM64_SYSREG_DCZID_EL0))}); + static_cast(getSystemRegisterTag(AARCH64_SYSREG_DCZID_EL0))}); changes.modifiedRegisterValues.push_back(static_cast(0b10100)); return changes; @@ -294,9 +254,7 @@ ProcessStateChange Architecture::getInitialState() const { uint8_t Architecture::getMaxInstructionSize() const { return 4; } -uint64_t Architecture::getVectorLength() const { return VL_; } - -uint64_t Architecture::getStreamingVectorLength() const { return SVL_; } +uint8_t Architecture::getMinInstructionSize() const { return 4; } void Architecture::updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations) const { @@ -308,38 +266,25 @@ void Architecture::updateSystemTimerRegisters(RegisterFileSet* regFile, } } -std::vector -Architecture::getConfigPhysicalRegisterStructure(YAML::Node config) const { - // Matrix-Count multiplied by (SVL/8) as internal representation of - // ZA is a block of row-vector-registers. Therefore we need to - // convert physical counts from whole-ZA to rows-in-ZA. - uint16_t matCount = - config["Register-Set"]["Matrix-Count"].as() * - (config["Core"]["Streaming-Vector-Length"].as() / 8); - return { - {8, config["Register-Set"]["GeneralPurpose-Count"].as()}, - {256, config["Register-Set"]["FloatingPoint/SVE-Count"].as()}, - {32, config["Register-Set"]["Predicate-Count"].as()}, - {1, config["Register-Set"]["Conditional-Count"].as()}, - {8, getNumSystemRegisters()}, - {256, matCount}}; +ExecutionInfo Architecture::getExecutionInfo(const Instruction& insn) const { + // Assume no opcode-based override + ExecutionInfo exeInfo = groupExecutionInfo_.at(insn.getGroup()); + if (opcodeExecutionInfo_.find(insn.getMetadata().opcode) != + opcodeExecutionInfo_.end()) { + // Replace with overrided values + ExecutionInfo overrideInfo = + opcodeExecutionInfo_.at(insn.getMetadata().opcode); + if (overrideInfo.latency != 0) exeInfo.latency = overrideInfo.latency; + if (overrideInfo.stallCycles != 0) + exeInfo.stallCycles = overrideInfo.stallCycles; + if (overrideInfo.ports.size()) exeInfo.ports = overrideInfo.ports; + } + return exeInfo; } -std::vector Architecture::getConfigPhysicalRegisterQuantities( - YAML::Node config) const { - // Matrix-Count multiplied by (SVL/8) as internal representation of - // ZA is a block of row-vector-registers. Therefore we need to - // convert physical counts from whole-ZA to rows-in-ZA. 
- uint16_t matCount = - config["Register-Set"]["Matrix-Count"].as() * - (config["Core"]["Streaming-Vector-Length"].as() / 8); - return {config["Register-Set"]["GeneralPurpose-Count"].as(), - config["Register-Set"]["FloatingPoint/SVE-Count"].as(), - config["Register-Set"]["Predicate-Count"].as(), - config["Register-Set"]["Conditional-Count"].as(), - getNumSystemRegisters(), - matCount}; -} +uint64_t Architecture::getVectorLength() const { return VL_; } + +uint64_t Architecture::getStreamingVectorLength() const { return SVL_; } /** The SVCR value is stored in Architecture to allow the value to be * retrieved within execution pipeline. This prevents adding an implicit @@ -351,6 +296,12 @@ void Architecture::setSVCRval(const uint64_t newVal) const { SVCRval_ = newVal; } +// 0th bit of SVCR register determines if streaming-mode is enabled. +bool Architecture::isStreamingModeEnabled() const { return SVCRval_ & 1; } + +// 1st bit of SVCR register determines if ZA register is enabled. +bool Architecture::isZARegisterEnabled() const { return SVCRval_ & 2; } + } // namespace aarch64 } // namespace arch } // namespace simeng diff --git a/src/lib/arch/aarch64/ExceptionHandler.cc b/src/lib/arch/aarch64/ExceptionHandler.cc index 88f796f495..ae98dddb1a 100644 --- a/src/lib/arch/aarch64/ExceptionHandler.cc +++ b/src/lib/arch/aarch64/ExceptionHandler.cc @@ -15,9 +15,9 @@ namespace aarch64 { ExceptionHandler::ExceptionHandler( const std::shared_ptr& instruction, const Core& core, - MemoryInterface& memory, kernel::Linux& linux_) + memory::MemoryInterface& memory, kernel::Linux& linux_) : instruction_(*static_cast(instruction.get())), - core(core), + core_(core), memory_(memory), linux_(linux_) { resumeHandling_ = [this]() { return init(); }; @@ -27,7 +27,7 @@ bool ExceptionHandler::tick() { return resumeHandling_(); } bool ExceptionHandler::init() { InstructionException exception = instruction_.getException(); - const auto& registerFileSet = core.getArchitecturalRegisterFileSet(); + const auto& registerFileSet = core_.getArchitecturalRegisterFileSet(); if (exception == InstructionException::SupervisorCall) { // Retrieve syscall ID held in register x8 @@ -109,7 +109,7 @@ bool ExceptionHandler::init() { uint64_t count = registerFileSet.get(R2).get(); return readBufferThen(bufPtr, count, [=]() { - int64_t totalRead = linux_.getdents64(fd, dataBuffer.data(), count); + int64_t totalRead = linux_.getdents64(fd, dataBuffer_.data(), count); ProcessStateChange stateChange = { ChangeType::REPLACEMENT, {R0}, {totalRead}}; // Check for failure @@ -117,23 +117,18 @@ bool ExceptionHandler::init() { return concludeSyscall(stateChange); } - int64_t bytesRemaining = totalRead; // Get pointer and size of the buffer uint64_t iDst = bufPtr; - uint64_t iLength = bytesRemaining; - if (iLength > bytesRemaining) { - iLength = bytesRemaining; - } - bytesRemaining -= iLength; // Write data for this buffer in 128-byte chunks - auto iSrc = reinterpret_cast(dataBuffer.data()); - while (iLength > 0) { - uint8_t len = iLength > 128 ? 128 : static_cast(iLength); + auto iSrc = reinterpret_cast(dataBuffer_.data()); + while (totalRead > 0) { + uint8_t len = + totalRead > 128 ? 
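Editor's note: a tiny sketch of the SVCR bit layout that the two new accessors above test, bit 0 gating streaming mode and bit 1 gating the ZA register. The mask names are illustrative; the patch itself uses the literals 1 and 2.

```cpp
#include <cstdint>

constexpr uint64_t SVCR_SM_BIT = 1ull << 0;  // streaming mode enable
constexpr uint64_t SVCR_ZA_BIT = 1ull << 1;  // ZA register enable

constexpr bool streamingModeEnabled(uint64_t svcr) { return svcr & SVCR_SM_BIT; }
constexpr bool zaRegisterEnabled(uint64_t svcr) { return svcr & SVCR_ZA_BIT; }

static_assert(streamingModeEnabled(0b01) && !zaRegisterEnabled(0b01));
static_assert(!streamingModeEnabled(0b10) && zaRegisterEnabled(0b10));
```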
128 : static_cast(totalRead); stateChange.memoryAddresses.push_back({iDst, len}); stateChange.memoryAddressValues.push_back({iSrc, len}); iDst += len; iSrc += len; - iLength -= len; + totalRead -= len; } return concludeSyscall(stateChange); }); @@ -151,7 +146,7 @@ bool ExceptionHandler::init() { uint64_t bufPtr = registerFileSet.get(R1).get(); uint64_t count = registerFileSet.get(R2).get(); return readBufferThen(bufPtr, count, [=]() { - int64_t totalRead = linux_.read(fd, dataBuffer.data(), count); + int64_t totalRead = linux_.read(fd, dataBuffer_.data(), count); ProcessStateChange stateChange = { ChangeType::REPLACEMENT, {R0}, {totalRead}}; // Check for failure @@ -159,17 +154,13 @@ bool ExceptionHandler::init() { return concludeSyscall(stateChange); } - int64_t bytesRemaining = totalRead; // Get pointer and size of the buffer uint64_t iDst = bufPtr; - uint64_t iLength = bytesRemaining; - if (iLength > bytesRemaining) { - iLength = bytesRemaining; - } - bytesRemaining -= iLength; + // totalRead not negative due to above check so cast is safe + uint64_t iLength = static_cast(totalRead); // Write data for this buffer in 128-byte chunks - auto iSrc = reinterpret_cast(dataBuffer.data()); + auto iSrc = reinterpret_cast(dataBuffer_.data()); while (iLength > 0) { uint8_t len = iLength > 128 ? 128 : static_cast(iLength); stateChange.memoryAddresses.push_back({iDst, len}); @@ -186,7 +177,7 @@ bool ExceptionHandler::init() { uint64_t bufPtr = registerFileSet.get(R1).get(); uint64_t count = registerFileSet.get(R2).get(); return readBufferThen(bufPtr, count, [=]() { - int64_t retval = linux_.write(fd, dataBuffer.data(), count); + int64_t retval = linux_.write(fd, dataBuffer_.data(), count); ProcessStateChange stateChange = { ChangeType::REPLACEMENT, {R0}, {retval}}; return concludeSyscall(stateChange); @@ -210,7 +201,7 @@ bool ExceptionHandler::init() { // generates the memory write requests. 
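Editor's note: the `read`/`getdents64` handlers above write the returned data back to simulated memory in chunks of at most 128 bytes; a minimal stand-alone sketch of that chunking, with a plain `(address, length)` pair standing in for the real memory-target type.

```cpp
#include <cstdint>
#include <utility>
#include <vector>

// Split `totalRead` bytes starting at guest address `bufPtr` into
// (address, length) pairs of at most 128 bytes each.
std::vector<std::pair<uint64_t, uint8_t>> chunkWrites(uint64_t bufPtr,
                                                      uint64_t totalRead) {
  std::vector<std::pair<uint64_t, uint8_t>> targets;
  uint64_t addr = bufPtr;
  while (totalRead > 0) {
    uint8_t len = totalRead > 128 ? 128 : static_cast<uint8_t>(totalRead);
    targets.emplace_back(addr, len);
    addr += len;
    totalRead -= len;
  }
  return targets;
}
```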
auto invokeKernel = [=]() { // The iov structure has been read into `dataBuffer` - uint64_t* iovdata = reinterpret_cast(dataBuffer.data()); + uint64_t* iovdata = reinterpret_cast(dataBuffer_.data()); // Allocate buffers to hold the data read by the kernel std::vector> buffers(iovcnt); @@ -236,7 +227,8 @@ bool ExceptionHandler::init() { } // Build list of memory write operations - int64_t bytesRemaining = totalRead; + // totalRead not negative due to above check so cast is safe + uint64_t bytesRemaining = static_cast(totalRead); for (int64_t i = 0; i < iovcnt; i++) { // Get pointer and size of the buffer uint64_t iDst = iovdata[i * 2 + 0]; @@ -281,8 +273,8 @@ bool ExceptionHandler::init() { // Create the final handler in the chain, which invokes the kernel std::function last = [=]() { // Rebuild the iovec structures using pointers to `dataBuffer` data - uint64_t* iovdata = reinterpret_cast(dataBuffer.data()); - uint8_t* bufferPtr = dataBuffer.data() + iovcnt * 16; + uint64_t* iovdata = reinterpret_cast(dataBuffer_.data()); + uint8_t* bufferPtr = dataBuffer_.data() + iovcnt * 16; for (int64_t i = 0; i < iovcnt; i++) { iovdata[i * 2 + 0] = reinterpret_cast(bufferPtr); @@ -292,7 +284,7 @@ bool ExceptionHandler::init() { } // Invoke the kernel - int64_t retval = linux_.writev(fd, dataBuffer.data(), iovcnt); + int64_t retval = linux_.writev(fd, dataBuffer_.data(), iovcnt); ProcessStateChange stateChange = { ChangeType::REPLACEMENT, {R0}, {retval}}; return concludeSyscall(stateChange); @@ -301,7 +293,7 @@ bool ExceptionHandler::init() { // Build the chain of buffer loads backwards through the iov buffers for (int64_t i = iovcnt - 1; i >= 0; i--) { last = [=]() { - uint64_t* iovdata = reinterpret_cast(dataBuffer.data()); + uint64_t* iovdata = reinterpret_cast(dataBuffer_.data()); uint64_t ptr = iovdata[i * 2 + 0]; uint64_t len = iovdata[i * 2 + 1]; return readBufferThen(ptr, len, last); @@ -334,20 +326,21 @@ bool ExceptionHandler::init() { int64_t flag = registerFileSet.get(R3).get(); char* filename = new char[kernel::Linux::LINUX_PATH_MAX]; - return readStringThen( - filename, filenamePtr, kernel::Linux::LINUX_PATH_MAX, - [=](auto length) { - // Invoke the kernel - kernel::stat statOut; - uint64_t retval = linux_.newfstatat(dfd, filename, statOut, flag); - ProcessStateChange stateChange = { - ChangeType::REPLACEMENT, {R0}, {retval}}; - delete[] filename; - stateChange.memoryAddresses.push_back( - {statbufPtr, sizeof(statOut)}); - stateChange.memoryAddressValues.push_back(statOut); - return concludeSyscall(stateChange); - }); + return readStringThen(filename, filenamePtr, + kernel::Linux::LINUX_PATH_MAX, [=](auto length) { + // Invoke the kernel + kernel::stat statOut; + uint64_t retval = linux_.newfstatat( + dfd, filename, statOut, flag); + ProcessStateChange stateChange = { + ChangeType::REPLACEMENT, {R0}, {retval}}; + delete[] filename; + stateChange.memoryAddresses.push_back( + {statbufPtr, sizeof(statOut)}); + stateChange.memoryAddressValues.push_back( + {statOut, sizeof(statOut)}); + return concludeSyscall(stateChange); + }); break; } @@ -397,7 +390,7 @@ bool ExceptionHandler::init() { } case 113: { // clock_gettime uint64_t clkId = registerFileSet.get(R0).get(); - uint64_t systemTimer = core.getSystemTimer(); + uint64_t systemTimer = core_.getSystemTimer(); uint64_t seconds; uint64_t nanoseconds; @@ -431,9 +424,9 @@ bool ExceptionHandler::init() { // Currently, only a single CPU bitmask is supported if (bitmask != 1) { printException(instruction_); - std::cout - << "Unexpected CPU affinity 
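Editor's note: the `writev` handler above builds its sequence of buffer loads backwards, wrapping each `readBufferThen` call in a `std::function` that captures the remainder of the chain. A minimal sketch of that chaining idiom, with printing standing in for the asynchronous reads.

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

int main() {
  std::vector<uint64_t> bufferLengths = {16, 64, 8};

  // Final handler in the chain, which "invokes the kernel".
  std::function<void()> last = [] { std::cout << "invoke kernel\n"; };

  // Build the chain backwards so that buffer 0 is processed first.
  for (int64_t i = static_cast<int64_t>(bufferLengths.size()) - 1; i >= 0; i--) {
    std::function<void()> next = last;  // copy of the remainder of the chain
    last = [i, len = bufferLengths[i], next] {
      std::cout << "read buffer " << i << " (" << len << " bytes)\n";
      next();
    };
  }
  last();  // read buffer 0 -> 1 -> 2 -> invoke kernel
}
```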
mask returned in exception handler" - << std::endl; + std::cout << "\n[SimEng:ExceptionHandler] Unexpected CPU affinity " + "mask returned in exception handler" + << std::endl; return fatal(); } uint64_t retval = (pid == 0) ? 1 : 0; @@ -453,14 +446,14 @@ bool ExceptionHandler::init() { } case 134: { // rt_sigaction // TODO: Implement syscall logic. Ignored for now as it's assumed the - // current use of this syscall is to setup error handlers. Simualted + // current use of this syscall is to setup error handlers. Simulated // code is expected to work so no need for these handlers. stateChange = {ChangeType::REPLACEMENT, {R0}, {0ull}}; break; } case 135: { // rt_sigprocmask // TODO: Implement syscall logic. Ignored for now as it's assumed the - // current use of this syscall is to setup error handlers. Simualted + // current use of this syscall is to setup error handlers. Simulated // code is expected to work so no need for these handlers. stateChange = {ChangeType::REPLACEMENT, {R0}, {0ull}}; break; @@ -502,7 +495,7 @@ bool ExceptionHandler::init() { case 169: { // gettimeofday uint64_t tvPtr = registerFileSet.get(R0).get(); uint64_t tzPtr = registerFileSet.get(R1).get(); - uint64_t systemTimer = core.getSystemTimer(); + uint64_t systemTimer = core_.getSystemTimer(); kernel::timeval tv; kernel::timeval tz; @@ -620,15 +613,16 @@ bool ExceptionHandler::init() { uint64_t bufPtr = registerFileSet.get(R0).get(); size_t buflen = registerFileSet.get(R1).get(); - char buf[buflen]; + std::vector buf; for (size_t i = 0; i < buflen; i++) { - buf[i] = (uint8_t)rand(); + buf.push_back((uint8_t)rand()); } stateChange = {ChangeType::REPLACEMENT, {R0}, {(uint64_t)buflen}}; stateChange.memoryAddresses.push_back({bufPtr, (uint8_t)buflen}); - stateChange.memoryAddressValues.push_back(RegisterValue(buf, buflen)); + stateChange.memoryAddressValues.push_back( + RegisterValue(buf.data(), buflen)); break; } @@ -649,57 +643,86 @@ bool ExceptionHandler::init() { } else if (exception == InstructionException::StreamingModeUpdate || exception == InstructionException::ZAregisterStatusUpdate || exception == InstructionException::SMZAUpdate) { - // Retrieve register file structure from architecture - auto regFileStruct = - instruction_.getArchitecture().getRegisterFileStructures(); + // Get Architecture + const Architecture& arch = instruction_.getArchitecture(); + // Retrieve register file structure from SimInfo + auto regFileStruct = config::SimInfo::getArchRegStruct(); // Retrieve metadata from architecture auto metadata = instruction_.getMetadata(); - // Update SVCR value - const uint64_t svcrBits = static_cast(metadata.operands[0].svcr); - const uint8_t imm = metadata.operands[1].imm; - const uint64_t currSVCR = instruction_.getArchitecture().getSVCRval(); uint64_t newSVCR = 0; - - if (imm == 0) { - // Zero out relevant bits dictated by svcrBits - const uint64_t mask = 0xFFFFFFFFFFFFFFFF ^ svcrBits; - newSVCR = currSVCR & mask; - } else if (imm == 1) { - // Enable relevant bits, dictated by svcrBits - const uint64_t mask = 0xFFFFFFFFFFFFFFFF & svcrBits; - newSVCR = currSVCR | mask; + const uint64_t currSVCR = arch.getSVCRval(); + + // Check if exception was called by AArch64_MSR (msr systemreg, xt) or + // AArch64_MSRpstatesvcrImm1 (msr svcr, #imm) + if (metadata.opcode == Opcode::AArch64_MSR) { + newSVCR = instruction_.getSourceOperands()[0].get(); + } else if (metadata.opcode == Opcode::AArch64_MSRpstatesvcrImm1) { + // Ensure operand metadata is as expected + assert(metadata.operands[0].type == 
AARCH64_OP_SYSALIAS); + assert(metadata.operands[0].sysop.sub_type == AARCH64_OP_SVCR); + // extract SVCR bits + const uint64_t svcrBits = + static_cast(metadata.operands[0].sysop.alias.svcr); + // Ensure SVCR Bits are valid + assert(svcrBits == AARCH64_SVCR_SVCRSM || + svcrBits == AARCH64_SVCR_SVCRZA || + svcrBits == AARCH64_SVCR_SVCRSMZA); + + const uint64_t imm = metadata.operands[1].imm; + assert((imm == 0 || imm == 1) && + "[SimEng:ExceptionHandler] SVCR Instruction invalid - Imm value " + "can only be 0 or 1"); + // Zero out SM & ZA bits as appropriate + newSVCR = currSVCR & ~(svcrBits); + // Update only relevant bits of SVCR + newSVCR = newSVCR | (svcrBits * imm); } else { - // Invalid instruction - assert("SVCR Instruction invalid - Imm value can only be 0 or 1"); + std::cerr << "[SimEng::ExceptionHandler] SVCR system register exception " + "triggered by incorrect instruction. Opcode " + << metadata.opcode << std::endl; + exit(1); } - instruction_.getArchitecture().setSVCRval(newSVCR); + arch.setSVCRval(newSVCR); // Initialise vectors for all registers & values std::vector regs; std::vector regValues; - // Add Vector/Predicate registers + 0 values (zeroed out on Streaming Mode - // context switch) - if (exception != InstructionException::ZAregisterStatusUpdate) { - for (uint16_t i = 0; i < regFileStruct[RegisterType::VECTOR].quantity; - i++) { - regs.push_back({RegisterType::VECTOR, i}); - regValues.push_back(RegisterValue(0, 256)); - if (i < regFileStruct[RegisterType::PREDICATE].quantity) { - regs.push_back({RegisterType::PREDICATE, i}); - regValues.push_back(RegisterValue(0, 32)); + // If SVCR.ZA has changed state then zero out ZA and ZT0 registers + if (exception != InstructionException::StreamingModeUpdate) { + if ((newSVCR & AARCH64_SVCR_SVCRZA) != (currSVCR & AARCH64_SVCR_SVCRZA)) { + for (uint16_t i = 0; i < regFileStruct[RegisterType::MATRIX].quantity; + i++) { + regs.push_back({RegisterType::MATRIX, i}); + regValues.push_back(RegisterValue(0, 256)); } + regs.push_back({RegisterType::TABLE, 0}); + regValues.push_back(RegisterValue(0, 64)); } } - // Zero out ZA register (zeroed out on ZA-reg context switch) - if (exception != InstructionException::StreamingModeUpdate) { - for (uint16_t i = 0; i < regFileStruct[RegisterType::MATRIX].quantity; - i++) { - regs.push_back({RegisterType::MATRIX, i}); - regValues.push_back(RegisterValue(0, 256)); + // If SVCR.SM has changed state then zero out SVE, NEON, Predicate + // registers, else don't + if (exception != InstructionException::ZAregisterStatusUpdate) { + if ((newSVCR & AARCH64_SVCR_SVCRSM) != (currSVCR & AARCH64_SVCR_SVCRSM)) { + for (uint16_t i = 0; i < regFileStruct[RegisterType::VECTOR].quantity; + i++) { + regs.push_back({RegisterType::VECTOR, i}); + regValues.push_back(RegisterValue(0, 256)); + if (i < regFileStruct[RegisterType::PREDICATE].quantity) { + regs.push_back({RegisterType::PREDICATE, i}); + regValues.push_back(RegisterValue(0, 32)); + } + } } } + + // Update SVCR system register in regFile + regs.push_back({RegisterType::SYSTEM, + static_cast( + arch.getSystemRegisterTag(AARCH64_SYSREG_SVCR))}); + regValues.push_back(RegisterValue(newSVCR, 8)); + ProcessStateChange stateChange = {ChangeType::REPLACEMENT, regs, regValues}; return concludeSyscall(stateChange); } @@ -770,7 +793,7 @@ void ExceptionHandler::readLinkAt(span path) { return; } - const auto& registerFileSet = core.getArchitecturalRegisterFileSet(); + const auto& registerFileSet = core_.getArchitecturalRegisterFileSet(); const auto dirfd = 
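Editor's note: a condensed sketch of the SVCR update rule applied above for `msr svcr{sm|za|smza}, #imm`: the selected bits are first cleared, then re-set only when the immediate is 1. The mask constants are illustrative stand-ins for the Capstone `AARCH64_SVCR_*` values.

```cpp
#include <cassert>
#include <cstdint>

constexpr uint64_t SVCR_SM = 0b01;
constexpr uint64_t SVCR_ZA = 0b10;

uint64_t updateSVCR(uint64_t currSVCR, uint64_t svcrBits, uint64_t imm) {
  assert(imm == 0 || imm == 1);
  uint64_t newSVCR = currSVCR & ~svcrBits;  // zero out the selected bits
  newSVCR |= svcrBits * imm;                // re-enable them if imm == 1
  return newSVCR;
}

// e.g. updateSVCR(0b11, SVCR_SM, 0) == 0b10 : SM cleared, ZA left untouched
```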
registerFileSet.get(R0).get(); const auto bufAddress = registerFileSet.get(R2).get(); const auto bufSize = registerFileSet.get(R3).get(); @@ -823,7 +846,7 @@ bool ExceptionHandler::readBufferThen(uint64_t ptr, uint64_t length, auto completedReads = memory_.getCompletedReads(); auto response = std::find_if(completedReads.begin(), completedReads.end(), - [&](const MemoryReadResult& response) { + [&](const memory::MemoryReadResult& response) { return response.requestId == instruction_.getSequenceId(); }); if (response == completedReads.end()) { @@ -834,7 +857,7 @@ bool ExceptionHandler::readBufferThen(uint64_t ptr, uint64_t length, assert(response->data && "unhandled failed read in exception handler"); uint8_t bytesRead = response->target.size; const uint8_t* data = response->data.getAsVector(); - dataBuffer.insert(dataBuffer.end(), data, data + bytesRead); + dataBuffer_.insert(dataBuffer_.end(), data, data + bytesRead); memory_.clearCompletedReads(); // If there is more data, rerun this function for next chunk @@ -860,7 +883,7 @@ void ExceptionHandler::printException(const Instruction& insn) const { std::cout << "[SimEng:ExceptionHandler] Encountered "; switch (exception) { case InstructionException::EncodingUnallocated: - std::cout << "illegal instruction"; + std::cout << "unallocated instruction encoding"; break; case InstructionException::ExecutionNotYetImplemented: std::cout << "execution not-yet-implemented"; @@ -927,6 +950,12 @@ void ExceptionHandler::printException(const Instruction& insn) const { std::cout << std::endl; std::cout << "[SimEng:ExceptionHandler] opcode ID: " << metadata.opcode << std::endl; + + std::string extraInformation = metadata.getExceptionString(); + if (!extraInformation.empty()) { + std::cout << "[SimEng:ExceptionHandler] Extra information: " + << extraInformation << std::endl; + } } bool ExceptionHandler::fatal() { diff --git a/src/lib/arch/aarch64/Instruction.cc b/src/lib/arch/aarch64/Instruction.cc index 4b4a41db2a..755d0235b8 100644 --- a/src/lib/arch/aarch64/Instruction.cc +++ b/src/lib/arch/aarch64/Instruction.cc @@ -8,13 +8,13 @@ namespace simeng { namespace arch { namespace aarch64 { -const Register Instruction::ZERO_REGISTER = {RegisterType::GENERAL, - (uint16_t)-1}; - Instruction::Instruction(const Architecture& architecture, const InstructionMetadata& metadata, MicroOpInfo microOpInfo) - : architecture_(architecture), metadata(metadata) { + : architecture_(architecture), + metadata_(metadata), + exception_(metadata.getMetadataException()) { + exceptionEncountered_ = metadata.getMetadataExceptionEncountered(); isMicroOp_ = microOpInfo.isMicroOp; microOpcode_ = microOpInfo.microOpcode; dataSize_ = microOpInfo.dataSize; @@ -26,53 +26,69 @@ Instruction::Instruction(const Architecture& architecture, Instruction::Instruction(const Architecture& architecture, const InstructionMetadata& metadata, InstructionException exception) - : architecture_(architecture), metadata(metadata) { + : architecture_(architecture), metadata_(metadata) { exception_ = exception; exceptionEncountered_ = true; } -InstructionException Instruction::getException() const { return exception_; } +const span Instruction::getSourceRegisters() const { + return {const_cast(sourceRegisters_.data()), sourceRegisterCount_}; +} -const span Instruction::getOperandRegisters() const { - return {const_cast(sourceRegisters.data()), sourceRegisterCount}; +const span Instruction::getSourceOperands() const { + return {const_cast(sourceValues_.data()), + sourceRegisterCount_}; } + const span 
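Editor's note: `readBufferThen` above matches a completed memory read against the faulting instruction by request id; a small sketch of that lookup, with `MiniReadResult` standing in for `simeng::memory::MemoryReadResult`.

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

struct MiniReadResult {
  uint64_t requestId;
  uint64_t data;
};

// Return the completed read whose requestId matches the instruction's
// sequence id, or nullptr if it has not arrived yet.
const MiniReadResult* findResponse(
    const std::vector<MiniReadResult>& completedReads, uint64_t sequenceId) {
  auto it = std::find_if(completedReads.begin(), completedReads.end(),
                         [&](const MiniReadResult& r) {
                           return r.requestId == sequenceId;
                         });
  return it == completedReads.end() ? nullptr : &*it;
}
```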
Instruction::getDestinationRegisters() const { - return {const_cast(destinationRegisters.data()), - destinationRegisterCount}; -} -bool Instruction::isOperandReady(int index) const { - return static_cast(operands[index]); + return {const_cast(destinationRegisters_.data()), + destinationRegisterCount_}; } -void Instruction::renameSource(uint8_t i, Register renamed) { - sourceRegisters[i] = renamed; +void Instruction::renameSource(uint16_t i, Register renamed) { + sourceRegisters_[i] = renamed; } -void Instruction::renameDestination(uint8_t i, Register renamed) { - destinationRegisters[i] = renamed; + +void Instruction::renameDestination(uint16_t i, Register renamed) { + destinationRegisters_[i] = renamed; } -void Instruction::supplyOperand(uint8_t i, const RegisterValue& value) { +void Instruction::supplyOperand(uint16_t i, const RegisterValue& value) { assert(!canExecute() && "Attempted to provide an operand to a ready-to-execute instruction"); assert(value.size() > 0 && "Attempted to provide an uninitialised RegisterValue"); - operands[i] = value; - operandsPending--; + sourceValues_[i] = value; + sourceOperandsPending_--; +} + +bool Instruction::isOperandReady(int index) const { + return static_cast(sourceValues_[index]); +} + +const span Instruction::getResults() const { + return {const_cast(results_.data()), + destinationRegisterCount_}; +} + +span Instruction::getGeneratedAddresses() + const { + return {memoryAddresses_.data(), memoryAddresses_.size()}; } void Instruction::supplyData(uint64_t address, const RegisterValue& data) { - for (size_t i = 0; i < memoryAddresses.size(); i++) { - if (memoryAddresses[i].address == address && !memoryData[i]) { + for (size_t i = 0; i < memoryAddresses_.size(); i++) { + if (memoryAddresses_[i].address == address && !memoryData_[i]) { if (!data) { // Raise exception for failed read // TODO: Move this logic to caller and distinguish between different // memory faults (e.g. 
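Editor's note: a stripped-down sketch of the operand-supply bookkeeping shown above: each source slot is filled at most once, a pending counter tracks outstanding operands, and the instruction becomes executable when that counter reaches zero. A `(valid, value)` pair stands in for `RegisterValue`.

```cpp
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

struct MiniInstruction {
  std::vector<std::pair<bool, uint64_t>> sourceValues;  // (valid, value)
  uint16_t sourceOperandsPending = 0;

  explicit MiniInstruction(uint16_t numSources)
      : sourceValues(numSources, {false, 0}),
        sourceOperandsPending(numSources) {}

  bool canExecute() const { return sourceOperandsPending == 0; }
  bool isOperandReady(uint16_t i) const { return sourceValues[i].first; }

  void supplyOperand(uint16_t i, uint64_t value) {
    assert(!canExecute() && "instruction already ready to execute");
    sourceValues[i] = {true, value};
    sourceOperandsPending--;
  }
};
```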
bus error, page fault, seg fault) exception_ = InstructionException::DataAbort; exceptionEncountered_ = true; - memoryData[i] = RegisterValue(0, memoryAddresses[i].size); + memoryData_[i] = RegisterValue(0, memoryAddresses_[i].size); } else { - memoryData[i] = data; + memoryData_[i] = data; } dataPending_--; return; @@ -81,95 +97,68 @@ void Instruction::supplyData(uint64_t address, const RegisterValue& data) { } span Instruction::getData() const { - return {memoryData.data(), memoryData.size()}; + return {memoryData_.data(), memoryData_.size()}; } -bool Instruction::canExecute() const { return (operandsPending == 0); } - -const span Instruction::getResults() const { - return {const_cast(results.data()), destinationRegisterCount}; -} - -bool Instruction::isStoreAddress() const { return isStoreAddress_; } -bool Instruction::isStoreData() const { return isStoreData_; } -bool Instruction::isLoad() const { return isLoad_; } -bool Instruction::isBranch() const { return isBranch_; } - -void Instruction::setMemoryAddresses( - const std::vector& addresses) { - memoryData.resize(addresses.size()); - memoryAddresses = addresses; - dataPending_ = addresses.size(); -} +BranchType Instruction::getBranchType() const { return branchType_; } -void Instruction::setMemoryAddresses( - std::vector&& addresses) { - dataPending_ = addresses.size(); - memoryData.resize(addresses.size()); - memoryAddresses = std::move(addresses); -} +int64_t Instruction::getKnownOffset() const { return knownOffset_; } -void Instruction::setMemoryAddresses(MemoryAccessTarget address) { - dataPending_ = 1; - memoryData.resize(1); - memoryAddresses.push_back(address); +bool Instruction::isStoreAddress() const { + return isInstruction(InsnType::isStoreAddress); } -span Instruction::getGeneratedAddresses() const { - return {memoryAddresses.data(), memoryAddresses.size()}; +bool Instruction::isStoreData() const { + return isInstruction(InsnType::isStoreData); } -std::tuple Instruction::checkEarlyBranchMisprediction() const { - assert( - !executed_ && - "Early branch misprediction check shouldn't be called after execution"); - - if (!isBranch()) { - // Instruction isn't a branch; if predicted as taken, it will require a - // flush - return {prediction_.taken, instructionAddress_ + 4}; - } +bool Instruction::isLoad() const { return isInstruction(InsnType::isLoad); } - // Not enough information to determine this was a misprediction - return {false, 0}; -} - -BranchType Instruction::getBranchType() const { return branchType_; } - -int64_t Instruction::getKnownOffset() const { return knownOffset_; } +bool Instruction::isBranch() const { return isInstruction(InsnType::isBranch); } uint16_t Instruction::getGroup() const { // Use identifiers to decide instruction group // Set base uint16_t base = InstructionGroups::INT; - if (isScalarData_) + if (isInstruction(InsnType::isScalarData)) base = InstructionGroups::SCALAR; - else if (isVectorData_) + else if (isInstruction(InsnType::isVectorData)) base = InstructionGroups::VECTOR; - else if (isSVEData_) + else if (isInstruction(InsnType::isSVEData)) base = InstructionGroups::SVE; - else if (isSMEData_) + else if (isInstruction(InsnType::isSMEData)) base = InstructionGroups::SME; - if (isLoad_) return base + 10; - if (isStoreAddress_) return base + 11; - if (isStoreData_) return base + 12; - if (isBranch_) return InstructionGroups::BRANCH; - if (isPredicate_) return InstructionGroups::PREDICATE; - if (isDivideOrSqrt_) return base + 9; - if (isMultiply_) return base + 8; - if (isConvert_) return base 
+ 7; - if (isCompare_) return base + 6; - if (isLogical_) { - if (isNoShift_) return base + 5; - return base + 4; + if (isInstruction(InsnType::isLoad)) return base + 10; + if (isInstruction(InsnType::isStoreAddress)) return base + 11; + if (isInstruction(InsnType::isStoreData)) return base + 12; + if (isInstruction(InsnType::isBranch)) return InstructionGroups::BRANCH; + if (isInstruction(InsnType::isPredicate)) return InstructionGroups::PREDICATE; + if (isInstruction(InsnType::isDivideOrSqrt)) return base + 9; + if (isInstruction(InsnType::isMultiply)) return base + 8; + if (isInstruction(InsnType::isConvert)) return base + 7; + if (isInstruction(InsnType::isCompare)) return base + 6; + if (isInstruction(InsnType::isLogical)) { + if (isInstruction(InsnType::isShift)) return base + 4; + return base + 5; } - if (isNoShift_) return base + 3; - return base + 2; // Default return is {Data type}_SIMPLE_ARTH + if (isInstruction(InsnType::isShift)) return base + 2; + return base + 3; // Default return is {Data type}_SIMPLE_ARTH +} + +bool Instruction::canExecute() const { return (sourceOperandsPending_ == 0); } + +const std::vector& Instruction::getSupportedPorts() { + if (supportedPorts_.size() == 0) { + exception_ = InstructionException::NoAvailablePort; + exceptionEncountered_ = true; + } + return supportedPorts_; } void Instruction::setExecutionInfo(const ExecutionInfo& info) { - if (isLoad_ || isStoreAddress_) { + if (isInstruction(InsnType::isLoad) || + isInstruction(InsnType::isStoreAddress)) { lsqExecutionLatency_ = info.latency; } else { latency_ = info.latency; @@ -177,77 +166,16 @@ void Instruction::setExecutionInfo(const ExecutionInfo& info) { stallCycles_ = info.stallCycles; supportedPorts_ = info.ports; } -const std::vector& Instruction::getSupportedPorts() { - if (supportedPorts_.size() == 0) { - exception_ = InstructionException::NoAvailablePort; - exceptionEncountered_ = true; - } - return supportedPorts_; -} -const InstructionMetadata& Instruction::getMetadata() const { return metadata; } +const InstructionMetadata& Instruction::getMetadata() const { + return metadata_; +} const Architecture& Instruction::getArchitecture() const { return architecture_; } -/** Extend `value` according to `extendType`, and left-shift the result by - * `shift` */ -uint64_t Instruction::extendValue(uint64_t value, uint8_t extendType, - uint8_t shift) const { - if (extendType == ARM64_EXT_INVALID && shift == 0) { - // Special case: an invalid shift type with a shift amount of 0 implies an - // identity operation - return value; - } - - uint64_t extended; - switch (extendType) { - case ARM64_EXT_UXTB: - extended = static_cast(value); - break; - case ARM64_EXT_UXTH: - extended = static_cast(value); - break; - case ARM64_EXT_UXTW: - extended = static_cast(value); - break; - case ARM64_EXT_UXTX: - extended = value; - break; - case ARM64_EXT_SXTB: - extended = static_cast(value); - break; - case ARM64_EXT_SXTH: - extended = static_cast(value); - break; - case ARM64_EXT_SXTW: - extended = static_cast(value); - break; - case ARM64_EXT_SXTX: - extended = value; - break; - default: - assert(false && "Invalid extension type"); - return 0; - } - - return extended << shift; -} - -/** Extend `value` using extension/shifting rules defined in `op`. 
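Editor's note: the diff above replaces the individual boolean members (`isLoad_`, `isBranch_`, ...) with `isInstruction(InsnType::...)` tests; a minimal sketch of that bit-flag pattern. The flag values and struct are illustrative, not SimEng's real encoding.

```cpp
#include <cstdint>

namespace InsnType {
constexpr uint32_t isLoad = 1u << 0;
constexpr uint32_t isStoreAddress = 1u << 1;
constexpr uint32_t isStoreData = 1u << 2;
constexpr uint32_t isBranch = 1u << 3;
}  // namespace InsnType

struct InsnIdentifier {
  uint32_t identifier = 0;

  // Set during decode, once per instruction type that applies.
  void set(uint32_t type) { identifier |= type; }

  // Cheap single-word test replacing many separate booleans.
  bool isInstruction(uint32_t type) const { return identifier & type; }
};
```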
*/ -uint64_t Instruction::extendOffset(uint64_t value, - const cs_arm64_op& op) const { - if (op.ext == 0) { - if (op.shift.value == 0) { - return value; - } - if (op.shift.type == 1) { - return extendValue(value, ARM64_EXT_UXTX, op.shift.value); - } - } - return extendValue(value, op.ext, op.shift.value); -} +InstructionException Instruction::getException() const { return exception_; } } // namespace aarch64 } // namespace arch diff --git a/src/lib/arch/aarch64/InstructionMetadata.cc b/src/lib/arch/aarch64/InstructionMetadata.cc index c2ef75b6b8..56e438a3d8 100644 --- a/src/lib/arch/aarch64/InstructionMetadata.cc +++ b/src/lib/arch/aarch64/InstructionMetadata.cc @@ -13,13 +13,13 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) implicitSourceCount(insn.detail->regs_read_count), implicitDestinationCount(insn.detail->regs_write_count), groupCount(insn.detail->groups_count), - cc(insn.detail->arm64.cc - 1), - setsFlags(insn.detail->arm64.update_flags), - writeback(insn.detail->arm64.writeback), - operandCount(insn.detail->arm64.op_count) { + cc(insn.detail->aarch64.cc), + setsFlags(insn.detail->aarch64.update_flags), + isAlias(insn.is_alias), + operandCount(insn.detail->aarch64.op_count) { std::memcpy(encoding, insn.bytes, sizeof(encoding)); // Copy printed output - std::strncpy(mnemonic, insn.mnemonic, CS_MNEMONIC_SIZE); + mnemonic = std::string(insn.mnemonic); operandStr = std::string(insn.op_str); // Copy register/group/operand information @@ -28,12 +28,20 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) std::memcpy(implicitDestinations, insn.detail->regs_write, sizeof(uint16_t) * implicitDestinationCount); std::memcpy(groups, insn.detail->groups, sizeof(uint8_t) * groupCount); - std::memcpy(operands, insn.detail->arm64.operands, - sizeof(cs_arm64_op) * operandCount); + std::memcpy(operands, insn.detail->aarch64.operands, + sizeof(cs_aarch64_op) * operandCount); // Fix some inaccuracies in the decoded metadata switch (opcode) { - case Opcode::AArch64_ADR_LSL_ZZZ_D_0: + case Opcode::AArch64_FMOVXDHighr: // Example bytecode - 4100af9e + // FMOVXDHighr incorrectly flags destination as WRITE only + operands[0].access = CS_AC_READ | CS_AC_WRITE; + break; + case Opcode::AArch64_FCVTNv4i32: // Example bytecode - 0168614e + // Wrong access type for destination operand + operands[0].access = CS_AC_WRITE; + break; + case Opcode::AArch64_ADR_LSL_ZZZ_D_0: // example bytecode = c8a0e704 [[fallthrough]]; case Opcode::AArch64_ADR_LSL_ZZZ_D_1: [[fallthrough]]; @@ -48,1562 +56,242 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) case Opcode::AArch64_ADR_LSL_ZZZ_S_2: [[fallthrough]]; case Opcode::AArch64_ADR_LSL_ZZZ_S_3: { - // No defined access types + // Change the last 2 Z-regs from one MEM operand to two REG operands operandCount = 3; - operands[0].access = CS_AC_WRITE; + operands[1].type = AARCH64_OP_REG; operands[1].access = CS_AC_READ; + operands[1].reg = operands[1].mem.base; + operands[1].vas = operands[1].vas; + operands[2].type = AARCH64_OP_REG; operands[2].access = CS_AC_READ; - operands[2].type = ARM64_OP_REG; + operands[2].reg = operands[1].mem.index; + operands[2].vas = operands[1].vas; + operands[2].shift = operands[1].shift; break; } - case Opcode::AArch64_SMIN_ZPmZ_S: - [[fallthrough]]; - case Opcode::AArch64_EOR_ZPmZ_B: - [[fallthrough]]; - case Opcode::AArch64_EOR_ZPmZ_D: - [[fallthrough]]; - case Opcode::AArch64_EOR_ZPmZ_H: - [[fallthrough]]; - case Opcode::AArch64_EOR_ZPmZ_S: - [[fallthrough]]; - case Opcode::AArch64_AND_ZPmZ_B: - 
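Editor's note: the metadata constructor above patches operand access flags that Capstone reports incorrectly, keyed on opcode. A hypothetical, trimmed-down illustration of that fix-up pass; the enum, struct, and opcode names are placeholders rather than the real Capstone/SimEng definitions.

```cpp
#include <cstdint>

enum Access : uint8_t { AC_READ = 1, AC_WRITE = 2 };

struct Operand {
  uint8_t access = 0;
};

enum class FixupOpcode { FMOVXDHighr, FCVTNv4i32, Other };

// Correct per-opcode operand access flags after decode.
void fixOperandAccess(FixupOpcode opcode, Operand* operands) {
  switch (opcode) {
    case FixupOpcode::FMOVXDHighr:
      // Destination is really read-modify-write, not write-only
      operands[0].access = AC_READ | AC_WRITE;
      break;
    case FixupOpcode::FCVTNv4i32:
      // Destination access type reported incorrectly
      operands[0].access = AC_WRITE;
      break;
    default:
      break;
  }
}
```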
[[fallthrough]]; - case Opcode::AArch64_AND_ZPmZ_D: - [[fallthrough]]; - case Opcode::AArch64_AND_ZPmZ_H: - [[fallthrough]]; - case Opcode::AArch64_AND_ZPmZ_S: - // No defined access types - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - operands[3].access = CS_AC_READ; - break; - case Opcode::AArch64_BICv4i32: - // BIC incorrectly flags destination as WRITE only - operands[0].access = CS_AC_WRITE | CS_AC_READ; - break; - case Opcode::AArch64_ADDSWri: - // adds incorrectly flags destination as READ - operands[0].access = CS_AC_WRITE; - break; - case Opcode::AArch64_BICv8i16: - operands[0].access = CS_AC_WRITE | CS_AC_READ; - operands[1].access = CS_AC_READ; - break; - case Opcode::AArch64_BICv8i8: - // access specifier for last operand was missing - operands[2].access = CS_AC_READ; - break; - case Opcode::AArch64_CASALW: + case Opcode::AArch64_CASALW: // Example bytecode - 02fce188 [[fallthrough]]; case Opcode::AArch64_CASALX: - operandCount = 3; - operands[0].access = CS_AC_READ; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - break; - case Opcode::AArch64_CBNZW: - [[fallthrough]]; - case Opcode::AArch64_CBNZX: - [[fallthrough]]; - case Opcode::AArch64_CBZW: - [[fallthrough]]; - case Opcode::AArch64_CBZX: - // incorrectly adds implicit nzcv dependency - implicitSourceCount = 0; - break; - case Opcode::AArch64_CMPHI_PPzZZ_B: - [[fallthrough]]; - case Opcode::AArch64_CMPHI_PPzZZ_D: - [[fallthrough]]; - case Opcode::AArch64_CMPHI_PPzZZ_H: - [[fallthrough]]; - case Opcode::AArch64_CMPHI_PPzZZ_S: - [[fallthrough]]; - case Opcode::AArch64_CMPGT_PPzZZ_B: - [[fallthrough]]; - case Opcode::AArch64_CMPGT_PPzZZ_D: - [[fallthrough]]; - case Opcode::AArch64_CMPGT_PPzZZ_H: - [[fallthrough]]; - case Opcode::AArch64_CMPGT_PPzZZ_S: - [[fallthrough]]; - case Opcode::AArch64_CMPEQ_PPzZI_B: - [[fallthrough]]; - case Opcode::AArch64_CMPEQ_PPzZI_D: - [[fallthrough]]; - case Opcode::AArch64_CMPEQ_PPzZI_H: - [[fallthrough]]; - case Opcode::AArch64_CMPEQ_PPzZI_S: - [[fallthrough]]; - case Opcode::AArch64_CMPEQ_PPzZZ_B: - [[fallthrough]]; - case Opcode::AArch64_CMPEQ_PPzZZ_D: - [[fallthrough]]; - case Opcode::AArch64_CMPEQ_PPzZZ_H: - [[fallthrough]]; - case Opcode::AArch64_CMPEQ_PPzZZ_S: - [[fallthrough]]; - case Opcode::AArch64_CMPNE_PPzZZ_B: - [[fallthrough]]; - case Opcode::AArch64_CMPNE_PPzZZ_D: - [[fallthrough]]; - case Opcode::AArch64_CMPNE_PPzZZ_H: - [[fallthrough]]; - case Opcode::AArch64_CMPNE_PPzZZ_S: - [[fallthrough]]; - case Opcode::AArch64_CMPNE_PPzZI_B: - [[fallthrough]]; - case Opcode::AArch64_CMPNE_PPzZI_D: - [[fallthrough]]; - case Opcode::AArch64_CMPNE_PPzZI_H: - [[fallthrough]]; - case Opcode::AArch64_CMPNE_PPzZI_S: - // No defined access types - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - operands[3].access = CS_AC_READ; - // Doesn't identify implicit NZCV destination - implicitDestinationCount = 1; - implicitDestinations[0] = ARM64_REG_NZCV; + // Remove implicit destination (MEM base reg) + implicitDestinationCount = 0; break; - case Opcode::AArch64_CNTB_XPiI: + case Opcode::AArch64_ADD_ZI_B: // Example bytecode - 00c12025 [[fallthrough]]; - case Opcode::AArch64_CNTH_XPiI: + case Opcode::AArch64_ADD_ZI_D: [[fallthrough]]; - case Opcode::AArch64_CNTD_XPiI: + case Opcode::AArch64_ADD_ZI_H: [[fallthrough]]; - case Opcode::AArch64_CNTW_XPiI: { - // lacking access specifiers for destination - operands[0].access = CS_AC_WRITE; - if (operandStr.length() < 4) 
{ - operandCount = 2; - operands[1].type = ARM64_OP_IMM; - operands[1].imm = 1; - operands[1].access = CS_AC_READ; - operands[1].shift = {ARM64_SFT_INVALID, 0}; - operands[1].ext = ARM64_EXT_INVALID; - operands[1].vector_index = -1; - } - break; - } - case Opcode::AArch64_AND_ZI: { + case Opcode::AArch64_ADD_ZI_S: { + // Incorrect access types operands[0].access = CS_AC_WRITE; operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - operands[2].type = ARM64_OP_IMM; - - char specifier = operandStr[operandStr.find(".") + 1]; - switch (specifier) { - case 'b': { - uint8_t mask = static_cast(operands[2].imm); - operands[2].imm = static_cast(0); - for (int i = 0; i < 8; i++) - operands[2].imm |= (static_cast(mask) << (i * 8)); - break; - } - case 'h': { - uint16_t mask = static_cast(operands[2].imm); - operands[2].imm = static_cast(0); - for (int i = 0; i < 4; i++) - operands[2].imm |= (static_cast(mask) << (i * 16)); - break; - } - case 's': { - uint32_t mask = static_cast(operands[2].imm); - operands[2].imm = static_cast(0); - for (int i = 0; i < 2; i++) - operands[2].imm |= (static_cast(mask) << (i * 32)); - break; - } - default: - break; - } break; } - case Opcode::AArch64_CNTP_XPP_B: + case Opcode::AArch64_SMAX_ZI_B: [[fallthrough]]; - case Opcode::AArch64_CNTP_XPP_D: + case Opcode::AArch64_SMAX_ZI_D: [[fallthrough]]; - case Opcode::AArch64_CNTP_XPP_H: + case Opcode::AArch64_SMAX_ZI_H: [[fallthrough]]; - case Opcode::AArch64_CNTP_XPP_S: - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - break; - case Opcode::AArch64_DECD_XPiI: + case Opcode::AArch64_SMAX_ZI_S: // Example bytecode - 03c0a825 [[fallthrough]]; - case Opcode::AArch64_DECB_XPiI: { - // lacking access specifiers for destination - operands[0].access = CS_AC_READ | CS_AC_WRITE; - std::string str(operandStr); - if (str.length() < 4) { - operandCount = 2; - operands[1].type = ARM64_OP_IMM; - operands[1].imm = 1; - operands[1].access = CS_AC_READ; - operands[1].shift = {ARM64_SFT_INVALID, 0}; - operands[1].ext = ARM64_EXT_INVALID; - operands[1].vector_index = -1; - } - break; - } - case Opcode::AArch64_EOR_PPzPP: { + case Opcode::AArch64_AND_ZI: // Example bytecode - 00068005 + // Incorrect access types operands[0].access = CS_AC_WRITE; operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - operands[3].access = CS_AC_READ; break; - } - case Opcode::AArch64_FMOVXDHighr: - // FMOVXDHighr incorrectly flags destination as only WRITE - operands[0].access = CS_AC_READ | CS_AC_WRITE; - break; - case Opcode::AArch64_FNMSB_ZPmZZ_D: - [[fallthrough]]; - case Opcode::AArch64_FNMSB_ZPmZZ_S: - [[fallthrough]]; - case Opcode::AArch64_FNMLS_ZPmZZ_D: - [[fallthrough]]; - case Opcode::AArch64_FNMLS_ZPmZZ_S: - [[fallthrough]]; - case Opcode::AArch64_FMAD_ZPmZZ_D: - [[fallthrough]]; - case Opcode::AArch64_FMAD_ZPmZZ_S: - [[fallthrough]]; - case Opcode::AArch64_FMLA_ZPmZZ_D: - [[fallthrough]]; - case Opcode::AArch64_FMLA_ZPmZZ_S: - [[fallthrough]]; - case Opcode::AArch64_FMLS_ZPmZZ_D: + case Opcode::AArch64_FSUB_ZPmI_D: [[fallthrough]]; - case Opcode::AArch64_FMLS_ZPmZZ_S: + case Opcode::AArch64_FSUB_ZPmI_H: [[fallthrough]]; - case Opcode::AArch64_FMSB_ZPmZZ_D: + case Opcode::AArch64_FSUB_ZPmI_S: // Example bytecode - 00849965 [[fallthrough]]; - case Opcode::AArch64_FMSB_ZPmZZ_S: + case Opcode::AArch64_FMUL_ZPmI_D: [[fallthrough]]; - case Opcode::AArch64_MLA_ZPmZZ_B: + case Opcode::AArch64_FMUL_ZPmI_H: [[fallthrough]]; - case Opcode::AArch64_MLA_ZPmZZ_D: + case 
Opcode::AArch64_FMUL_ZPmI_S: // Example bytecode - 00809a65 [[fallthrough]]; - case Opcode::AArch64_MLA_ZPmZZ_H: + case Opcode::AArch64_FADD_ZPmI_D: // Example bytecode - 0584d865 [[fallthrough]]; - case Opcode::AArch64_MLA_ZPmZZ_S: + case Opcode::AArch64_FADD_ZPmI_H: [[fallthrough]]; - case Opcode::AArch64_SMAX_ZPmZ_S: - // No defined access types - operands[0].access = CS_AC_READ | CS_AC_WRITE; + case Opcode::AArch64_FADD_ZPmI_S: { + // Incorrect access types + operands[0].access = CS_AC_WRITE; operands[1].access = CS_AC_READ; operands[2].access = CS_AC_READ; - operands[3].access = CS_AC_READ; break; - case Opcode::AArch64_ADDPL_XXI: - [[fallthrough]]; - case Opcode::AArch64_ADDVL_XXI: - [[fallthrough]]; - case Opcode::AArch64_UADDV_VPZ_B: - [[fallthrough]]; - case Opcode::AArch64_UADDV_VPZ_D: - [[fallthrough]]; - case Opcode::AArch64_UADDV_VPZ_H: - [[fallthrough]]; - case Opcode::AArch64_UADDV_VPZ_S: - [[fallthrough]]; - case Opcode::AArch64_MOVPRFX_ZPzZ_D: - [[fallthrough]]; - case Opcode::AArch64_MOVPRFX_ZPzZ_S: - [[fallthrough]]; - case Opcode::AArch64_SUB_ZZZ_B: - [[fallthrough]]; - case Opcode::AArch64_SUB_ZZZ_H: - [[fallthrough]]; - case Opcode::AArch64_SUB_ZZZ_S: - [[fallthrough]]; - case Opcode::AArch64_SUB_ZZZ_D: - [[fallthrough]]; - case Opcode::AArch64_INDEX_II_B: - [[fallthrough]]; - case Opcode::AArch64_INDEX_II_H: - [[fallthrough]]; - case Opcode::AArch64_INDEX_II_S: - [[fallthrough]]; - case Opcode::AArch64_INDEX_II_D: + } + case Opcode::AArch64_AND_ZPmZ_D: // Example bytecode - 4901da04 [[fallthrough]]; - case Opcode::AArch64_INDEX_IR_B: + case Opcode::AArch64_AND_ZPmZ_H: [[fallthrough]]; - case Opcode::AArch64_INDEX_IR_D: + case Opcode::AArch64_AND_ZPmZ_S: [[fallthrough]]; - case Opcode::AArch64_INDEX_IR_H: + case Opcode::AArch64_AND_ZPmZ_B: [[fallthrough]]; - case Opcode::AArch64_INDEX_IR_S: + case Opcode::AArch64_SMULH_ZPmZ_B: // Example bytecode - 20001204 [[fallthrough]]; - case Opcode::AArch64_INDEX_RI_B: + case Opcode::AArch64_SMULH_ZPmZ_D: [[fallthrough]]; - case Opcode::AArch64_INDEX_RI_D: + case Opcode::AArch64_SMULH_ZPmZ_H: [[fallthrough]]; - case Opcode::AArch64_INDEX_RI_H: + case Opcode::AArch64_SMULH_ZPmZ_S: [[fallthrough]]; - case Opcode::AArch64_INDEX_RI_S: + case Opcode::AArch64_SMIN_ZPmZ_B: [[fallthrough]]; - case Opcode::AArch64_INDEX_RR_B: + case Opcode::AArch64_SMIN_ZPmZ_D: [[fallthrough]]; - case Opcode::AArch64_INDEX_RR_D: + case Opcode::AArch64_SMIN_ZPmZ_H: [[fallthrough]]; - case Opcode::AArch64_INDEX_RR_H: + case Opcode::AArch64_SMIN_ZPmZ_S: // Example bytecode - 01008a04 [[fallthrough]]; - case Opcode::AArch64_INDEX_RR_S: + case Opcode::AArch64_SMAX_ZPmZ_B: [[fallthrough]]; - case Opcode::AArch64_ADD_ZZZ_B: + case Opcode::AArch64_SMAX_ZPmZ_D: [[fallthrough]]; - case Opcode::AArch64_ADD_ZZZ_D: + case Opcode::AArch64_SMAX_ZPmZ_H: [[fallthrough]]; - case Opcode::AArch64_ADD_ZZZ_H: + case Opcode::AArch64_SMAX_ZPmZ_S: // Example bytecode - 01008804 [[fallthrough]]; - case Opcode::AArch64_ADD_ZZZ_S: + case Opcode::AArch64_MUL_ZPmZ_B: // Example bytecode - 40001004 [[fallthrough]]; - case Opcode::AArch64_FADD_ZZZ_D: + case Opcode::AArch64_MUL_ZPmZ_D: [[fallthrough]]; - case Opcode::AArch64_FADD_ZZZ_S: + case Opcode::AArch64_MUL_ZPmZ_H: [[fallthrough]]; - case Opcode::AArch64_FSUB_ZZZ_D: + case Opcode::AArch64_MUL_ZPmZ_S: [[fallthrough]]; - case Opcode::AArch64_FSUB_ZZZ_S: + case Opcode::AArch64_FSUBR_ZPmZ_D: [[fallthrough]]; - case Opcode::AArch64_FMUL_ZZZ_D: + case Opcode::AArch64_FSUBR_ZPmZ_H: [[fallthrough]]; - case Opcode::AArch64_FMUL_ZZZ_S: + 
case Opcode::AArch64_FSUBR_ZPmZ_S: // Example bytecode - 24808365 [[fallthrough]]; - case Opcode::AArch64_SMAX_ZI_S: + case Opcode::AArch64_FSUB_ZPmZ_D: [[fallthrough]]; - case Opcode::AArch64_SMINV_VPZ_S: + case Opcode::AArch64_FSUB_ZPmZ_H: [[fallthrough]]; - case Opcode::AArch64_TRN1_ZZZ_B: + case Opcode::AArch64_FSUB_ZPmZ_S: // Example bytecode - 24808165 [[fallthrough]]; - case Opcode::AArch64_TRN1_ZZZ_D: + case Opcode::AArch64_FMUL_ZPmZ_D: [[fallthrough]]; - case Opcode::AArch64_TRN1_ZZZ_H: + case Opcode::AArch64_FMUL_ZPmZ_H: [[fallthrough]]; - case Opcode::AArch64_TRN1_ZZZ_S: + case Opcode::AArch64_FMUL_ZPmZ_S: // Example bytecode - 83808265 [[fallthrough]]; - case Opcode::AArch64_TRN2_ZZZ_B: + case Opcode::AArch64_FDIV_ZPmZ_D: // Example bytecode - 0184cd65 [[fallthrough]]; - case Opcode::AArch64_TRN2_ZZZ_D: + case Opcode::AArch64_FDIV_ZPmZ_H: [[fallthrough]]; - case Opcode::AArch64_TRN2_ZZZ_H: + case Opcode::AArch64_FDIV_ZPmZ_S: [[fallthrough]]; - case Opcode::AArch64_TRN2_ZZZ_S: + case Opcode::AArch64_FDIVR_ZPmZ_D: // Example bytecode - 0184cc65 [[fallthrough]]; - case Opcode::AArch64_UZP1_ZZZ_S: - // No defined access types - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - break; - case Opcode::AArch64_MOVPRFX_ZPmZ_D: + case Opcode::AArch64_FDIVR_ZPmZ_H: [[fallthrough]]; - case Opcode::AArch64_FCPY_ZPmI_D: + case Opcode::AArch64_FDIVR_ZPmZ_S: [[fallthrough]]; - case Opcode::AArch64_FCPY_ZPmI_S: + case Opcode::AArch64_FADDA_VPZ_D: [[fallthrough]]; - case Opcode::AArch64_FNEG_ZPmZ_D: + case Opcode::AArch64_FADDA_VPZ_H: [[fallthrough]]; - case Opcode::AArch64_FNEG_ZPmZ_S: + case Opcode::AArch64_FADDA_VPZ_S: // Example bytecode - 01249865 [[fallthrough]]; - case Opcode::AArch64_FRINTN_ZPmZ_D: + case Opcode::AArch64_FADD_ZPmZ_D: // Example bytecode - 6480c065 [[fallthrough]]; - case Opcode::AArch64_FRINTN_ZPmZ_S: + case Opcode::AArch64_FADD_ZPmZ_H: [[fallthrough]]; - case Opcode::AArch64_FABS_ZPmZ_D: + case Opcode::AArch64_FADD_ZPmZ_S: [[fallthrough]]; - case Opcode::AArch64_FABS_ZPmZ_S: + case Opcode::AArch64_FCADD_ZPmZ_D: // Example bytecode - 2080c064 [[fallthrough]]; - case Opcode::AArch64_FSQRT_ZPmZ_S: + case Opcode::AArch64_FCADD_ZPmZ_H: [[fallthrough]]; - case Opcode::AArch64_FSQRT_ZPmZ_D: + case Opcode::AArch64_FCADD_ZPmZ_S: [[fallthrough]]; - case Opcode::AArch64_FCVTZS_ZPmZ_DtoD: + case Opcode::AArch64_ADD_ZPmZ_B: // Example bytecode - 00000004 [[fallthrough]]; - case Opcode::AArch64_FCVTZS_ZPmZ_StoD: + case Opcode::AArch64_ADD_ZPmZ_D: [[fallthrough]]; - case Opcode::AArch64_FCVTZS_ZPmZ_StoS: + case Opcode::AArch64_ADD_ZPmZ_H: [[fallthrough]]; - case Opcode::AArch64_FCVTZS_ZPmZ_DtoS: - // No defined access types - operands[0].access = CS_AC_READ | CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - break; - case Opcode::AArch64_FMUL_ZPmI_D: + case Opcode::AArch64_ADD_ZPmZ_S: [[fallthrough]]; - case Opcode::AArch64_FMUL_ZPmI_S: { - // No defined access types - operandCount = 4; - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - operands[3].type = ARM64_OP_FP; - operands[3].access = CS_AC_READ; - // Doesn't recognise immediate operands - // Extract two possible values, 0.5 or 2.0 - if (operandStr.substr(operandStr.length() - 1, 1) == "5") { - operands[3].fp = 0.5f; - } else { - operands[3].fp = 2.0f; - } - - break; - } - case Opcode::AArch64_FCMLA_ZPmZZ_D: { - // No defined access types - operands[0].access = CS_AC_READ | 
CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - operands[3].access = CS_AC_READ; - operands[4].access = CS_AC_READ; - operands[4].type = ARM64_OP_IMM; - break; - } - case Opcode::AArch64_FCADD_ZPmZ_D: { - // No defined access types - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - operands[3].access = CS_AC_READ; - operands[4].access = CS_AC_READ; - operands[4].type = ARM64_OP_IMM; - break; - } - case Opcode::AArch64_FSUB_ZPmI_D: + case Opcode::AArch64_EOR_ZPmZ_B: // Example bytecode - 20001904 [[fallthrough]]; - case Opcode::AArch64_FSUB_ZPmI_S: + case Opcode::AArch64_EOR_ZPmZ_D: [[fallthrough]]; - case Opcode::AArch64_FADD_ZPmI_D: + case Opcode::AArch64_EOR_ZPmZ_H: [[fallthrough]]; - case Opcode::AArch64_FADD_ZPmI_S: - // No defined access types - operandCount = 4; + case Opcode::AArch64_EOR_ZPmZ_S: + // Incorrect access types operands[0].access = CS_AC_WRITE; operands[1].access = CS_AC_READ; operands[2].access = CS_AC_READ; - operands[3].type = ARM64_OP_FP; operands[3].access = CS_AC_READ; - // Doesn't recognise immediate operands - // Extract two possible values, 0.5 or 1.0 - if (operandStr.substr(operandStr.length() - 1, 1) == "5") { - operands[3].fp = 0.5f; - } else { - operands[3].fp = 1.0f; - } - break; - case Opcode::AArch64_FCMGT_PPzZ0_D: - [[fallthrough]]; - case Opcode::AArch64_FCMGT_PPzZ0_S: { - // No defined access types - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - break; - } - case Opcode::AArch64_FMLA_ZZZI_D: - [[fallthrough]]; - case Opcode::AArch64_FMLA_ZZZI_S: { - // Need to define missing access types - operands[0].access = CS_AC_READ | CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; break; - } - case Opcode::AArch64_FDIVR_ZPmZ_D: - [[fallthrough]]; - case Opcode::AArch64_FDIVR_ZPmZ_S: - [[fallthrough]]; - case Opcode::AArch64_FDIV_ZPmZ_D: - [[fallthrough]]; - case Opcode::AArch64_AND_PPzPP: - [[fallthrough]]; - case Opcode::AArch64_ADD_ZPmZ_B: - [[fallthrough]]; - case Opcode::AArch64_ADD_ZPmZ_D: - [[fallthrough]]; - case Opcode::AArch64_ADD_ZPmZ_H: - [[fallthrough]]; - case Opcode::AArch64_ADD_ZPmZ_S: - [[fallthrough]]; - case Opcode::AArch64_FADD_ZPmZ_D: - [[fallthrough]]; - case Opcode::AArch64_FADD_ZPmZ_S: - [[fallthrough]]; - case Opcode::AArch64_FCMGE_PPzZZ_D: - [[fallthrough]]; - case Opcode::AArch64_FCMGE_PPzZZ_S: - [[fallthrough]]; - case Opcode::AArch64_FCMGE_PPzZ0_D: - [[fallthrough]]; - case Opcode::AArch64_FCMGE_PPzZ0_S: - [[fallthrough]]; - case Opcode::AArch64_FCMGT_PPzZZ_D: - [[fallthrough]]; - case Opcode::AArch64_FCMGT_PPzZZ_S: - [[fallthrough]]; - case Opcode::AArch64_FCMLE_PPzZ0_D: - [[fallthrough]]; - case Opcode::AArch64_FCMLE_PPzZ0_S: - [[fallthrough]]; - case Opcode::AArch64_FCMLT_PPzZ0_S: - [[fallthrough]]; - case Opcode::AArch64_FMUL_ZPmZ_D: - [[fallthrough]]; - case Opcode::AArch64_FMUL_ZPmZ_S: - [[fallthrough]]; - case Opcode::AArch64_FSUBR_ZPmZ_D: - [[fallthrough]]; - case Opcode::AArch64_FSUBR_ZPmZ_S: - [[fallthrough]]; - case Opcode::AArch64_FSUB_ZPmZ_D: - [[fallthrough]]; - case Opcode::AArch64_FSUB_ZPmZ_S: - [[fallthrough]]; - case Opcode::AArch64_FADDA_VPZ_S: - [[fallthrough]]; - case Opcode::AArch64_MUL_ZPmZ_B: + + case Opcode::AArch64_INSERT_MXIPZ_H_B: [[fallthrough]]; - case Opcode::AArch64_MUL_ZPmZ_D: + case Opcode::AArch64_INSERT_MXIPZ_H_D: [[fallthrough]]; - case Opcode::AArch64_MUL_ZPmZ_H: + case Opcode::AArch64_INSERT_MXIPZ_H_H: 
[[fallthrough]]; - case Opcode::AArch64_MUL_ZPmZ_S: + case Opcode::AArch64_INSERT_MXIPZ_H_Q: [[fallthrough]]; - case Opcode::AArch64_ORR_PPzPP: + case Opcode::AArch64_INSERT_MXIPZ_H_S: [[fallthrough]]; - case Opcode::AArch64_SMULH_ZPmZ_B: + case Opcode::AArch64_INSERT_MXIPZ_V_B: [[fallthrough]]; - case Opcode::AArch64_SMULH_ZPmZ_H: + case Opcode::AArch64_INSERT_MXIPZ_V_D: [[fallthrough]]; - case Opcode::AArch64_SMULH_ZPmZ_S: + case Opcode::AArch64_INSERT_MXIPZ_V_H: [[fallthrough]]; - case Opcode::AArch64_SEL_ZPZZ_D: + case Opcode::AArch64_INSERT_MXIPZ_V_Q: [[fallthrough]]; - case Opcode::AArch64_SEL_ZPZZ_S: - // No defined access types + case Opcode::AArch64_INSERT_MXIPZ_V_S: + // Need to add access specifiers + // although operands[0] should be READ | WRITE, due to the implemented + // decode logic for SME tile destinations, the register will be added as + // both source and destination with just WRITE access. operands[0].access = CS_AC_WRITE; operands[1].access = CS_AC_READ; operands[2].access = CS_AC_READ; - operands[3].access = CS_AC_READ; break; - case Opcode::AArch64_FRINTPDr: - [[fallthrough]]; - case Opcode::AArch64_FRINTPSr: - [[fallthrough]]; - case Opcode::AArch64_FDUP_ZI_D: - [[fallthrough]]; - case Opcode::AArch64_FDUP_ZI_S: - [[fallthrough]]; - case Opcode::AArch64_PUNPKHI_PP: - [[fallthrough]]; - case Opcode::AArch64_PUNPKLO_PP: - [[fallthrough]]; - case Opcode::AArch64_RDVLI_XI: - // No defined access types + case Opcode::AArch64_LDR_ZA: + // Need to add access specifier + // although operands[0] should be READ | WRITE, due to the implemented + // decode logic for SME tile destinations, the register will be added as + // both source and destination with just WRITE access. operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - break; - case Opcode::AArch64_INCB_XPiI: - [[fallthrough]]; - case Opcode::AArch64_INCD_XPiI: - [[fallthrough]]; - case Opcode::AArch64_INCH_XPiI: - [[fallthrough]]; - case Opcode::AArch64_INCW_XPiI: { - // lacking access specifiers for destination - operands[0].access = CS_AC_READ | CS_AC_WRITE; - if (operandStr.length() < 4) { - operandCount = 2; - operands[1].type = ARM64_OP_IMM; - operands[1].imm = 1; - operands[1].access = CS_AC_READ; - operands[1].shift = {ARM64_SFT_INVALID, 0}; - operands[1].ext = ARM64_EXT_INVALID; - operands[1].vector_index = -1; - } break; - } - case Opcode::AArch64_INCD_ZPiI: - [[fallthrough]]; - case Opcode::AArch64_INCH_ZPiI: - [[fallthrough]]; - case Opcode::AArch64_INCW_ZPiI: { - // lacking access specifiers for destination - operands[0].access = CS_AC_READ | CS_AC_WRITE; - if (operandStr.length() < 6) { - operandCount = 2; - operands[1].type = ARM64_OP_IMM; - operands[1].imm = 1; - operands[1].access = CS_AC_READ; - operands[1].shift = {ARM64_SFT_INVALID, 0}; - operands[1].ext = ARM64_EXT_INVALID; - operands[1].vector_index = -1; + case Opcode::AArch64_ZERO_M: { + // Incorrect access type: All are READ but should all be WRITE + for (int i = 0; i < operandCount; i++) { + operands[i].access = CS_AC_WRITE; } break; } - case Opcode::AArch64_INCP_XP_B: - [[fallthrough]]; - case Opcode::AArch64_INCP_XP_D: - [[fallthrough]]; - case Opcode::AArch64_INCP_XP_H: - [[fallthrough]]; - case Opcode::AArch64_INCP_XP_S: - operands[0].access = CS_AC_READ | CS_AC_WRITE; - operands[1].access = CS_AC_READ; - case Opcode::AArch64_LD1i32: - [[fallthrough]]; - case Opcode::AArch64_LD1i64: - operands[1].access = CS_AC_READ; - break; - case Opcode::AArch64_GLD1W_D_SCALED_REAL: { - // Access types are not set correctly - 
operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - break; - } - case Opcode::AArch64_GLD1D_SCALED_REAL: - [[fallthrough]]; - case Opcode::AArch64_GLD1D_REAL: { - // LD1D gather instruction doesn't correctly identify destination - // register - uint16_t reg_enum = ARM64_REG_Z0; - // Single or double digit Z register identifier - if (operandStr[3] == '.') { - reg_enum += std::stoi(operandStr.substr(2, 1)); - } else { - reg_enum += std::stoi(operandStr.substr(2, 2)); - } - operands[0].reg = static_cast(reg_enum); + } - // No defined access types - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - // LD1D gather instruction doesn't correctly identify memory operands - operands[2].type = ARM64_OP_MEM; - operands[2].access = CS_AC_READ; - - // LD1D doesn't correctly identify vector memory register correctly - uint16_t vec_enum = ARM64_REG_Z0; - std::string tmp_str(operandStr.substr(operandStr.find("["))); - // Single or double digit Z register identifier - if (tmp_str.substr(tmp_str.find("z"))[2] == '.') { - vec_enum += std::stoi(tmp_str.substr(tmp_str.find("z") + 1, 1)); - } else { - vec_enum += std::stoi(tmp_str.substr(tmp_str.find("z") + 1, 2)); - } - operands[2].mem.index = static_cast(vec_enum); - break; - } - case Opcode::AArch64_LD1RQ_W_IMM: { - // LD1RQW doesn't identify correct access types - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - break; - } - case Opcode::AArch64_LD1RQ_D_IMM: { - // LD1RQ gather instruction doesn't correctly identify destination - // register - uint16_t reg_enum = ARM64_REG_Z0; - // Single or double digit Z register identifier - if (operandStr[3] == '.') { - reg_enum += std::stoi(operandStr.substr(2, 1)); - } else { - reg_enum += std::stoi(operandStr.substr(2, 2)); - } - operands[0].reg = static_cast(reg_enum); - - // No defined access types - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - // LD1RQ gather instruction doesn't correctly identify memory operands - operands[2].type = ARM64_OP_MEM; - operands[2].access = CS_AC_READ; - break; - } - case Opcode::AArch64_GLD1SW_D_IMM_REAL: - [[fallthrough]]; - case Opcode::AArch64_GLD1D_IMM_REAL: { - // LD1D gather instruction doesn't correctly identify destination - // register - uint16_t reg_enum = ARM64_REG_Z0; - // Single or double digit Z register identifier - if (operandStr[3] == '.') { - reg_enum += std::stoi(operandStr.substr(2, 1)); - } else { - reg_enum += std::stoi(operandStr.substr(2, 2)); - } - - operands[0].reg = static_cast(reg_enum); - // No defined access types - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - // LD1D gather instruction doesn't correctly identify second Z reg as - // memory operand - operands[2].type = ARM64_OP_MEM; - operands[2].access = CS_AC_READ; - // LD1D gather instruction doesn't recognise memory-offset immediate - // correctly - if (operandStr[operandStr.length() - 3] != '.') { - int64_t startPos = operandStr.find('#') + 1; - int64_t immSize = (operandStr.length() - 1) - startPos; - if (immSize == 1) { - operands[2].mem.disp = - std::stoi(operandStr.substr(startPos, immSize)); - } else { - // double or tripple digit immediates are converted to hex, and so - // require a different conversion to uint - operands[2].mem.disp = - std::stoul(operandStr.substr(startPos, immSize), nullptr, 16); - } - } - break; - } - case Opcode::AArch64_LD1B: - [[fallthrough]]; - case Opcode::AArch64_LD1D: - [[fallthrough]]; - case Opcode::AArch64_LD1D_IMM_REAL: - [[fallthrough]]; - case 
Opcode::AArch64_LD1RD_IMM: - [[fallthrough]]; - case Opcode::AArch64_LD1RW_IMM: - [[fallthrough]]; - case Opcode::AArch64_LD1H: - [[fallthrough]]; - case Opcode::AArch64_LD1W: - [[fallthrough]]; - case Opcode::AArch64_LD1W_IMM_REAL: { - // LD1RW doesn't correctly identify destination register - uint16_t reg_enum = ARM64_REG_Z0; - // Single or double digit Z register identifier - if (operandStr[3] == '.') { - reg_enum += std::stoi(operandStr.substr(2, 1)); - } else { - reg_enum += std::stoi(operandStr.substr(2, 2)); - } - - operands[0].reg = static_cast(reg_enum); - // No defined access types - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - break; - } - case Opcode::AArch64_LD1Rv4s: - [[fallthrough]]; - case Opcode::AArch64_LD1Rv1d: - [[fallthrough]]; - case Opcode::AArch64_LD1Rv2d: - [[fallthrough]]; - case Opcode::AArch64_LD1Rv2s: - [[fallthrough]]; - case Opcode::AArch64_LD1Rv8b: - [[fallthrough]]; - case Opcode::AArch64_LD1Rv16b: - [[fallthrough]]; - case Opcode::AArch64_LD1Rv8h: - [[fallthrough]]; - case Opcode::AArch64_LD1Rv4h: - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - break; - case Opcode::AArch64_LD1Rv4h_POST: - [[fallthrough]]; - case Opcode::AArch64_LD1Rv8h_POST: - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ | CS_AC_WRITE; - // Fix for exclusion of post_index immediate in disassembly - operandCount = 3; - operands[2].type = ARM64_OP_IMM; - operands[2].access = CS_AC_READ; - // For vector arrangment of 16-bit, post_index immediate is 2 - operands[2].imm = 2; - break; - case Opcode::AArch64_LD1Rv1d_POST: - [[fallthrough]]; - case Opcode::AArch64_LD1Rv2d_POST: - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ | CS_AC_WRITE; - // Fix for exclusion of post_index immediate in disassembly - operandCount = 3; - operands[2].type = ARM64_OP_IMM; - operands[2].access = CS_AC_READ; - // For vector arrangment of 64-bit, post_index immediate is 8 - operands[2].imm = 8; - break; - case Opcode::AArch64_LD1Rv16b_POST: - [[fallthrough]]; - case Opcode::AArch64_LD1Rv8b_POST: - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ | CS_AC_WRITE; - - // Fix for exclusion of post_index immediate in disassembly - operandCount = 3; - operands[2].type = ARM64_OP_IMM; - operands[2].access = CS_AC_READ; - // For vector arrangment of 8-bit, post_index immediate is 1 - operands[2].imm = 1; - break; - case Opcode::AArch64_LD1Rv2s_POST: - [[fallthrough]]; - case Opcode::AArch64_LD1Rv4s_POST: - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ | CS_AC_WRITE; - - // Fix for exclusion of post_index immediate in disassembly - operandCount = 3; - operands[2].type = ARM64_OP_IMM; - operands[2].access = CS_AC_READ; - // For vector arrangment of 32-bit, post_index immediate is 4 - operands[2].imm = 4; - break; - case Opcode::AArch64_LD1Onev16b: - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - break; - case Opcode::AArch64_LD1Onev16b_POST: - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ | CS_AC_WRITE; - break; - case Opcode::AArch64_LD1Twov16b: - [[fallthrough]]; - case Opcode::AArch64_LD1Twov16b_POST: - // Fix incorrect access types - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_WRITE; - break; - case Opcode::AArch64_LDADDLW: - [[fallthrough]]; - case Opcode::AArch64_LDADDW: - operands[0].access = CS_AC_READ; - operands[1].access = CS_AC_WRITE; - break; - case Opcode::AArch64_LD2Twov4s_POST: - 
// Fixing wrong access flag for offset register operand - if (operandCount == 4) { - operands[3].access = CS_AC_READ; - } - break; - case Opcode::AArch64_LDR_PXI: - [[fallthrough]]; - case Opcode::AArch64_LDR_ZXI: - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - break; - case Opcode::AArch64_LSL_ZZI_S: - // No defined access types - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - // No instruction id assigned - id = ARM64_INS_LSL; - break; - case Opcode::AArch64_LD2D: - case Opcode::AArch64_LD2D_IMM: { - // LD2D doesn't correctly identify destination registers - uint16_t reg_enum0 = ARM64_REG_Z0; - uint16_t reg_enum1 = ARM64_REG_Z0; - - // tmpOpStr = "zxx.d, zyy.d" - std::string tmpOpStr(operandStr.substr(1, operandStr.find("}") - 1)); - // get dest0, then remove from string - // Single or double digit Z register identifier - if (tmpOpStr[2] == '.') { - reg_enum0 += std::stoi(tmpOpStr.substr(1, 1)); - tmpOpStr.erase(0, 6); - } else { - reg_enum0 += std::stoi(tmpOpStr.substr(1, 2)); - tmpOpStr.erase(0, 7); - } - // get dest1, then remove from string - // Single or double digit Z register identifier - if (tmpOpStr[2] == '.') { - reg_enum1 += std::stoi(tmpOpStr.substr(1, 1)); - } else { - reg_enum1 += std::stoi(tmpOpStr.substr(1, 2)); - } - - operands[0].reg = static_cast(reg_enum0); - operands[0].access = CS_AC_WRITE; - operands[1].reg = static_cast(reg_enum1); - operands[1].access = CS_AC_WRITE; - - operands[2].access = CS_AC_READ; - operands[3].access = CS_AC_READ; - break; - } - case Opcode::AArch64_LD3D_IMM: { - // LD3D doesn't correctly identify destination registers - uint16_t reg_enum0 = ARM64_REG_Z0; - uint16_t reg_enum1 = ARM64_REG_Z0; - uint16_t reg_enum2 = ARM64_REG_Z0; - - // tmpOpStr = "zxx.d, zyy.d, znn.d" - std::string tmpOpStr(operandStr.substr(1, operandStr.find("}") - 1)); - // get dest0, then remove from string - // Single or double digit Z register identifier - if (tmpOpStr[2] == '.') { - reg_enum0 += std::stoi(tmpOpStr.substr(1, 1)); - tmpOpStr.erase(0, 6); - } else { - reg_enum0 += std::stoi(tmpOpStr.substr(1, 2)); - tmpOpStr.erase(0, 7); - } - // get dest1, then remove from string - // Single or double digit Z register identifier - if (tmpOpStr[2] == '.') { - reg_enum1 += std::stoi(tmpOpStr.substr(1, 1)); - tmpOpStr.erase(0, 6); - } else { - reg_enum1 += std::stoi(tmpOpStr.substr(1, 2)); - tmpOpStr.erase(0, 7); - } - // get dest2 - // Single or double digit Z register identifier - if (tmpOpStr[2] == '.') { - reg_enum2 += std::stoi(tmpOpStr.substr(1, 1)); - } else { - reg_enum2 += std::stoi(tmpOpStr.substr(1, 2)); - } - - operands[0].reg = static_cast(reg_enum0); - operands[0].access = CS_AC_WRITE; - operands[1].reg = static_cast(reg_enum1); - operands[1].access = CS_AC_WRITE; - operands[2].reg = static_cast(reg_enum2); - operands[2].access = CS_AC_WRITE; - - operands[3].access = CS_AC_READ; - operands[4].access = CS_AC_READ; - break; - } - case Opcode::AArch64_LD4D_IMM: { - // LD4D doesn't correctly identify destination registers - uint16_t reg_enum0 = ARM64_REG_Z0; - uint16_t reg_enum1 = ARM64_REG_Z0; - uint16_t reg_enum2 = ARM64_REG_Z0; - uint16_t reg_enum3 = ARM64_REG_Z0; - - // tmpOpStr = "zxx.d, zyy.d, znn.d, zmm.d" - std::string tmpOpStr(operandStr.substr(1, operandStr.find("}") - 1)); - // get dest0, then remove from string - // Single or double digit Z register identifier - if (tmpOpStr[2] == '.') { - reg_enum0 += std::stoi(tmpOpStr.substr(1, 1)); - tmpOpStr.erase(0, 6); - } else 
{ - reg_enum0 += std::stoi(tmpOpStr.substr(1, 2)); - tmpOpStr.erase(0, 7); - } - // get dest1, then remove from string - // Single or double digit Z register identifier - if (tmpOpStr[2] == '.') { - reg_enum1 += std::stoi(tmpOpStr.substr(1, 1)); - tmpOpStr.erase(0, 6); - } else { - reg_enum1 += std::stoi(tmpOpStr.substr(1, 2)); - tmpOpStr.erase(0, 7); - } - // get dest2 - // Single or double digit Z register identifier - if (tmpOpStr[2] == '.') { - reg_enum2 += std::stoi(tmpOpStr.substr(1, 1)); - tmpOpStr.erase(0, 6); - } else { - reg_enum2 += std::stoi(tmpOpStr.substr(1, 2)); - tmpOpStr.erase(0, 7); - } - // get dest3 - // Single or double digit Z register identifier - if (tmpOpStr[2] == '.') { - reg_enum3 += std::stoi(tmpOpStr.substr(1, 1)); - } else { - reg_enum3 += std::stoi(tmpOpStr.substr(1, 2)); - } - - operands[0].reg = static_cast(reg_enum0); - operands[0].access = CS_AC_WRITE; - operands[1].reg = static_cast(reg_enum1); - operands[1].access = CS_AC_WRITE; - operands[2].reg = static_cast(reg_enum2); - operands[2].access = CS_AC_WRITE; - operands[3].reg = static_cast(reg_enum3); - operands[3].access = CS_AC_WRITE; - - operands[4].access = CS_AC_READ; - operands[5].access = CS_AC_READ; - break; - } - case Opcode::AArch64_MOVNWi: - [[fallthrough]]; - case Opcode::AArch64_MOVNXi: - [[fallthrough]]; - case Opcode::AArch64_MOVZWi: - [[fallthrough]]; - case Opcode::AArch64_MOVZXi: - // MOVZ incorrectly flags destination as READ | WRITE - operands[0].access = CS_AC_WRITE; - break; - case Opcode::AArch64_MOVPRFX_ZZ: - // Assign operand access types - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - break; - case Opcode::AArch64_MRS: - // MRS incorrectly flags source/destination as READ | WRITE - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - // MRS incorrectly tags ARM64_OP_REG_MRS as ARM64_OP_SYS - operands[1].type = ARM64_OP_REG_MRS; - break; - case Opcode::AArch64_MSR: - // MSR incorrectly flags source/destination as READ | WRITE - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - // MSR incorrectly tags ARM64_OP_REG_MSR as ARM64_OP_SYS - operands[0].type = ARM64_OP_REG_MSR; - break; - case Opcode::AArch64_PTEST_PP: { - // PTEST doesn't label access types for operands - operands[0].access = CS_AC_READ; - operands[1].access = CS_AC_READ; - // Doesn't identify implicit NZCV destination - implicitDestinationCount = 1; - implicitDestinations[0] = ARM64_REG_NZCV; - break; - } - case Opcode::AArch64_PTRUE_B: - [[fallthrough]]; - case Opcode::AArch64_PTRUE_H: - [[fallthrough]]; - case Opcode::AArch64_PTRUE_D: - [[fallthrough]]; - case Opcode::AArch64_PTRUE_S: - // PTRUE doesn't label access - operands[0].access = CS_AC_WRITE; - break; - case Opcode::AArch64_RET: - // If no register supplied to RET, default to x30 (LR) - if (operandCount == 0) { - operandCount = 1; - operands[0].type = ARM64_OP_REG; - operands[0].reg = ARM64_REG_LR; - operands[0].access = CS_AC_READ; - } - groupCount = 1; - groups[0] = CS_GRP_JUMP; - break; - case Opcode::AArch64_REV_ZZ_B: - [[fallthrough]]; - case Opcode::AArch64_REV_ZZ_D: - [[fallthrough]]; - case Opcode::AArch64_REV_ZZ_H: - [[fallthrough]]; - case Opcode::AArch64_REV_ZZ_S: - [[fallthrough]]; - case Opcode::AArch64_REV_PP_B: - [[fallthrough]]; - case Opcode::AArch64_REV_PP_D: - [[fallthrough]]; - case Opcode::AArch64_REV_PP_H: - [[fallthrough]]; - case Opcode::AArch64_REV_PP_S: { - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - break; - } - case 
Opcode::AArch64_SST1B_D_REAL: - [[fallthrough]]; - case Opcode::AArch64_SST1D_REAL: - [[fallthrough]]; - case Opcode::AArch64_SST1D_SCALED_SCALED_REAL: { - // ST1W doesn't correctly identify first source register - uint16_t reg_enum = ARM64_REG_Z0; - // Single or double digit Z register identifier - if (operandStr[3] == '.') { - reg_enum += std::stoi(operandStr.substr(2, 1)); - } else { - reg_enum += std::stoi(operandStr.substr(2, 2)); - } - - operands[0].reg = static_cast(reg_enum); - // No defined access types - operands[0].access = CS_AC_READ; - operands[1].access = CS_AC_READ; - // SST1D{_SCALED} gather instruction doesn't correctly identify memory - // operands - operands[2].type = ARM64_OP_MEM; - operands[2].access = CS_AC_READ; - - // ST1D doesn't correctly identify vector memory register correctly - uint16_t vec_enum = ARM64_REG_Z0; - std::string tmp_str(operandStr.substr(operandStr.find("["))); - // Single or double digit Z register identifier - if (tmp_str.substr(tmp_str.find("z"))[2] == '.') { - vec_enum += std::stoi(tmp_str.substr(tmp_str.find("z") + 1, 1)); - } else { - vec_enum += std::stoi(tmp_str.substr(tmp_str.find("z") + 1, 2)); - } - operands[2].mem.index = static_cast(vec_enum); - break; - } - case Opcode::AArch64_ST2D_IMM: { - // ST2D doesn't correctly identify destination registers - uint16_t reg_enum0 = ARM64_REG_Z0; - uint16_t reg_enum1 = ARM64_REG_Z0; - - // tmpOpStr = "zxx.d, zyy.d" - std::string tmpOpStr(operandStr.substr(1, operandStr.find("}") - 1)); - // get dest0, then remove from string - // Single or double digit Z register identifier - if (tmpOpStr[2] == '.') { - reg_enum0 += std::stoi(tmpOpStr.substr(1, 1)); - tmpOpStr.erase(0, 6); - } else { - reg_enum0 += std::stoi(tmpOpStr.substr(1, 2)); - tmpOpStr.erase(0, 7); - } - // get dest1, then remove from string - // Single or double digit Z register identifier - if (tmpOpStr[2] == '.') { - reg_enum1 += std::stoi(tmpOpStr.substr(1, 1)); - tmpOpStr.erase(0, 6); - } else { - reg_enum1 += std::stoi(tmpOpStr.substr(1, 2)); - tmpOpStr.erase(0, 7); - } - - operands[0].reg = static_cast(reg_enum0); - operands[0].access = CS_AC_READ; - operands[1].reg = static_cast(reg_enum1); - operands[1].access = CS_AC_READ; - - operands[2].access = CS_AC_READ; - operands[3].access = CS_AC_READ; - break; - } - case Opcode::AArch64_ST1B: - [[fallthrough]]; - case Opcode::AArch64_ST1D: - [[fallthrough]]; - case Opcode::AArch64_ST1D_IMM: - [[fallthrough]]; - case Opcode::AArch64_ST1W_IMM: { - // ST1W doesn't correctly identify first source register - uint16_t reg_enum = ARM64_REG_Z0; - // Single or double digit Z register identifier - if (operandStr[3] == '.') { - reg_enum += std::stoi(operandStr.substr(2, 1)); - } else { - reg_enum += std::stoi(operandStr.substr(2, 2)); - } - - operands[0].reg = static_cast(reg_enum); - // No defined access types - operands[0].access = CS_AC_READ; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - break; - } - case Opcode::AArch64_ST1W: - [[fallthrough]]; - case Opcode::AArch64_ST1W_D: { - // ST1W doesn't correctly identify first source register - uint16_t reg_enum = ARM64_REG_Z0; - // Single or double digit Z register identifier - if (operandStr[3] == '.') { - reg_enum += std::stoi(operandStr.substr(2, 1)); - } else { - reg_enum += std::stoi(operandStr.substr(2, 2)); - } - - operands[0].reg = static_cast(reg_enum); - // No defined access types - operands[0].access = CS_AC_READ; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - operands[3].access = 
CS_AC_READ; - break; - } - case Opcode::AArch64_SST1D_IMM: - [[fallthrough]]; - case Opcode::AArch64_SST1W_D_IMM: - [[fallthrough]]; - case Opcode::AArch64_SST1W_IMM: { - // ST1W scatter instruction doesn't correctly identify first source - // register - uint16_t reg_enum = ARM64_REG_Z0; - // Single or double digit Z register identifier - if (operandStr[3] == '.') { - reg_enum += std::stoi(operandStr.substr(2, 1)); - } else { - reg_enum += std::stoi(operandStr.substr(2, 2)); - } - - operands[0].reg = static_cast(reg_enum); - // No defined access types - operands[0].access = CS_AC_READ; - operands[1].access = CS_AC_READ; - // ST1W scatter instruction doesn't correctly identify second Z reg as - // memory operand - operands[2].type = ARM64_OP_MEM; - operands[2].access = CS_AC_READ; - // ST1W scatter instruction doesn't recognise memory-offset immediate - // correctly - if (operandStr[operandStr.length() - 3] != '.') { - int64_t startPos = operandStr.find('#') + 1; - int64_t immSize = (operandStr.length() - 1) - startPos; - if (immSize == 1) { - operands[2].mem.disp = - std::stoi(operandStr.substr(startPos, immSize)); - } else { - // double or tripple digit immediates are converted to hex, and so - // require a different conversion to uint - operands[2].mem.disp = - std::stoul(operandStr.substr(startPos, immSize), nullptr, 16); - } - } - break; - } - case Opcode::AArch64_ST1Fourv2s_POST: - [[fallthrough]]; - case Opcode::AArch64_ST1Fourv4s_POST: { - // ST1 four vectors doesn't set access rights correctly - operands[0].access = CS_AC_READ; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - operands[3].access = CS_AC_READ; - // operands[4] is memory + write-back enabled already - if (operandCount == 6) operands[5].access = CS_AC_READ; - break; - } - case Opcode::AArch64_ST1i8_POST: - [[fallthrough]]; - case Opcode::AArch64_ST1i16_POST: - [[fallthrough]]; - case Opcode::AArch64_ST1i32_POST: - [[fallthrough]]; - case Opcode::AArch64_ST1i64_POST: - // fixing incorrect access type for register offset - if (operandCount == 3) { - operands[2].access = CS_AC_READ; - } - break; - case Opcode::AArch64_ST1Twov4s: - [[fallthrough]]; - case Opcode::AArch64_ST1Twov16b: - // ST1 incorrectly flags read and write - operands[1].access = CS_AC_READ; - break; - case Opcode::AArch64_ST2Twov4s_POST: - // ST2 post incorrectly flags read and write - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ | CS_AC_WRITE; - // Another incorrect acess flag for register offset operand - if (operandCount == 4) { - operands[3].access = CS_AC_READ; - } - break; - case Opcode::AArch64_STRBui: - [[fallthrough]]; - case Opcode::AArch64_STRDui: - [[fallthrough]]; - case Opcode::AArch64_STRHui: - [[fallthrough]]; - case Opcode::AArch64_STRQui: - [[fallthrough]]; - case Opcode::AArch64_STRSui: - [[fallthrough]]; - case Opcode::AArch64_STRWui: - [[fallthrough]]; - case Opcode::AArch64_STRXui: - operands[1].access = CS_AC_READ; - break; - case Opcode::AArch64_STR_PXI: - [[fallthrough]]; - case Opcode::AArch64_STR_ZXI: - operands[0].access = CS_AC_READ; - operands[1].access = CS_AC_READ; - break; - case Opcode::AArch64_SBFMWri: - [[fallthrough]]; - case Opcode::AArch64_SBFMXri: - // SBFM incorrectly flags destination as READ | WRITE - operands[0].access = CS_AC_WRITE; - break; - case Opcode::AArch64_SVC: - // SVC is incorrectly marked as setting x30 - implicitDestinationCount = 0; - break; - case Opcode::AArch64_SYSxt: - // No defined metadata.id for SYS instructions - id = ARM64_INS_SYS; - break; - 
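// Illustrative sketch (not SimEng source and not part of this patch): the
// fix-ups above repeatedly recover register numbers and memory-offset
// immediates from Capstone's printed operand string when the structured
// operand data is missing or wrong. A minimal standalone version of that
// parsing pattern, assuming operand strings such as "z3.d" / "z12.s" and a
// Z-register base enum value; parseZRegister and parseMemOffsetImm are
// made-up helper names used only for this example.
#include <cstdint>
#include <string>

// Returns the register enum for a single- or double-digit Z register name.
uint16_t parseZRegister(const std::string& str, uint16_t zRegBase) {
  // "z3.d" -> '.' at index 2 -> one digit; "z12.s" -> two digits.
  const size_t digits = (str[2] == '.') ? 1 : 2;
  return zRegBase + static_cast<uint16_t>(std::stoi(str.substr(1, digits)));
}

// Returns the "#imm" memory offset printed at the end of an operand string,
// e.g. "..., #3]" or "..., #0x2a]". As noted for the ST1W scatter case above,
// single-digit immediates are printed in decimal and longer ones in hex.
int64_t parseMemOffsetImm(const std::string& str) {
  const size_t start = str.find('#') + 1;
  const std::string imm = str.substr(start, (str.length() - 1) - start);
  return (imm.length() == 1) ? std::stoi(imm) : std::stol(imm, nullptr, 16);
}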
case Opcode::AArch64_PSEL_PPPRI_B: - [[fallthrough]]; - case Opcode::AArch64_PSEL_PPPRI_D: - [[fallthrough]]; - case Opcode::AArch64_PSEL_PPPRI_H: - [[fallthrough]]; - case Opcode::AArch64_PSEL_PPPRI_S: - // Add correct access types - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - break; - case Opcode::AArch64_UBFMWri: - [[fallthrough]]; - case Opcode::AArch64_UBFMXri: - // UBFM incorrectly flags destination as READ | WRITE - operands[0].access = CS_AC_WRITE; - break; - case Opcode::AArch64_UQDECB_WPiI: - [[fallthrough]]; - case Opcode::AArch64_UQDECB_XPiI: - [[fallthrough]]; - case Opcode::AArch64_UQDECD_WPiI: - [[fallthrough]]; - case Opcode::AArch64_UQDECD_XPiI: - [[fallthrough]]; - case Opcode::AArch64_UQDECH_WPiI: - [[fallthrough]]; - case Opcode::AArch64_UQDECH_XPiI: - [[fallthrough]]; - case Opcode::AArch64_UQDECW_WPiI: - [[fallthrough]]; - case Opcode::AArch64_UQDECW_XPiI: - // UQDEC lacks access types - operands[0].access = CS_AC_READ | CS_AC_WRITE; - if (operandCount == 1) { - operandCount = 2; - operands[1].type = ARM64_OP_IMM; - operands[1].imm = 1; - } - operands[1].access = CS_AC_READ; - break; - case Opcode::AArch64_UUNPKHI_ZZ_D: - [[fallthrough]]; - case Opcode::AArch64_UUNPKHI_ZZ_H: - [[fallthrough]]; - case Opcode::AArch64_UUNPKHI_ZZ_S: - [[fallthrough]]; - case Opcode::AArch64_UUNPKLO_ZZ_D: - [[fallthrough]]; - case Opcode::AArch64_UUNPKLO_ZZ_H: - [[fallthrough]]; - case Opcode::AArch64_UUNPKLO_ZZ_S: - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - break; - case Opcode::AArch64_WHILELT_PXX_B: - [[fallthrough]]; - case Opcode::AArch64_WHILELT_PXX_D: - [[fallthrough]]; - case Opcode::AArch64_WHILELT_PXX_H: - [[fallthrough]]; - case Opcode::AArch64_WHILELT_PXX_S: - [[fallthrough]]; - case Opcode::AArch64_WHILELO_PWW_B: - [[fallthrough]]; - case Opcode::AArch64_WHILELO_PWW_D: - [[fallthrough]]; - case Opcode::AArch64_WHILELO_PWW_H: - [[fallthrough]]; - case Opcode::AArch64_WHILELO_PWW_S: - [[fallthrough]]; - case Opcode::AArch64_WHILELO_PXX_B: - [[fallthrough]]; - case Opcode::AArch64_WHILELO_PXX_D: - [[fallthrough]]; - case Opcode::AArch64_WHILELO_PXX_H: - [[fallthrough]]; - case Opcode::AArch64_WHILELO_PXX_S: - // WHILELO doesn't label access or vector specifiers - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - // Doesn't identify implicit NZCV destination - implicitDestinationCount = 1; - implicitDestinations[0] = ARM64_REG_NZCV; - break; - case Opcode::AArch64_XTNv16i8: - case Opcode::AArch64_XTNv4i32: - case Opcode::AArch64_XTNv8i16: - // XTN2 incorrectly flags destination as only WRITE - operands[0].access = CS_AC_READ | CS_AC_WRITE; - break; - case Opcode::AArch64_ZIP1_PPP_B: - [[fallthrough]]; - case Opcode::AArch64_ZIP1_PPP_D: - [[fallthrough]]; - case Opcode::AArch64_ZIP1_PPP_H: - [[fallthrough]]; - case Opcode::AArch64_ZIP1_PPP_S: - [[fallthrough]]; - case Opcode::AArch64_ZIP1_ZZZ_S: - [[fallthrough]]; - case Opcode::AArch64_ZIP1_ZZZ_D: - [[fallthrough]]; - case Opcode::AArch64_ZIP2_PPP_B: - [[fallthrough]]; - case Opcode::AArch64_ZIP2_PPP_D: - [[fallthrough]]; - case Opcode::AArch64_ZIP2_PPP_H: - [[fallthrough]]; - case Opcode::AArch64_ZIP2_PPP_S: - [[fallthrough]]; - case Opcode::AArch64_ZIP2_ZZZ_S: - [[fallthrough]]; - case Opcode::AArch64_ZIP2_ZZZ_D: - // ZIP lacks access types - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - break; - case 
Opcode::AArch64_SXTW_ZPmZ_D: - [[fallthrough]]; - case Opcode::AArch64_FCVT_ZPmZ_DtoS: - [[fallthrough]]; - case Opcode::AArch64_FCVT_ZPmZ_StoD: - [[fallthrough]]; - case Opcode::AArch64_SCVTF_ZPmZ_DtoS: - [[fallthrough]]; - case Opcode::AArch64_SCVTF_ZPmZ_StoD: - [[fallthrough]]; - case Opcode::AArch64_SCVTF_ZPmZ_StoS: - [[fallthrough]]; - case Opcode::AArch64_SCVTF_ZPmZ_DtoD: - // Need to see if Destination vector elements are active - operands[0].access = CS_AC_READ | CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - break; - case Opcode::AArch64_TBLv8i8One: - [[fallthrough]]; - case Opcode::AArch64_TBLv16i8One: - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - break; - case Opcode::AArch64_TBLv8i8Two: - [[fallthrough]]; - case Opcode::AArch64_TBLv16i8Two: - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - operands[3].access = CS_AC_READ; - break; - case Opcode::AArch64_TBLv8i8Three: - [[fallthrough]]; - case Opcode::AArch64_TBLv16i8Three: - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - operands[3].access = CS_AC_READ; - operands[4].access = CS_AC_READ; - break; - case Opcode::AArch64_TBLv8i8Four: - [[fallthrough]]; - case Opcode::AArch64_TBLv16i8Four: - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - operands[3].access = CS_AC_READ; - operands[4].access = CS_AC_READ; - operands[5].access = CS_AC_READ; - break; - case Opcode::AArch64_LD1_MXIPXX_V_S: - [[fallthrough]]; - case Opcode::AArch64_LD1_MXIPXX_H_S: { - // Lacking access specifiers - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - break; - } - case Opcode::AArch64_ST1_MXIPXX_H_S: - [[fallthrough]]; - case Opcode::AArch64_ST1_MXIPXX_V_S: - // Access types are not defined - operands[0].access = CS_AC_READ; - operands[1].access = CS_AC_READ; - break; - case Opcode::AArch64_FMOPA_MPPZZ_S: { - // Need to add access specifiers - // although operands[0] should be READ | WRITE, due to the implemented - // decode logic for SME tile destinations, the register will be added as - // both source and distination with just WRITE access. - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - operands[3].access = CS_AC_READ; - operands[4].access = CS_AC_READ; - operands[5].access = CS_AC_READ; - break; - } - case Opcode::AArch64_ZERO_M: { - // Operands often mangled from ZA tile overlap aliasing in decode. 
Need to - // re-extract relevant tiles from operandStr - operandCount = 0; - size_t pos = operandStr.find("za", 0); - while (pos != std::string::npos) { - size_t pos_2 = operandStr.find(".", pos); - if (pos_2 != std::string::npos) { - char type = operandStr[pos_2 + 1]; - // Tile Number can only ever be 1 digit - uint8_t tileNum = std::stoi(operandStr.substr((pos + 2), 1)); - switch (type) { - case 'b': - operands[operandCount].reg = ARM64_REG_ZAB0; - break; - case 'h': - operands[operandCount].reg = - static_cast(ARM64_REG_ZAH0 + tileNum); - break; - case 's': - operands[operandCount].reg = - static_cast(ARM64_REG_ZAS0 + tileNum); - break; - case 'd': - operands[operandCount].reg = - static_cast(ARM64_REG_ZAD0 + tileNum); - break; - case 'q': - operands[operandCount].reg = - static_cast(ARM64_REG_ZAQ0 + tileNum); - break; - } - } else { - operands[operandCount].reg = ARM64_REG_ZA; - } - operands[operandCount].type = ARM64_OP_REG; - operands[operandCount].access = CS_AC_WRITE; - operandCount++; - pos = operandStr.find("za", pos + 1); - } - break; - } + if (isAlias) { + exceptionString_ = + "This instruction is an alias. The printed mnemonic and operand string " + "differ from what is expected of the Capstone opcode."; } - - revertAliasing(); } InstructionMetadata::InstructionMetadata(const uint8_t* invalidEncoding, uint8_t bytes) - : id(ARM64_INS_INVALID), - opcode(Opcode::AArch64_INSTRUCTION_LIST_END), + : id(AARCH64_INS_INVALID), + opcode(Opcode::INSTRUCTION_LIST_END), implicitSourceCount(0), implicitDestinationCount(0), groupCount(0), setsFlags(false), - writeback(false), + isAlias(false), operandCount(0) { assert(bytes <= sizeof(encoding)); std::memcpy(encoding, invalidEncoding, bytes); @@ -1611,809 +299,6 @@ InstructionMetadata::InstructionMetadata(const uint8_t* invalidEncoding, operandStr[0] = '\0'; } -void InstructionMetadata::revertAliasing() { - // Check mnemonics known to be aliases and see if their opcode matches - // something else - switch (id) { - case ARM64_INS_ASR: - if (opcode == Opcode::AArch64_ASRVWr || - opcode == Opcode::AArch64_ASRVXr) { - // asr rd, rn, rm; alias for: asrv rd, rn, rm - return; - } - if (opcode == Opcode::AArch64_SBFMWri || - opcode == Opcode::AArch64_SBFMXri) { - operandCount = 4; - - operands[3].type = ARM64_OP_IMM; - operands[3].access = CS_AC_READ; - if (opcode == Opcode::AArch64_SBFMWri) { - // 32-bit - operands[3].imm = 31; - } else { - operands[3].imm = 63; - } - return; - } - return aliasNYI(); - case ARM64_INS_AT: - return aliasNYI(); - case ARM64_INS_BFI: - if (opcode == Opcode::AArch64_BFMWri) { - // bfi wd, wn, #lsb, #width; alias for - // bfm wd, wn, #(-lsb MOD 32), #(width - 1) - operands[2].imm = static_cast(-operands[2].imm) % 32; - operands[3].imm = operands[3].imm - 1; - return; - } - if (opcode == Opcode::AArch64_BFMXri) { - // bfi xd, xn, #lsb, #width; alias for - // bfm xd, xn, #(-lsb MOD 64), #(width - 1) - operands[2].imm = static_cast(-operands[2].imm) % 64; - operands[3].imm = operands[3].imm - 1; - return; - } - return aliasNYI(); - case ARM64_INS_BFXIL: - if (opcode == Opcode::AArch64_BFMWri || - opcode == Opcode::AArch64_BFMXri) { - // bfxil rd, rn, #lsb, #width; alias for - // bfm rd, rn, #lsb, #(lsb + width - 1) - operands[3].imm = operands[2].imm + operands[3].imm - 1; - return; - } - return aliasNYI(); - case ARM64_INS_CINC: - if (opcode == Opcode::AArch64_CSINCWr || - opcode == Opcode::AArch64_CSINCXr) { - // cinc rd, rn, cc; alias for: csinc rd, rn, rn, invert(cc) - operandCount = 3; - - operands[2].type = 
ARM64_OP_REG; - operands[2].access = CS_AC_READ; - operands[2].reg = operands[1].reg; - - cc ^= 1; // invert lowest bit to negate cc - return; - } - return aliasNYI(); - case ARM64_INS_CINV: - return aliasNYI(); - case ARM64_INS_CMN: - // cmn , alias for adds - operandCount = 3; - operands[2] = operands[1]; - operands[1] = operands[0]; - operands[1].access = CS_AC_READ; - - operands[0].type = ARM64_OP_REG; - operands[0].access = CS_AC_WRITE; - - if (opcode == Opcode::AArch64_ADDSXri || - opcode == Opcode::AArch64_ADDSXrr || - opcode == Opcode::AArch64_ADDSXrs) { - // 64-bit version - operands[0].reg = ARM64_REG_XZR; - } else { - // 32-bit version - operands[0].reg = ARM64_REG_WZR; - } - return; - case ARM64_INS_CMP: - if (opcode == Opcode::AArch64_SUBSWri || - opcode == Opcode::AArch64_SUBSWrs || - opcode == Opcode::AArch64_SUBSWrx || - opcode == Opcode::AArch64_SUBSXri || - opcode == Opcode::AArch64_SUBSXrs || - opcode == Opcode::AArch64_SUBSXrx || - opcode == Opcode::AArch64_SUBSXrx64) { - operandCount = 3; - operands[2] = operands[1]; - - operands[1] = operands[0]; - operands[1].access = CS_AC_READ; - - operands[0].type = ARM64_OP_REG; - operands[0].access = CS_AC_WRITE; - - if (opcode == Opcode::AArch64_SUBSWri || - opcode == Opcode::AArch64_SUBSWrs || - opcode == Opcode::AArch64_SUBSWrx) { - operands[0].reg = ARM64_REG_WZR; - } else { - operands[0].reg = ARM64_REG_XZR; - } - return; - } - return aliasNYI(); - case ARM64_INS_CNEG: - if (opcode == Opcode::AArch64_CSNEGWr || - opcode == Opcode::AArch64_CSNEGXr) { - // cneg rd, rn, cc; alias for: csneg rd, rn, rn, invert(cc) - operandCount = 3; - operands[2] = operands[1]; - cc ^= 1; // invert lowest bit to negate cc - return; - } - return aliasNYI(); - case ARM64_INS_CSET: - if (opcode == Opcode::AArch64_CSINCWr || - opcode == Opcode::AArch64_CSINCXr) { - // cset rd, cc; alias for: csinc rd, zr, zr, invert(cc) - operandCount = 3; - - operands[1].type = ARM64_OP_REG; - operands[1].access = CS_AC_READ; - - operands[2].type = ARM64_OP_REG; - operands[2].access = CS_AC_READ; - - if (opcode == Opcode::AArch64_CSINCWr) { - operands[1].reg = ARM64_REG_WZR; - operands[2].reg = ARM64_REG_WZR; - } else { - operands[1].reg = ARM64_REG_XZR; - operands[2].reg = ARM64_REG_XZR; - } - - cc ^= 1; // invert lowest bit to negate cc - - return; - } - return aliasNYI(); - case ARM64_INS_CSETM: - if (opcode == Opcode::AArch64_CSINVWr || - opcode == Opcode::AArch64_CSINVXr) { - // csetm rd, cc; alias for: csinv rd, zr, zr, invert(cc) - operandCount = 3; - - operands[1].type = ARM64_OP_REG; - operands[1].access = CS_AC_READ; - - operands[2].type = ARM64_OP_REG; - operands[2].access = CS_AC_READ; - - if (opcode == Opcode::AArch64_CSINVWr) { - operands[1].reg = ARM64_REG_WZR; - operands[2].reg = ARM64_REG_WZR; - } else { - operands[1].reg = ARM64_REG_XZR; - operands[2].reg = ARM64_REG_XZR; - } - - cc ^= 1; // invert lowest bit to negate cc - - return; - } - return aliasNYI(); - case ARM64_INS_DC: - return aliasNYI(); - case ARM64_INS_IC: - return aliasNYI(); - case ARM64_INS_LSL: - if (opcode == Opcode::AArch64_UBFMWri || - opcode == Opcode::AArch64_UBFMXri) { - // lsl rd, rn, #shift; alias for: - // ubfm rd, rn, #(-shift MOD <32|64>), #(<31|63> - shift) - operandCount = 4; - uint8_t highestBit = 63; - if (opcode == Opcode::AArch64_UBFMWri) { - highestBit = 31; - } - - auto shift = operands[2].imm; - operands[2].imm = (-shift) & highestBit; - operands[3].type = ARM64_OP_IMM; - operands[3].imm = highestBit - shift; - operands[3].access = CS_AC_READ; - return; - } 
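// A quick standalone check of the LSL -> UBFM immediate arithmetic handled
// just above (illustrative only; not SimEng code). For "lsl w0, w1, #4" the
// alias resolves to "ubfm w0, w1, #28, #27", since immr = (-4) MOD 32 = 28
// and imms = 31 - 4 = 27; the 64-bit form uses 63 in place of 31.
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t highestBit = 31;               // 63 for the X-register form
  const int64_t shift = 4;                     // the #shift printed by Capstone
  const int64_t immr = (-shift) & highestBit;  // (-shift) MOD 32
  const int64_t imms = highestBit - shift;
  assert(immr == 28 && imms == 27);
  return 0;
}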
- if (opcode == Opcode::AArch64_LSLVWr || - opcode == Opcode::AArch64_LSLVXr || - opcode == Opcode::AArch64_LSL_ZZI_S) { - return; - } - return aliasNYI(); - case ARM64_INS_LSR: - if (opcode == Opcode::AArch64_LSRVWr || - opcode == Opcode::AArch64_LSRVXr) { - // lsr rd, rn, rm; alias for lsrv rd, rn, rm - return; - } - if (opcode == Opcode::AArch64_UBFMWri || - opcode == Opcode::AArch64_UBFMXri) { - // lsr rd, rn, #amount; alias for ubfm rd, rn, #amount, #<31|63> - operandCount = 4; - - operands[3].type = ARM64_OP_IMM; - operands[3].access = CS_AC_READ; - - if (opcode == Opcode::AArch64_UBFMWri) { - operands[3].imm = 31; - } else { - operands[3].imm = 63; - } - return; - } - return aliasNYI(); - case ARM64_INS_MNEG: - if (opcode == Opcode::AArch64_MSUBXrrr) { - // mneg xd, xn, xm; alias for msub xd, xn, xm, xzr - operandCount = 4; - operands[3].type = ARM64_OP_REG; - operands[3].access = CS_AC_READ; - operands[3].reg = ARM64_REG_XZR; - return; - } - if (opcode == Opcode::AArch64_MSUBWrrr) { - // mneg wd, wn, wm; alias for msub wd, wn, wm, wzr - operandCount = 4; - operands[3].type = ARM64_OP_REG; - operands[3].access = CS_AC_READ; - operands[3].reg = ARM64_REG_WZR; - return; - } - return aliasNYI(); - case ARM64_INS_MOV: - if (opcode == Opcode::AArch64_AND_PPzPP) { - // mov pd.b, pg/z, pn.b; alias for: and pd.b, pg/z, pn.b, pn.b - operandCount = 4; - operands[3] = operands[2]; - return; - } - if (opcode == Opcode::AArch64_ADDXri || - opcode == Opcode::AArch64_ADDWri) { - // mov to/from sp; alias for: add , , #0 - operandCount = 3; - operands[2].type = ARM64_OP_IMM; - operands[2].imm = 0; - operands[2].access = CS_AC_READ; - operands[2].shift.type = ARM64_SFT_INVALID; - operands[2].vas = ARM64_VAS_INVALID; - operands[2].vector_index = -1; - return; - } - if (opcode == Opcode::AArch64_DUPi8 || opcode == Opcode::AArch64_DUPi16 || - opcode == Opcode::AArch64_DUPi32 || - opcode == Opcode::AArch64_DUPi64) { - // mov vd, Vn.T[index]; alias of dup vd, Vn.T[index] - return; - } - if (opcode == Opcode ::AArch64_CPY_ZPzI_B || - opcode == Opcode ::AArch64_CPY_ZPzI_D || - opcode == Opcode ::AArch64_CPY_ZPzI_H || - opcode == Opcode ::AArch64_CPY_ZPzI_S) { - // mov zd.T, pg/z, #imm{, shift}; alias of cpy zd.T, pg/z, #imm{, - // shift} - operandCount = 3; - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].type = ARM64_OP_IMM; - operands[2].access = CS_AC_READ; - - // get imm value - std::string tmpOpStr(operandStr.substr(operandStr.find("#") + 1)); - auto value = std::stoi(tmpOpStr, 0, 16); - operands[2].imm = tmpOpStr.length() == 4 ? static_cast(value) - : static_cast(value); - return; - } - if (opcode == Opcode::AArch64_DUPM_ZI || - opcode == Opcode::AArch64_DUP_ZI_B || - opcode == Opcode::AArch64_DUP_ZI_D || - opcode == Opcode::AArch64_DUP_ZI_H || - opcode == Opcode::AArch64_DUP_ZI_S) { - // mov Zd.T, #imm; alias for dupm Zd.T, #imm - // or - // mov Zd.T, #imm{, shift}; alias for dup Zd.T, #imm{, shift} - operandCount = 2; - operands[0].access = CS_AC_WRITE; - operands[1].type = ARM64_OP_IMM; - operands[1].access = CS_AC_READ; - - uint8_t start = operandStr[6] == '#' ? 
7 : 8; - - if (opcode == Opcode::AArch64_DUPM_ZI) { - char specifier = operandStr[start - 4]; - switch (specifier) { - case 'b': - operands[0].vas = ARM64_VAS_1B; - break; - case 'h': - operands[0].vas = ARM64_VAS_1H; - break; - case 's': - operands[0].vas = ARM64_VAS_1S; - break; - case 'd': - operands[0].vas = ARM64_VAS_1D; - break; - - default: - break; - } - } - - bool hex = false; - if (operandStr[start + 1] == 'x') { - hex = true; - start += 2; - } - - uint8_t end = start + 1; - while (true) { - if (operandStr[end] < '0') { - break; - } - end++; - } - - std::string sub = operandStr.substr(start, end); - if (hex) { - operands[1].imm = std::stoul(sub, 0, 16); - } else { - operands[1].imm = stoi(sub); - } - - return; - } - if (opcode == Opcode::AArch64_DUP_ZR_S || - opcode == Opcode::AArch64_DUP_ZR_D || - opcode == Opcode::AArch64_DUP_ZR_B || - opcode == Opcode::AArch64_DUP_ZR_H) { - // mov Zd.T, ; alias for dup Zd.T, - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - - char specifier = operandStr[operandStr.find(".") + 1]; - switch (specifier) { - case 'b': - operands[0].vas = ARM64_VAS_1B; - break; - case 'h': - operands[0].vas = ARM64_VAS_1H; - break; - case 's': - operands[0].vas = ARM64_VAS_1S; - break; - case 'd': - operands[0].vas = ARM64_VAS_1D; - break; - - default: - break; - } - return; - } - if (opcode == Opcode::AArch64_DUP_ZZI_S || - opcode == Opcode::AArch64_DUP_ZZI_D || - opcode == Opcode::AArch64_DUP_ZZI_Q) { - // mov Zd.T, Vn; alias for dup Zd.T, Zn.T[0] - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - - uint8_t start = operandStr[2] == '.' ? 7 : 8; - uint8_t end = operandStr.length() - start; - - operands[1].reg = static_cast( - ARM64_REG_Z0 + stoi(operandStr.substr(start, end))); - operands[1].vector_index = 0; - return; - } - if (opcode == Opcode::AArch64_INSvi32lane || - opcode == Opcode::AArch64_INSvi64lane) { - // mov vd.T[index1], vn.T[index2]; alias for ins vd.T[index1], - // vn.T[index2] - return; - } - if (opcode == Opcode::AArch64_ORRv8i8) { - // mov vd, vn; alias for orr vd.t, vn.t, vn.t - operandCount = 3; - - operands[2] = operands[1]; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - return; - } - if (opcode == Opcode::AArch64_ORRWri || - opcode == Opcode::AArch64_ORRWrs || - opcode == Opcode::AArch64_ORRXri || - opcode == Opcode::AArch64_ORRXrs) { - // mov rd, rn; alias for: orr rd, zr, rn - operandCount = 3; - operands[2] = operands[1]; - - operands[1].type = ARM64_OP_REG; - operands[1].access = CS_AC_READ; - operands[1].shift = {ARM64_SFT_INVALID, 0}; - if (opcode == Opcode::AArch64_ORRWri || - opcode == Opcode::AArch64_ORRWrs) { - operands[1].reg = ARM64_REG_WZR; - } else { - operands[1].reg = ARM64_REG_XZR; - } - return; - } - if (opcode == Opcode::AArch64_ORR_PPzPP) { - // mov Pd.b, Pn.b; alias for: orr Pd.b, Pn/z, Pn.b, Pn.b - operandCount = 4; - operands[0].access = CS_AC_WRITE; - operands[0].vas = ARM64_VAS_1B; - operands[1].access = CS_AC_READ; - operands[1].vas = ARM64_VAS_1B; - operands[2] = operands[1]; - operands[3] = operands[1]; - return; - } - if (opcode == Opcode::AArch64_ORR_ZZZ) { - // mov Zd.d, Zn.d; alias for: orr Zd.d, Zn.d, Zn.d - operandCount = 3; - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2] = operands[1]; - return; - } - if (opcode == Opcode::AArch64_ORRv16i8) { - // mov Vd.16b, Vn.16b; alias for: orr Vd.16b, Vn.16b, Vn.16b - operandCount = 3; - operands[2] = operands[1]; - return; - } - if (opcode == 
Opcode::AArch64_SEL_ZPZZ_S || - opcode == Opcode::AArch64_SEL_ZPZZ_D) { - // mov Zd.T, Pg/M, Zn.T; alias for: sel Zd.T, Pg, Zn.T, Zd.T - if (mnemonic[0] == 'm') { - // SEL instructions id sometimes set as ARM64_INS_MOV even if - // aliasing hasn't occured so double check mnemoic is MOV alias - operandCount = 4; - operands[3] = operands[0]; - operands[3].access = CS_AC_READ; - } - return; - } - if (opcode == Opcode::AArch64_UMOVvi8 || - opcode == Opcode::AArch64_UMOVvi16 || - opcode == Opcode::AArch64_UMOVvi32 || - opcode == Opcode::AArch64_UMOVvi64) { - // mov rd, Vn.T[index]; alias for umov rd, Vn.T[index] - return; - } - if (opcode == Opcode::AArch64_MOVZWi || - opcode == Opcode::AArch64_MOVZXi) { - // mov rd, #0; alias for: movz rd, #0{, shift #0} - operands[1].access = CS_AC_READ; - operands[1].shift = {ARM64_SFT_LSL, 0}; - return; - } - if (opcode == Opcode::AArch64_MOVNWi || - opcode == Opcode::AArch64_MOVNXi) { - // mov rd, #amount; alias for: movn rd, #amount{, shift #0} - operands[1].access = CS_AC_READ; - operands[1].shift = {ARM64_SFT_LSL, 0}; - operands[1].imm = ~(operands[1].imm); - return; - } - if (opcode == Opcode::AArch64_INSvi8gpr || - opcode == Opcode::AArch64_INSvi16gpr || - opcode == Opcode::AArch64_INSvi32gpr || - opcode == Opcode::AArch64_INSvi64gpr) { - // mov vd.ts[index], rn; alias for: ins vd.ts[index], rn - return; - } - if (opcode == Opcode::AArch64_UMOVvi32_idx0 || - opcode == Opcode::AArch64_UMOVvi64_idx0) { - // mov wd, vn.t[0]; alias for: umov wd, vn.t[0] - return; - } - return aliasNYI(); - case ARM64_INS_MUL: - if (opcode == Opcode::AArch64_MADDXrrr || - opcode == Opcode::AArch64_MADDWrrr) { - operandCount = 4; - operands[3].type = ARM64_OP_REG; - operands[3].access = CS_AC_READ; - if (opcode == Opcode::AArch64_MADDWrrr) { - operands[3].reg = ARM64_REG_WZR; - } else { - operands[3].reg = ARM64_REG_XZR; - } - return; - } - if (opcode == Opcode::AArch64_MUL_ZPmZ_B || - opcode == Opcode::AArch64_MUL_ZPmZ_D || - opcode == Opcode::AArch64_MUL_ZPmZ_H || - opcode == Opcode::AArch64_MUL_ZPmZ_S) { - return; - } - return aliasNYI(); - case ARM64_INS_MVN: - if (opcode == Opcode::AArch64_ORNWrs || - opcode == Opcode::AArch64_ORNXrs) { - // mvn rd, rn; alias for: orn rd, zr, rn - operandCount = 3; - operands[2] = operands[1]; - - operands[1].type = ARM64_OP_REG; - operands[1].access = CS_AC_READ; - operands[1].shift = {ARM64_SFT_INVALID, 0}; - if (opcode == Opcode::AArch64_ORNWrs) { - operands[1].reg = ARM64_REG_WZR; - } else { - operands[1].reg = ARM64_REG_XZR; - } - return; - } - if (opcode == Opcode::AArch64_NOTv16i8 || - opcode == Opcode::AArch64_NOTv8i8) { - // mvn vd.t, vn.t; alias for : not vd.t, vn.t - // Blank entry was for a legitimate alias, however operands were - // identical so nothing to alter between the instructions. - return; - } - return aliasNYI(); - case ARM64_INS_NEG: - if (opcode == Opcode::AArch64_SUBWrs || - opcode == Opcode::AArch64_SUBXrs) { - // neg rd, rm{, shift #amount}; alias for: - // sub rd, zr, rm{, shift #amount} - operandCount = 3; - operands[2] = operands[1]; - - operands[1].type = ARM64_OP_REG; - operands[1].access = CS_AC_READ; - - if (opcode == Opcode::AArch64_SUBWrs) { - operands[1].reg = ARM64_REG_WZR; - } else { - operands[1].reg = ARM64_REG_XZR; - } - return; - } - if (opcode == Opcode::AArch64_NEGv2i64) { - // No alias present, trying to alias self. 
- return; - } - return aliasNYI(); - case ARM64_INS_NEGS: - if (opcode == Opcode::AArch64_SUBSWrs || - opcode == Opcode::AArch64_SUBSXrs) { - // negs rd, rm{, shift #amount}; alias for: - // subs rd, zr, rm{, shift #amount} - operandCount = 3; - operands[2] = operands[1]; - - operands[1].type = ARM64_OP_REG; - operands[1].access = CS_AC_READ; - - if (opcode == Opcode::AArch64_SUBWrs) { - operands[1].reg = ARM64_REG_WZR; - } else { - operands[1].reg = ARM64_REG_XZR; - } - return; - } - return aliasNYI(); - case ARM64_INS_NGC: - return aliasNYI(); - case ARM64_INS_NGCS: - return aliasNYI(); - case ARM64_INS_NOT: - if (opcode == Opcode::AArch64_EOR_PPzPP) { - // not pd.b, pg/z, pn.b; alisas for: eor pd.b, pg/z, pn.b, pg.b - operandCount = 4; - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - operands[2].access = CS_AC_READ; - operands[3] = operands[1]; - return; - } - return aliasNYI(); - case ARM64_INS_REV64: - // rev64 vd.t, vn.t - if (opcode == Opcode::AArch64_REV64v16i8 || - opcode == Opcode::AArch64_REV64v2i32 || - opcode == Opcode::AArch64_REV64v4i16 || - opcode == Opcode::AArch64_REV64v4i32 || - opcode == Opcode::AArch64_REV64v8i16 || - opcode == Opcode::AArch64_REV64v8i8) { - operandCount = 2; - operands[0].access = CS_AC_WRITE; - operands[1].access = CS_AC_READ; - return; - } - return aliasNYI(); - case ARM64_INS_ROR: - if (opcode == Opcode::AArch64_RORVWr || - opcode == Opcode::AArch64_RORVXr) { - // ror wd, wn, wm; alias for : rorv wd, wn, wm - // ror xd, xn, xm; alias for : rorv xd, xn, xm - // Blank entry was for a legitimate alias, however operands were - // identical so nothing to alter between the instructions. - return; - } - return aliasNYI(); - case ARM64_INS_SBFIZ: - if (opcode == Opcode::AArch64_SBFMWri || - opcode == Opcode::AArch64_SBFMXri) { - operands[3].imm -= 1; - - uint8_t highestBit = 63; - if (opcode == Opcode::AArch64_SBFMWri) { - highestBit = 31; - } - - operands[2].imm = (-operands[2].imm) & highestBit; - return; - } - return aliasNYI(); - case ARM64_INS_SBFX: - if (opcode == Opcode::AArch64_SBFMWri || - opcode == Opcode::AArch64_SBFMXri) { - // sbfx rd, rn, #lsb, #width; alias for - // sbfm rd, rn, #lsb, #(lsb + width - 1) - operands[3].imm = operands[2].imm + operands[3].imm - 1; - return; - } - return aliasNYI(); - case ARM64_INS_SMNEGL: - return aliasNYI(); - case ARM64_INS_SMULL: - if (opcode == Opcode::AArch64_SMADDLrrr) { - operandCount = 4; - operands[3].type = ARM64_OP_REG; - operands[3].access = CS_AC_READ; - operands[3].reg = ARM64_REG_XZR; - return; - } - return aliasNYI(); - case ARM64_INS_SXTB: - // sxtb rd, rn; alias for: sbfm rd, rn, #0, #7 - if (opcode == Opcode::AArch64_SBFMWri || - opcode == Opcode::AArch64_SBFMXri) { - operandCount = 4; - - operands[2].type = ARM64_OP_IMM; - operands[2].access = CS_AC_READ; - operands[2].imm = 0; - - operands[3].type = ARM64_OP_IMM; - operands[3].access = CS_AC_READ; - operands[3].imm = 7; - return; - } - return aliasNYI(); - case ARM64_INS_SXTH: - // sxth rd, rn; alias for: sbfm rd, rn, #0, #15 - if (opcode == Opcode::AArch64_SBFMWri || - opcode == Opcode::AArch64_SBFMXri) { - operandCount = 4; - - operands[2].type = ARM64_OP_IMM; - operands[2].access = CS_AC_READ; - operands[2].imm = 0; - - operands[3].type = ARM64_OP_IMM; - operands[3].access = CS_AC_READ; - operands[3].imm = 15; - return; - } - return aliasNYI(); - case ARM64_INS_SXTW: - // sxtw rd, rn; alias for: sbfm rd, rn, #0, #31 - if (opcode == Opcode::AArch64_SBFMXri) { - operandCount = 4; - - operands[2].type = 
ARM64_OP_IMM; - operands[2].access = CS_AC_READ; - operands[2].imm = 0; - - operands[3].type = ARM64_OP_IMM; - operands[3].access = CS_AC_READ; - operands[3].imm = 31; - return; - } - if (opcode == Opcode::AArch64_SXTW_ZPmZ_D) { - return; - } - return aliasNYI(); - case ARM64_INS_SYS: { - // Extract IC/DC/AT/TLBI operation - if (std::string(mnemonic) == "dc") { - if (operandStr.substr(0, 3) == "zva") { - id = ARM64_INS_DC; - operandCount = 3; - operands[1] = operands[0]; - operands[1].access = CS_AC_READ; - operands[0].type = ARM64_OP_SYS; - operands[0].sys = ARM64_DC_ZVA; - operands[2].type = ARM64_OP_REG_MRS; - operands[2].access = CS_AC_READ; - operands[2].imm = ARM64_SYSREG_DCZID_EL0; - return; - } - } - return aliasNYI(); - } - case ARM64_INS_TLBI: - return aliasNYI(); - case ARM64_INS_TST: - if (opcode == Opcode::AArch64_ANDSWrs || - opcode == Opcode::AArch64_ANDSXrs || - opcode == Opcode::AArch64_ANDSWri || - opcode == Opcode::AArch64_ANDSXri) { - // tst rn, rm; alias for: ands zr, rn, rm - // tst rn, #imm; alias for: ands zr, rn, #imm - operandCount = 3; - operands[2] = operands[1]; - operands[1] = operands[0]; - operands[1].access = CS_AC_READ; - - operands[0].type = ARM64_OP_REG; - operands[0].access = CS_AC_WRITE; - if (opcode == Opcode::AArch64_ANDSWrs || - opcode == Opcode::AArch64_ANDSWri) { - operands[0].reg = ARM64_REG_WZR; - } else { - operands[0].reg = ARM64_REG_XZR; - } - return; - } - return aliasNYI(); - case ARM64_INS_UBFIZ: - if (opcode == Opcode::AArch64_UBFMWri || - opcode == Opcode::AArch64_UBFMXri) { - operands[3].imm -= 1; - - uint8_t highestBit = 63; - if (opcode == Opcode::AArch64_UBFMWri) { - highestBit = 31; - } - - operands[2].imm = (-operands[2].imm) & highestBit; - return; - } - return aliasNYI(); - case ARM64_INS_UBFX: - if (opcode == Opcode::AArch64_UBFMWri || - opcode == Opcode::AArch64_UBFMXri) { - // ubfx rd, rn, #lsb, #width; alias for - // ubfm rd, rn, #lsb, #(lsb + width - 1) - operands[3].imm = operands[2].imm + operands[3].imm - 1; - return; - } - return aliasNYI(); - case ARM64_INS_UMNEGL: - return aliasNYI(); - case ARM64_INS_UMULL: - // umull xd, wn, wm; alias for: umaddl xd, wn, wm, xzr - if (opcode == Opcode::AArch64_UMADDLrrr) { - operandCount = 4; - operands[3].type = ARM64_OP_REG; - operands[3].access = CS_AC_READ; - operands[3].reg = ARM64_REG_XZR; - return; - } - return aliasNYI(); - case ARM64_INS_UXTB: - // uxtb wd, wn; alias for: ubfm wd, wn, #0, #7 - if (opcode == Opcode::AArch64_UBFMWri) { - operandCount = 4; - operands[2].type = ARM64_OP_IMM; - operands[2].access = CS_AC_READ; - operands[2].imm = 0; - operands[3].type = ARM64_OP_IMM; - operands[3].access = CS_AC_READ; - operands[3].imm = 7; - return; - } - return aliasNYI(); - case ARM64_INS_UXTH: - return aliasNYI(); - } -} - -void InstructionMetadata::aliasNYI() { id = ARM64_INS_INVALID; } - } // namespace aarch64 } // namespace arch -} // namespace simeng \ No newline at end of file +} // namespace simeng diff --git a/src/lib/arch/aarch64/InstructionMetadata.hh b/src/lib/arch/aarch64/InstructionMetadata.hh index 0641e7aa26..d905d93e4d 100644 --- a/src/lib/arch/aarch64/InstructionMetadata.hh +++ b/src/lib/arch/aarch64/InstructionMetadata.hh @@ -25,6 +25,19 @@ struct InstructionMetadata { /** Constructs an invalid metadata object containing the invalid encoding. 
*/ InstructionMetadata(const uint8_t* invalidEncoding, uint8_t bytes = 4); + /** Returns the current exception state of the metadata. */ + InstructionException getMetadataException() const { + return metadataException_; + } + + /** Returns whether an exception has been encountered. */ + bool getMetadataExceptionEncountered() const { + return metadataExceptionEncountered_; + } + + /** Returns extra information about the exception. */ + std::string getExceptionString() const { return exceptionString_; } + /** The maximum operand string length as defined in Capstone */ static const size_t MAX_OPERAND_STR_LENGTH = sizeof(cs_insn::op_str) / sizeof(char); @@ -40,7 +53,7 @@ struct InstructionMetadata { static const size_t MAX_GROUPS = sizeof(cs_detail::groups) / sizeof(uint8_t); /** The maximum number of operands as defined in Capstone */ static const size_t MAX_OPERANDS = - sizeof(cs_arm64::operands) / sizeof(cs_arm64_op); + sizeof(cs_aarch64::operands) / sizeof(cs_aarch64_op); /** The instruction's mnemonic ID. */ unsigned int id; @@ -52,7 +65,7 @@ struct InstructionMetadata { uint8_t encoding[4]; /** The instruction's mnemonic. */ - char mnemonic[CS_MNEMONIC_SIZE]; + std::string mnemonic; /** The remainder of the instruction's assembly representation. */ std::string operandStr; @@ -75,22 +88,25 @@ struct InstructionMetadata { uint8_t cc; /** Whether this instruction sets the condition flags. */ bool setsFlags; - /** Whether this instruction performs a base-address register writeback - * operation. */ - bool writeback; + + /** Whether this instruction is an alias. */ + bool isAlias; /** The explicit operands. */ - cs_arm64_op operands[MAX_OPERANDS]; + cs_aarch64_op operands[MAX_OPERANDS]; + /** The number of explicit operands. */ uint8_t operandCount; private: - /** Detect instruction aliases and update metadata to match the de-aliased - * instruction. */ - void revertAliasing(); + /** The current exception state of this instruction. */ + InstructionException metadataException_ = InstructionException::None; + + /** Whether an exception has been encountered. */ + bool metadataExceptionEncountered_ = false; - /** Flag the instruction as invalid due to a detected unsupported alias.
*/ - void aliasNYI(); + /** Additional information to print to the user */ + std::string exceptionString_ = ""; }; } // namespace aarch64 diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 45c108739e..ec4f269a8f 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -2,6 +2,7 @@ #include #include "InstructionMetadata.hh" +#include "simeng/arch/aarch64/helpers/auxiliaryFunctions.hh" namespace simeng { namespace arch { @@ -9,7 +10,7 @@ namespace aarch64 { void generateContiguousAddresses( uint64_t baseAddr, uint16_t numVecElems, uint8_t size, - std::vector& addresses) { + std::vector& addresses) { for (uint16_t i = 0; i < numVecElems; i++) { addresses.push_back({baseAddr + (i * size), size}); } @@ -17,7 +18,8 @@ void generateContiguousAddresses( void generatePredicatedContiguousAddressBlocks( uint64_t baseAddr, uint16_t numVecElems, uint8_t elemSize, uint8_t predSize, - const uint64_t* pred, std::vector& addresses) { + const uint64_t* pred, + std::vector& addresses) { bool recordingBlock = false; uint64_t currAddr = 0; uint16_t currSize = 0; @@ -44,25 +46,25 @@ void generatePredicatedContiguousAddressBlocks( if (recordingBlock) addresses.push_back({currAddr, currSize}); } -span Instruction::generateAddresses() { +span Instruction::generateAddresses() { assert((isLoad() || isStoreAddress()) && "generateAddresses called on non-load-or-store instruction"); if (isMicroOp_) { switch (microOpcode_) { case MicroOpcode::LDR_ADDR: { - std::vector addresses; + std::vector addresses; generateContiguousAddresses( - operands[0].get() + metadata.operands[1].mem.disp, 1, - dataSize_, addresses); + sourceValues_[0].get() + metadata_.operands[1].mem.disp, + 1, dataSize_, addresses); setMemoryAddresses(addresses); break; } case MicroOpcode::STR_ADDR: { - std::vector addresses; + std::vector addresses; generateContiguousAddresses( - operands[0].get() + metadata.operands[0].mem.disp, 1, - dataSize_, addresses); + sourceValues_[0].get() + metadata_.operands[0].mem.disp, + 1, dataSize_, addresses); setMemoryAddresses(addresses); break; @@ -73,54 +75,116 @@ span Instruction::generateAddresses() { break; } } else { - // 0th bit of SVCR register determins if streaming-mode is enabled. + // 0th bit of SVCR register determines if streaming-mode is enabled. const bool SMenabled = architecture_.getSVCRval() & 1; // When streaming mode is enabled, the architectural vector length goes from // SVE's VL to SME's SVL. const uint16_t VL_bits = SMenabled ? 
architecture_.getStreamingVectorLength() : architecture_.getVectorLength(); - switch (metadata.opcode) { + switch (metadata_.opcode) { case Opcode::AArch64_CASALW: { // casal ws, wt, [xn|sp] - setMemoryAddresses({{operands[2].get(), 4}}); + setMemoryAddresses({{sourceValues_[2].get(), 4}}); break; } case Opcode::AArch64_CASALX: { // casal xs, xt, [xn|sp] - setMemoryAddresses({{operands[2].get(), 8}}); + setMemoryAddresses({{sourceValues_[2].get(), 8}}); break; } - case Opcode::AArch64_LD1_MXIPXX_V_S: // ld1w {zatv.s[ws, #imm]}, pg/z, - // [{, xm, LSL #2}] + case Opcode::AArch64_LD1_MXIPXX_V_B: // ld1b {zatv.b[ws, #imm]}, pg/z, + // [{, xm}] + // SME + [[fallthrough]]; + case Opcode::AArch64_LD1_MXIPXX_H_B: { // ld1b {zath.b[ws, #imm]}, pg/z, + // [{, xm}] + // SME + const uint16_t partition_num = VL_bits / 8; + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get(); + setMemoryAddresses({(n + m), static_cast(VL_bits / 8)}); + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_D: // ld1d {zatv.d[ws, #imm]}, pg/z, + // [{, xm, lsl #3}] + // SME + [[fallthrough]]; + case Opcode::AArch64_LD1_MXIPXX_H_D: { // ld1d {zath.d[ws, #imm]}, pg/z, + // [{, xm, lsl #3}] + // SME + const uint16_t partition_num = VL_bits / 64; + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index != AARCH64_REG_INVALID) + m = sourceValues_[partition_num + 3].get() << 3; + setMemoryAddresses({(n + m), static_cast(VL_bits / 8)}); + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_H: // ld1h {zatv.h[ws, #imm]}, pg/z, + // [{, xm, lsl #1}] + // SME + [[fallthrough]]; + case Opcode::AArch64_LD1_MXIPXX_H_H: { // ld1h {zath.h[ws, #imm]}, pg/z, + // [{, xm, lsl #1}] + // SME + const uint16_t partition_num = VL_bits / 16; + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get() << 1; + setMemoryAddresses({(n + m), static_cast(VL_bits / 8)}); + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_Q: // ld1q {zatv.q[ws]}, pg/z, + // [{, xm, lsl #4}] + // SME + [[fallthrough]]; + case Opcode::AArch64_LD1_MXIPXX_H_Q: { // ld1q {zath.q[ws]}, pg/z, + // [{, xm, lsl #4}] + // SME + const uint16_t partition_num = VL_bits / 128; + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get() << 4; + setMemoryAddresses({(n + m), static_cast(VL_bits / 8)}); + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_S: // ld1w {zatv.s[ws, #imm]}, pg/z, + // [{, xm, LSL #2}] + // SME + [[fallthrough]]; case Opcode::AArch64_LD1_MXIPXX_H_S: { // ld1w {zath.s[ws, #imm]}, pg/z, // [{, xm, LSL #2}] // SME const uint16_t partition_num = VL_bits / 32; - const uint64_t n = operands[partition_num + 2].get(); + const uint64_t n = sourceValues_[partition_num + 2].get(); uint64_t m = 0; - if (metadata.operands[2].mem.index) - m = operands[partition_num + 3].get() << 2; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get() << 2; setMemoryAddresses({(n + m), static_cast(VL_bits / 8)}); break; } case Opcode::AArch64_LD1i32: { // ld1 {vt.s}[index], [xn] - setMemoryAddresses({{operands[1].get(), 4}}); + setMemoryAddresses({{sourceValues_[1].get(), 4}}); break; } case Opcode::AArch64_LD1i64: { // ld1 {vt.d}[index], [xn] - setMemoryAddresses({{operands[1].get(), 8}}); + 
setMemoryAddresses({{sourceValues_[1].get(), 8}}); break; } case Opcode::AArch64_LD1i64_POST: { // ld1 {vt.d}[index], [xn], #8 - setMemoryAddresses({{operands[1].get(), 8}}); + setMemoryAddresses({{sourceValues_[1].get(), 8}}); break; } case Opcode::AArch64_LD1RD_IMM: { // ld1rd {zt.d}, pg/z, [xn, #imm] - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); for (int i = 0; i < 4; i++) { if (p[i] != 0) { - setMemoryAddresses( - {{operands[1].get() + metadata.operands[2].mem.disp, - 8}}); + setMemoryAddresses({{sourceValues_[1].get() + + metadata_.operands[2].mem.disp, + 8}}); break; } } @@ -128,161 +192,206 @@ span Instruction::generateAddresses() { } case Opcode::AArch64_LD1RQ_D_IMM: { // ld1rqd {zd.d}, pg/z, [xn{, #imm}] uint64_t addr = - operands[1].get() + metadata.operands[2].mem.disp; + sourceValues_[1].get() + metadata_.operands[2].mem.disp; + setMemoryAddresses({addr, static_cast(16)}); + break; + } + case Opcode::AArch64_LD1RQ_W: { // ld1rqw {zd.s}, pg/z, [xn, xm, lsl #2] + uint64_t addr = sourceValues_[1].get() + + (sourceValues_[2].get() * 4); setMemoryAddresses({addr, static_cast(16)}); break; } case Opcode::AArch64_LD1RQ_W_IMM: { // ld1rqw {zd.s}, pg/z, [xn{, #imm}] uint64_t addr = - operands[1].get() + metadata.operands[2].mem.disp; + sourceValues_[1].get() + metadata_.operands[2].mem.disp; setMemoryAddresses({addr, static_cast(16)}); break; } case Opcode::AArch64_LD1RW_IMM: { // ld1rw {zt.s}, pg/z, [xn, #imm] - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); for (int i = 0; i < 4; i++) { if (p[i] != 0) { - setMemoryAddresses( - {{operands[1].get() + metadata.operands[2].mem.disp, - 4}}); + setMemoryAddresses({{sourceValues_[1].get() + + metadata_.operands[2].mem.disp, + 4}}); break; } } break; } case Opcode::AArch64_LD1Rv16b: { // ld1r {vt.16b}, [xn] - setMemoryAddresses({{operands[0].get(), 16}}); + setMemoryAddresses({{sourceValues_[0].get(), 16}}); break; } case Opcode::AArch64_LD1Rv16b_POST: { // ld1r {vt.16b}, [xn], #imm - setMemoryAddresses({{operands[0].get(), 16}}); + setMemoryAddresses({{sourceValues_[0].get(), 16}}); break; } case Opcode::AArch64_LD1Rv1d: { // ld1r {vt.1d}, [xn] - setMemoryAddresses({{operands[0].get(), 8}}); + setMemoryAddresses({{sourceValues_[0].get(), 8}}); break; } case Opcode::AArch64_LD1Rv1d_POST: { // ld1r {vt.1d}, [xn], #imm - setMemoryAddresses({{operands[0].get(), 8}}); + setMemoryAddresses({{sourceValues_[0].get(), 8}}); break; } case Opcode::AArch64_LD1Rv2d: { // ld1r {vt.2d}, [xn] - setMemoryAddresses({{operands[0].get(), 16}}); + setMemoryAddresses({{sourceValues_[0].get(), 16}}); break; } case Opcode::AArch64_LD1Rv2d_POST: { // ld1r {vt.2d}, [xn], #imm - setMemoryAddresses({{operands[0].get(), 16}}); + setMemoryAddresses({{sourceValues_[0].get(), 16}}); break; } case Opcode::AArch64_LD1Rv2s: { // ld1r {vt.2s}, [xn] - setMemoryAddresses({{operands[0].get(), 16}}); + setMemoryAddresses({{sourceValues_[0].get(), 16}}); break; } case Opcode::AArch64_LD1Rv2s_POST: { // ld1r {vt.2s}, [xn], #imm - setMemoryAddresses({{operands[0].get(), 8}}); + setMemoryAddresses({{sourceValues_[0].get(), 8}}); break; } case Opcode::AArch64_LD1Rv4h: { // ld1r {vt.4h}, [xn] - setMemoryAddresses({{operands[0].get(), 8}}); + setMemoryAddresses({{sourceValues_[0].get(), 8}}); break; } case Opcode::AArch64_LD1Rv4h_POST: { // ld1r {vt.4h}, [xn], #imm - setMemoryAddresses({{operands[0].get(), 8}}); + setMemoryAddresses({{sourceValues_[0].get(), 8}}); break; } case 
Opcode::AArch64_LD1Rv8b: { // ld1r {vt.8b}, [xn] - setMemoryAddresses({{operands[0].get(), 8}}); + setMemoryAddresses({{sourceValues_[0].get(), 8}}); break; } case Opcode::AArch64_LD1Rv8b_POST: { // ld1r {vt.8b}, [xn], #imm - setMemoryAddresses({{operands[0].get(), 8}}); + setMemoryAddresses({{sourceValues_[0].get(), 8}}); break; } case Opcode::AArch64_LD1Rv8h: { // ld1r {vt.8h}, [xn] - setMemoryAddresses({{operands[0].get(), 16}}); + setMemoryAddresses({{sourceValues_[0].get(), 16}}); break; } case Opcode::AArch64_LD1Rv8h_POST: { // ld1r {vt.8h}, [xn], #imm - setMemoryAddresses({{operands[0].get(), 16}}); + setMemoryAddresses({{sourceValues_[0].get(), 16}}); break; } case Opcode::AArch64_LD1Rv4s: { // ld1r {vt.4s}, [xn] - setMemoryAddresses({{operands[0].get(), 16}}); + setMemoryAddresses({{sourceValues_[0].get(), 16}}); break; } case Opcode::AArch64_LD1Rv4s_POST: { // ld1r {vt.4s}, [xn], #imm - setMemoryAddresses({{operands[0].get(), 16}}); + setMemoryAddresses({{sourceValues_[0].get(), 16}}); break; } case Opcode::AArch64_LD1Onev16b: { // ld1 {vt.16b}, [xn] - setMemoryAddresses({{operands[0].get(), 16}}); + setMemoryAddresses({{sourceValues_[0].get(), 16}}); break; } case Opcode::AArch64_LD1Onev16b_POST: { // ld1 {vt.16b}, [xn], #imm - setMemoryAddresses({{operands[0].get(), 16}}); + setMemoryAddresses({{sourceValues_[0].get(), 16}}); break; } - case Opcode::AArch64_LD1Twov16b: { // ld1 {vt1.16b, vt2.16b}, [xn] - uint64_t base = operands[0].get(); - setMemoryAddresses({{base, 16}, {base + 16, 16}}); + case Opcode::AArch64_LD1Fourv16b: // ld1 {vt1.16b, vt2.16b, vt3.16b, + // vt4.16b}, [xn] + [[fallthrough]]; + case Opcode::AArch64_LD1Fourv16b_POST: // ld1 {vt1.16b, vt2.16b, vt3.16b, + // vt4.16b}, [xn], <#imm|xm> + [[fallthrough]]; + case Opcode::AArch64_LD1Fourv2d: // ld1 {vt1.2d, vt2.2d, vt3.2d, vt4.2d}, + // [xn] + [[fallthrough]]; + case Opcode::AArch64_LD1Fourv2d_POST: // ld1 {vt1.2d, vt2.2d, vt3.2d, + // vt4.2d}, [xn], <#imm|xm> + [[fallthrough]]; + case Opcode::AArch64_LD1Fourv4s: // ld1 {vt1.4s, vt2.4s, vt3.4s, vt4.4s}, + // [xn] + [[fallthrough]]; + case Opcode::AArch64_LD1Fourv4s_POST: { // ld1 {vt1.4s, vt2.4s, vt3.4s, + // vt4.4s}, [xn], <#imm|xm> + uint64_t base = sourceValues_[0].get(); + setMemoryAddresses( + {{base, 16}, {base + 16, 16}, {base + 32, 16}, {base + 48, 16}}); break; } - case Opcode::AArch64_LD1Twov16b_POST: { // ld1 {vt1.16b, vt2.16b}, [xn], - // #imm - uint64_t base = operands[0].get(); + case Opcode::AArch64_LD1Twov16b: // ld1 {vt1.16b, vt2.16b}, [xn] + [[fallthrough]]; + case Opcode::AArch64_LD1Twov16b_POST: // ld1 {vt1.16b, vt2.16b}, [xn], + // <#imm|xm> + [[fallthrough]]; + case Opcode::AArch64_LD1Twov2d: // ld1 {vt1.2d, vt2.2d}, [xn] + [[fallthrough]]; + case Opcode::AArch64_LD1Twov2d_POST: // ld1 {vt1.2d, vt2.2d}, [xn], + // <#imm|xm> + [[fallthrough]]; + case Opcode::AArch64_LD1Twov4s: // ld1 {vt1.4s, vt2.4s}, [xn] + [[fallthrough]]; + case Opcode::AArch64_LD1Twov4s_POST: { // ld1 {vt1.4s, vt2.4s}, [xn], + // <#imm|xm> + uint64_t base = sourceValues_[0].get(); setMemoryAddresses({{base, 16}, {base + 16, 16}}); break; } case Opcode::AArch64_LD1B: { // ld1b {zt.b}, pg/z, [xn, xm] - const uint64_t base = operands[1].get(); - const uint64_t offset = operands[2].get(); + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); setMemoryAddresses({base + offset, static_cast(VL_bits / 8)}); break; } + case Opcode::AArch64_LD1B_IMM: { // ld1b {zt.b}, pg/z, [xn{, #imm, + // mul vl}] + const uint64_t base = 
sourceValues_[1].get(); + const int64_t offset = + static_cast(metadata_.operands[2].mem.disp); + uint64_t addr = base + (offset * (VL_bits / 8)); + + setMemoryAddresses({addr, static_cast(VL_bits / 8)}); + break; + } case Opcode::AArch64_LD1D: { // ld1d {zt.d}, pg/z, [xn, xm, lsl #3] - const uint64_t base = operands[1].get(); - const uint64_t offset = operands[2].get(); + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); const uint64_t addr = base + (offset * 8); setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } - case Opcode::AArch64_LD1D_IMM_REAL: { // ld1d {zt.d}, pg/z, [xn{, #imm, - // mul vl}] + case Opcode::AArch64_LD1D_IMM: { // ld1d {zt.d}, pg/z, [xn{, #imm, + // mul vl}] const uint16_t partition_num = VL_bits / 64; - const uint64_t base = operands[1].get(); + const uint64_t base = sourceValues_[1].get(); const uint64_t offset = - static_cast(metadata.operands[2].mem.disp); + static_cast(metadata_.operands[2].mem.disp); const uint64_t addr = base + (offset * partition_num * 8); setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } case Opcode::AArch64_LD1H: { // ld1h {zt.h}, pg/z, [xn, xm, lsl #1] - const uint64_t base = operands[1].get(); - const uint64_t offset = operands[2].get(); + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); const uint64_t addr = base + (offset * 2); setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } case Opcode::AArch64_LD1W: { // ld1w {zt.s}, pg/z, [xn, xm, lsl #2] - const uint64_t base = operands[1].get(); - const uint64_t offset = operands[2].get(); + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); const uint64_t addr = base + (offset * 4); setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } - case Opcode::AArch64_LD1W_IMM_REAL: { // ld1w {zt.s}, pg/z, [xn{, #imm, - // mul vl}] + case Opcode::AArch64_LD1W_IMM: { // ld1w {zt.s}, pg/z, [xn{, #imm, + // mul vl}] const uint16_t partition_num = VL_bits / 32; - const uint64_t base = operands[1].get(); - const uint64_t offset = - static_cast(metadata.operands[2].mem.disp); + const uint64_t base = sourceValues_[1].get(); + const int64_t offset = + static_cast(metadata_.operands[2].mem.disp); const uint64_t addr = base + (offset * partition_num * 4); setMemoryAddresses({addr, static_cast(VL_bits / 8)}); @@ -290,9 +399,9 @@ span Instruction::generateAddresses() { } case Opcode::AArch64_LD2D: { // ld2d {zt1.d, zt2.d}, pg/z, [xn|sp, xm, // lsl #3] - const uint64_t base = operands[1].get(); - uint64_t offset = operands[2].get(); - std::vector addresses; + const uint64_t base = sourceValues_[1].get(); + uint64_t offset = sourceValues_[2].get(); + std::vector addresses; addresses.reserve(2); uint64_t addr = base + (offset * 8); @@ -308,10 +417,10 @@ span Instruction::generateAddresses() { // [xn|sp{, #imm, MUL VL}] const uint16_t partition_num = VL_bits / 64; - const uint64_t base = operands[1].get(); - const uint64_t offset = - static_cast(metadata.operands[3].mem.disp); - std::vector addresses; + const uint64_t base = sourceValues_[1].get(); + const int64_t offset = + static_cast(metadata_.operands[3].mem.disp); + std::vector addresses; addresses.reserve(2); uint64_t addr = base + (offset * partition_num * 8); @@ -327,10 +436,10 @@ span Instruction::generateAddresses() { // [xn|sp{, #imm, MUL VL}] const uint16_t partition_num = VL_bits / 64; - const uint64_t base = operands[1].get(); - const uint64_t offset = - 
static_cast(metadata.operands[4].mem.disp); - std::vector addresses; + const uint64_t base = sourceValues_[1].get(); + const int64_t offset = + static_cast(metadata_.operands[4].mem.disp); + std::vector addresses; addresses.reserve(3); uint64_t addr = base + (offset * partition_num * 8); @@ -347,10 +456,10 @@ span Instruction::generateAddresses() { // pg/z, [xn|sp{, #imm, MUL VL}] const uint16_t partition_num = VL_bits / 64; - const uint64_t base = operands[1].get(); + const uint64_t base = sourceValues_[1].get(); const int64_t offset = - static_cast(metadata.operands[5].mem.disp); - std::vector addresses; + static_cast(metadata_.operands[5].mem.disp); + std::vector addresses; addresses.reserve(4); uint64_t addr = base + (offset * partition_num * 8); @@ -369,195 +478,229 @@ span Instruction::generateAddresses() { } case Opcode::AArch64_LD2Twov4s_POST: { // ld2 {vt1.4s, vt2.4s}, [xn], // #imm - const uint64_t base = operands[2].get(); + const uint64_t base = sourceValues_[0].get(); setMemoryAddresses({{base, 16}, {base + 16, 16}}); break; } case Opcode::AArch64_LDADDLW: // ldaddl ws, wt, [xn] [[fallthrough]]; case Opcode::AArch64_LDADDW: { // ldadd ws, wt, [xn] - setMemoryAddresses({{operands[1].get(), 4}}); + setMemoryAddresses({{sourceValues_[1].get(), 4}}); break; } case Opcode::AArch64_LDARB: { // ldarb wt, [xn] - setMemoryAddresses({{operands[0].get(), 1}}); + setMemoryAddresses({{sourceValues_[0].get(), 1}}); break; } case Opcode::AArch64_LDARW: { // ldar wt, [xn] - setMemoryAddresses({{operands[0].get(), 4}}); + setMemoryAddresses({{sourceValues_[0].get(), 4}}); break; } case Opcode::AArch64_LDARX: { // ldar xt, [xn] - setMemoryAddresses({{operands[0].get(), 8}}); + setMemoryAddresses({{sourceValues_[0].get(), 8}}); break; } case Opcode::AArch64_LDAXRW: { // ldaxr wd, [xn] - setMemoryAddresses({{operands[0].get(), 4}}); + setMemoryAddresses({{sourceValues_[0].get(), 4}}); break; } case Opcode::AArch64_LDAXRX: { // ldaxr xd, [xn] - setMemoryAddresses({{operands[0].get(), 8}}); + setMemoryAddresses({{sourceValues_[0].get(), 8}}); + break; + } + case Opcode::AArch64_LDR_ZA: { // ldr za[wv, #imm], [{, #imm, mul + // vl}] + // SME + // ZA Row count === current VL in bytes + const uint16_t zaRowCount = VL_bits / 8; + const uint64_t xn = sourceValues_[zaRowCount + 1].get(); + const uint64_t imm = + static_cast(metadata_.operands[1].mem.disp); + setMemoryAddresses({xn + (imm * zaRowCount), zaRowCount}); break; } case Opcode::AArch64_LDRBBpost: { // ldrb wt, [xn], #imm - setMemoryAddresses({{operands[0].get(), 1}}); + setMemoryAddresses({{sourceValues_[0].get(), 1}}); break; } case Opcode::AArch64_LDRBBpre: { // ldrb wt, [xn, #imm]! 
setMemoryAddresses( - {{operands[0].get() + metadata.operands[1].mem.disp, 1}}); + {{sourceValues_[0].get() + metadata_.operands[1].mem.disp, + 1}}); break; } case Opcode::AArch64_LDRBBroW: { // ldrb wt, // [xn, wm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[1].get(), metadata.operands[1]); - setMemoryAddresses({{operands[0].get() + offset, 1}}); + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 1}}); break; } case Opcode::AArch64_LDRBBroX: { // ldrb wt, // [xn, xm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[1].get(), metadata.operands[1]); - setMemoryAddresses({{operands[0].get() + offset, 1}}); + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 1}}); break; } case Opcode::AArch64_LDRBBui: { // ldrb wt, [xn, #imm] setMemoryAddresses( - {{operands[0].get() + metadata.operands[1].mem.disp, 1}}); + {{sourceValues_[0].get() + metadata_.operands[1].mem.disp, + 1}}); break; } case Opcode::AArch64_LDRDroW: { // ldr dt, [xn, wm{, extend {amount}}] - uint64_t offset = - extendOffset(operands[1].get(), metadata.operands[1]); - setMemoryAddresses({{operands[0].get() + offset, 8}}); + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 8}}); break; } case Opcode::AArch64_LDRDroX: { // ldr dt, [xn, xm{, extend {amount}}] - uint64_t offset = - extendOffset(operands[1].get(), metadata.operands[1]); - setMemoryAddresses({{operands[0].get() + offset, 8}}); - break; - } - case Opcode::AArch64_LDRBui: // ldr bt, [xn, #imm] - case Opcode::AArch64_LDRBpre: // ldr bt, [xn, #imm]! - case Opcode::AArch64_LDRDui: // ldr dt, [xn, #imm] - case Opcode::AArch64_LDRDpre: // ldr dt, [xn, #imm]! - case Opcode::AArch64_LDRHui: // ldr ht, [xn, #imm] - case Opcode::AArch64_LDRHpre: // ldr ht, [xn, #imm]! - case Opcode::AArch64_LDRQui: // ldr qt, [xn, #imm] - case Opcode::AArch64_LDRQpre: // ldr qt, [xn, #imm]! - case Opcode::AArch64_LDRSui: // ldr st, [xn, #imm] - case Opcode::AArch64_LDRSpre: // ldr st, [xn, #imm]! - case Opcode::AArch64_LDRWui: // ldr wt, [xn, #imm] - case Opcode::AArch64_LDRWpre: // ldr wt, [xn, #imm]! - case Opcode::AArch64_LDRXui: // ldr xt, [xn, #imm] + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 8}}); + break; + } + case Opcode::AArch64_LDRBui: // ldr bt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDRBpre: // ldr bt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_LDRDui: // ldr dt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDRDpre: // ldr dt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_LDRHui: // ldr ht, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDRHpre: // ldr ht, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_LDRQui: // ldr qt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDRQpre: // ldr qt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_LDRSui: // ldr st, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDRSpre: // ldr st, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_LDRWui: // ldr wt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDRWpre: // ldr wt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_LDRXui: // ldr xt, [xn, #imm] + [[fallthrough]]; case Opcode::AArch64_LDRXpre: { // ldr xt, [xn, #imm]! 
- std::vector addresses; + std::vector addresses; generateContiguousAddresses( - operands[0].get() + metadata.operands[1].mem.disp, 1, - dataSize_, addresses); + sourceValues_[0].get() + metadata_.operands[1].mem.disp, + 1, dataSize_, addresses); setMemoryAddresses(addresses); break; } - case Opcode::AArch64_LDRBpost: // ldr bt, [xn], #imm - case Opcode::AArch64_LDRDpost: // ldr dt, [xn], #imm - case Opcode::AArch64_LDRHpost: // ldr ht, [xn], #imm - case Opcode::AArch64_LDRQpost: // ldr qt, [xn], #imm - case Opcode::AArch64_LDRSpost: // ldr st, [xn], #imm - case Opcode::AArch64_LDRWpost: // ldr wt, [xn], #imm + case Opcode::AArch64_LDRBpost: // ldr bt, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDRDpost: // ldr dt, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDRHpost: // ldr ht, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDRQpost: // ldr qt, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDRSpost: // ldr st, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDRWpost: // ldr wt, [xn], #imm + [[fallthrough]]; case Opcode::AArch64_LDRXpost: { // ldr xt, [xn], #imm - std::vector addresses; - generateContiguousAddresses(operands[0].get(), 1, dataSize_, - addresses); + std::vector addresses; + generateContiguousAddresses(sourceValues_[0].get(), 1, + dataSize_, addresses); setMemoryAddresses(addresses); break; } case Opcode::AArch64_LDRHHpost: { // ldrh wt, [xn], #imm - setMemoryAddresses({{operands[0].get(), 2}}); + setMemoryAddresses({{sourceValues_[0].get(), 2}}); break; } case Opcode::AArch64_LDRHHpre: { // ldrh wt, [xn, #imm]! setMemoryAddresses( - {{operands[0].get() + metadata.operands[1].mem.disp, 2}}); + {{sourceValues_[0].get() + metadata_.operands[1].mem.disp, + 2}}); break; } case Opcode::AArch64_LDRHHroW: { // ldrh wt, [xn, wm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[1].get(), metadata.operands[1]); - setMemoryAddresses({{operands[0].get() + offset, 2}}); + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 2}}); break; } case Opcode::AArch64_LDRHHroX: { // ldrh wt, [xn, xm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[1].get(), metadata.operands[1]); - setMemoryAddresses({{operands[0].get() + offset, 2}}); + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 2}}); break; } case Opcode::AArch64_LDRHHui: { // ldrh wt, [xn, #imm] setMemoryAddresses( - {{operands[0].get() + metadata.operands[1].mem.disp, 2}}); + {{sourceValues_[0].get() + metadata_.operands[1].mem.disp, + 2}}); break; } case Opcode::AArch64_LDRQroX: { // ldr qt, [xn, xm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[1].get(), metadata.operands[1]); - setMemoryAddresses({{operands[0].get() + offset, 16}}); + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 16}}); break; } case Opcode::AArch64_LDRSroW: { // ldr st, [xn, wm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[1].get(), metadata.operands[1]); - setMemoryAddresses({{operands[0].get() + offset, 4}}); + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 4}}); break; } case Opcode::AArch64_LDRSroX: { // ldr st, [xn, xm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[1].get(), 
metadata.operands[1]); - setMemoryAddresses({{operands[0].get() + offset, 4}}); + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 4}}); break; } case Opcode::AArch64_LDRSWl: { // ldrsw xt, #imm setMemoryAddresses( - {{metadata.operands[1].imm + instructionAddress_, 4}}); + {{metadata_.operands[1].imm + instructionAddress_, 4}}); break; } case Opcode::AArch64_LDRWroW: { // ldr wt, [xn, wm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[1].get(), metadata.operands[1]); - setMemoryAddresses({{operands[0].get() + offset, 4}}); + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 4}}); break; } case Opcode::AArch64_LDRWroX: { // ldr wt, [xn, xm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[1].get(), metadata.operands[1]); - setMemoryAddresses({{operands[0].get() + offset, 4}}); + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 4}}); break; } case Opcode::AArch64_LDRXl: { // ldr xt, #imm setMemoryAddresses( - {{metadata.operands[1].imm + instructionAddress_, 8}}); + {{metadata_.operands[1].mem.disp + instructionAddress_, 8}}); break; } case Opcode::AArch64_LDRXroW: { // ldr xt, [xn, wn{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[1].get(), metadata.operands[1]); - setMemoryAddresses({{operands[0].get() + offset, 8}}); + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 8}}); break; } case Opcode::AArch64_LDRXroX: { // ldr xt, [xn, xn{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[1].get(), metadata.operands[1]); - setMemoryAddresses({{operands[0].get() + offset, 8}}); + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 8}}); break; } case Opcode::AArch64_LDR_PXI: { // ldr pt, [xn{, #imm, mul vl}] const uint64_t PL_bits = VL_bits / 8; const uint16_t partition_num = PL_bits / 8; - const uint64_t base = operands[0].get(); - const uint64_t offset = - static_cast(metadata.operands[1].mem.disp); + const uint64_t base = sourceValues_[0].get(); + const int64_t offset = + static_cast(metadata_.operands[1].mem.disp); uint64_t addr = base + (offset * partition_num); @@ -567,9 +710,9 @@ span Instruction::generateAddresses() { case Opcode::AArch64_LDR_ZXI: { // ldr zt, [xn{, #imm, mul vl}] const uint16_t partition_num = VL_bits / 8; - const uint64_t base = operands[0].get(); + const uint64_t base = sourceValues_[0].get(); const int64_t offset = - static_cast(metadata.operands[1].mem.disp); + static_cast(metadata_.operands[1].mem.disp); const uint64_t addr = base + (offset * partition_num); setMemoryAddresses({addr, partition_num}); @@ -577,163 +720,187 @@ span Instruction::generateAddresses() { } case Opcode::AArch64_LDNPSi: { // ldnp st1, st2, [xn, #imm] uint64_t base = - operands[0].get() + metadata.operands[2].mem.disp; + sourceValues_[0].get() + metadata_.operands[2].mem.disp; setMemoryAddresses({{base, 4}, {base + 4, 4}}); break; } - case Opcode::AArch64_LDPDi: // ldp dt1, dt2, [xn, #imm] - case Opcode::AArch64_LDPDpre: // ldp dt1, dt2, [xn, #imm!] - case Opcode::AArch64_LDPQi: // ldp qt1, qt2, [xn, #imm] - case Opcode::AArch64_LDPQpre: // ldp qt1, qt2, [xn, #imm!] 
- case Opcode::AArch64_LDPSi: // ldp st1, st2, [xn, #imm] - case Opcode::AArch64_LDPSpre: // ldp st1, st2, [xn, #imm!] - case Opcode::AArch64_LDPWi: // ldp wt1, wt2, [xn, #imm] - case Opcode::AArch64_LDPWpre: // ldp wt1, wt2, [xn, #imm!] - case Opcode::AArch64_LDPXi: // ldp xt1, xt2, [xn, #imm] + case Opcode::AArch64_LDPDi: // ldp dt1, dt2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDPDpre: // ldp dt1, dt2, [xn, #imm!] + [[fallthrough]]; + case Opcode::AArch64_LDPQi: // ldp qt1, qt2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDPQpre: // ldp qt1, qt2, [xn, #imm!] + [[fallthrough]]; + case Opcode::AArch64_LDPSi: // ldp st1, st2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDPSpre: // ldp st1, st2, [xn, #imm!] + [[fallthrough]]; + case Opcode::AArch64_LDPWi: // ldp wt1, wt2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDPWpre: // ldp wt1, wt2, [xn, #imm!] + [[fallthrough]]; + case Opcode::AArch64_LDPXi: // ldp xt1, xt2, [xn, #imm] + [[fallthrough]]; case Opcode::AArch64_LDPXpre: { // ldp xt1, xt2, [xn, #imm!] - std::vector addresses; + std::vector addresses; generateContiguousAddresses( - operands[0].get() + metadata.operands[2].mem.disp, 2, - dataSize_, addresses); + sourceValues_[0].get() + metadata_.operands[2].mem.disp, + 2, dataSize_, addresses); setMemoryAddresses(addresses); break; } - case Opcode::AArch64_LDPDpost: // ldp dt1, dt2, [xn], #imm - case Opcode::AArch64_LDPQpost: // ldp qt1, qt2, [xn], #imm - case Opcode::AArch64_LDPSpost: // ldp st1, st2, [xn], #imm - case Opcode::AArch64_LDPWpost: // ldp wt1, wt2, [xn], #imm + case Opcode::AArch64_LDPDpost: // ldp dt1, dt2, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDPQpost: // ldp qt1, qt2, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDPSpost: // ldp st1, st2, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDPWpost: // ldp wt1, wt2, [xn], #imm + [[fallthrough]]; case Opcode::AArch64_LDPXpost: { // ldp xt1, xt2, [xn], #imm - std::vector addresses; - generateContiguousAddresses(operands[0].get(), 2, dataSize_, - addresses); + std::vector addresses; + generateContiguousAddresses(sourceValues_[0].get(), 2, + dataSize_, addresses); setMemoryAddresses(addresses); break; } case Opcode::AArch64_LDPSWi: { // ldpsw xt1, xt2, [xn {, #imm}] uint64_t base = - operands[0].get() + metadata.operands[2].mem.disp; + sourceValues_[0].get() + metadata_.operands[2].mem.disp; setMemoryAddresses({{base, 4}, {base + 4, 4}}); break; } case Opcode::AArch64_LDRSBWroX: { // ldrsb wt, [xn, xm{, extend // {#amount}}] - uint64_t offset = - extendOffset(operands[1].get(), metadata.operands[1]); - setMemoryAddresses({{operands[0].get() + offset, 1}}); + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 1}}); break; } case Opcode::AArch64_LDRSBWui: { // ldrsb xt, [xn, #imm] setMemoryAddresses( - {{operands[0].get() + metadata.operands[1].mem.disp, 1}}); + {{sourceValues_[0].get() + metadata_.operands[1].mem.disp, + 1}}); break; } case Opcode::AArch64_LDRSBXui: { // ldrsb xt, [xn, #imm] setMemoryAddresses( - {{operands[0].get() + metadata.operands[1].mem.disp, 1}}); + {{sourceValues_[0].get() + metadata_.operands[1].mem.disp, + 1}}); break; } case Opcode::AArch64_LDRSHWroW: { // ldrsh wt, [xn, wm{, extend // {#amount}}] - uint64_t offset = - extendOffset(operands[1].get(), metadata.operands[1]); - setMemoryAddresses({{operands[0].get() + offset, 2}}); + uint64_t offset = extendOffset(sourceValues_[1].get(), + 
metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 2}}); break; } case Opcode::AArch64_LDRSHWroX: { // ldrsh wt, [xn, xm{, extend // {#amount}}] - uint64_t offset = - extendOffset(operands[1].get(), metadata.operands[1]); - setMemoryAddresses({{operands[0].get() + offset, 2}}); + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 2}}); break; } case Opcode::AArch64_LDRSHWui: { // ldrsh wt, [xn, #imm] setMemoryAddresses( - {{operands[0].get() + metadata.operands[1].mem.disp, 2}}); + {{sourceValues_[0].get() + metadata_.operands[1].mem.disp, + 2}}); break; } case Opcode::AArch64_LDRSHXroW: { // ldrsh xt, [xn, wm{, extend // {#amount}}] - uint64_t offset = - extendOffset(operands[1].get(), metadata.operands[1]); - setMemoryAddresses({{operands[0].get() + offset, 2}}); + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 2}}); break; } case Opcode::AArch64_LDRSHXroX: { // ldrsh xt, [xn, xm{, extend // {#amount}}] - uint64_t offset = - extendOffset(operands[1].get(), metadata.operands[1]); - setMemoryAddresses({{operands[0].get() + offset, 2}}); + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 2}}); break; } case Opcode::AArch64_LDRSHXui: { // ldrsh xt, [xn, #imm] setMemoryAddresses( - {{operands[0].get() + metadata.operands[1].mem.disp, 2}}); + {{sourceValues_[0].get() + metadata_.operands[1].mem.disp, + 2}}); break; } case Opcode::AArch64_LDRSWpost: { // ldrsw xt, [xn], #simm - setMemoryAddresses({{operands[0].get(), 4}}); + setMemoryAddresses({{sourceValues_[0].get(), 4}}); break; } case Opcode::AArch64_LDRSWroX: { // ldrsw xt, [xn, xm{, extend // {#amount}}] - uint64_t offset = - extendOffset(operands[1].get(), metadata.operands[1]); - setMemoryAddresses({{operands[0].get() + offset, 4}}); + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 4}}); break; } case Opcode::AArch64_LDRSWui: { // ldrsw xt, [xn{, #pimm}] uint64_t base = - operands[0].get() + metadata.operands[1].mem.disp; + sourceValues_[0].get() + metadata_.operands[1].mem.disp; setMemoryAddresses({{base, 4}}); break; } case Opcode::AArch64_LDURBBi: { // ldurb wt, [xn, #imm] setMemoryAddresses( - {{operands[0].get() + metadata.operands[1].mem.disp, 1}}); + {{sourceValues_[0].get() + metadata_.operands[1].mem.disp, + 1}}); break; } case Opcode::AArch64_LDURDi: { // ldur dt, [xn, #imm] setMemoryAddresses( - {{operands[0].get() + metadata.operands[1].mem.disp, 8}}); + {{sourceValues_[0].get() + metadata_.operands[1].mem.disp, + 8}}); break; } case Opcode::AArch64_LDURHHi: { // ldurh wt, [xn, #imm] setMemoryAddresses( - {{operands[0].get() + metadata.operands[1].mem.disp, 2}}); + {{sourceValues_[0].get() + metadata_.operands[1].mem.disp, + 2}}); break; } case Opcode::AArch64_LDURQi: { // ldur qt, [xn, #imm] setMemoryAddresses( - {{operands[0].get() + metadata.operands[1].mem.disp, + {{sourceValues_[0].get() + metadata_.operands[1].mem.disp, 16}}); break; } case Opcode::AArch64_LDURSWi: { // ldursw xt, [xn, #imm] setMemoryAddresses( - {{operands[0].get() + metadata.operands[1].mem.disp, 4}}); + {{sourceValues_[0].get() + metadata_.operands[1].mem.disp, + 4}}); break; } case Opcode::AArch64_LDURSi: { // ldur sd, [{, #imm}] setMemoryAddresses( - {{operands[0].get() + 
metadata.operands[1].mem.disp, 4}}); + {{sourceValues_[0].get() + metadata_.operands[1].mem.disp, + 4}}); break; } case Opcode::AArch64_LDURWi: { // ldur wt, [xn, #imm] setMemoryAddresses( - {{operands[0].get() + metadata.operands[1].mem.disp, 4}}); + {{sourceValues_[0].get() + metadata_.operands[1].mem.disp, + 4}}); break; } case Opcode::AArch64_LDURXi: { // ldur xt, [xn, #imm] setMemoryAddresses( - {{operands[0].get() + metadata.operands[1].mem.disp, 8}}); + {{sourceValues_[0].get() + metadata_.operands[1].mem.disp, + 8}}); break; } case Opcode::AArch64_LDXRW: { // ldxr wt, [xn] - setMemoryAddresses({{operands[0].get(), 4}}); + setMemoryAddresses({{sourceValues_[0].get(), 4}}); break; } case Opcode::AArch64_LDXRX: { // ldxr xt, [xn] - setMemoryAddresses({{operands[0].get(), 8}}); + setMemoryAddresses({{sourceValues_[0].get(), 8}}); break; } case Opcode::AArch64_PRFMui: { // prfm op, [xn, xm{, extend {#amount}}] @@ -741,13 +908,13 @@ span Instruction::generateAddresses() { break; } case Opcode::AArch64_ST1B: { // st1b {zt.b}, pg, [xn, xm] - const uint64_t* p = operands[1].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 8; - const uint64_t base = operands[2].get(); - const uint64_t offset = operands[3].get(); + const uint64_t base = sourceValues_[2].get(); + const uint64_t offset = sourceValues_[3].get(); - std::vector addresses; + std::vector addresses; addresses.reserve(partition_num); generatePredicatedContiguousAddressBlocks(base + offset, partition_num, @@ -755,14 +922,31 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } - case Opcode::AArch64_SST1B_D_REAL: { // st1b {zd.d}, pg, [xn, zm.d] - const uint64_t* p = operands[1].getAsVector(); + case Opcode::AArch64_ST1B_IMM: { // st1b {zt.b}, pg, [xn{, #imm, mul vl}] + const uint64_t* p = sourceValues_[1].getAsVector(); + const uint16_t partition_num = VL_bits / 8; + + const uint64_t base = sourceValues_[2].get(); + const int64_t offset = + static_cast(metadata_.operands[2].mem.disp); + + std::vector addresses; + addresses.reserve(partition_num); + uint64_t addr = base + (offset * partition_num); + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 1, 1, p, + addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_SST1B_D: { // st1b {zd.d}, pg, [xn, zm.d] + const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 64; - const uint64_t base = operands[2].get(); - const uint64_t* offset = operands[3].getAsVector(); + const uint64_t base = sourceValues_[2].get(); + const uint64_t* offset = sourceValues_[3].getAsVector(); - std::vector addresses; + std::vector addresses; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); @@ -774,14 +958,14 @@ span Instruction::generateAddresses() { setMemoryAddresses(addresses); break; } - case Opcode::AArch64_SST1D_REAL: { // st1d {zt.d}, pg, [xn, zm.d] - const uint64_t* p = operands[1].getAsVector(); + case Opcode::AArch64_SST1D: { // st1d {zt.d}, pg, [xn, zm.d] + const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 64; - const uint64_t base = operands[2].get(); - const uint64_t* offset = operands[3].getAsVector(); + const uint64_t base = sourceValues_[2].get(); + const uint64_t* offset = sourceValues_[3].getAsVector(); - std::vector addresses; + std::vector addresses; addresses.reserve(partition_num); for (int i = 0; i < partition_num; i++) 
{ @@ -794,15 +978,15 @@ span Instruction::generateAddresses() { setMemoryAddresses(addresses); break; } - case Opcode::AArch64_SST1D_SCALED_SCALED_REAL: { // st1d {zt.d}, pg, [xn, - // zm.d, lsl #3] - const uint64_t* p = operands[1].getAsVector(); + case Opcode::AArch64_SST1D_SCALED: { // st1d {zt.d}, pg, [xn, + // zm.d, lsl #3] + const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 64; - const uint64_t base = operands[2].get(); - const uint64_t* offset = operands[3].getAsVector(); + const uint64_t base = sourceValues_[2].get(); + const uint64_t* offset = sourceValues_[3].getAsVector(); - std::vector addresses; + std::vector addresses; addresses.reserve(partition_num); for (int i = 0; i < partition_num; i++) { @@ -816,13 +1000,13 @@ span Instruction::generateAddresses() { break; } case Opcode::AArch64_ST1D: { // st1d {zt.d}, pg, [xn, xm, lsl #3] - const uint64_t* p = operands[1].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 64; - const uint64_t base = operands[2].get(); - const uint64_t offset = operands[3].get(); + const uint64_t base = sourceValues_[2].get(); + const uint64_t offset = sourceValues_[3].get(); - std::vector addresses; + std::vector addresses; addresses.reserve(partition_num); generatePredicatedContiguousAddressBlocks( @@ -831,14 +1015,14 @@ span Instruction::generateAddresses() { break; } case Opcode::AArch64_ST1D_IMM: { // st1d {zt.d}, pg, [xn{, #imm, mul vl}] - const uint64_t* p = operands[1].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 64; - const uint64_t base = operands[2].get(); - const uint64_t offset = - static_cast(metadata.operands[2].mem.disp); + const uint64_t base = sourceValues_[2].get(); + const int64_t offset = + static_cast(metadata_.operands[2].mem.disp); - std::vector addresses; + std::vector addresses; addresses.reserve(partition_num); generatePredicatedContiguousAddressBlocks( @@ -849,14 +1033,14 @@ span Instruction::generateAddresses() { } case Opcode::AArch64_ST2D_IMM: { // st2d {zt1.d, zt2.d}, pg, [{, // #imm, mul vl}] - const uint64_t* p = operands[2].getAsVector(); + const uint64_t* p = sourceValues_[2].getAsVector(); const uint16_t partition_num = VL_bits / 64; - const uint64_t base = operands[3].get(); - const uint64_t offset = - static_cast(metadata.operands[3].mem.disp); + const uint64_t base = sourceValues_[3].get(); + const int64_t offset = + static_cast(metadata_.operands[3].mem.disp); - std::vector addresses; + std::vector addresses; addresses.reserve(partition_num * 2); uint64_t addr = base + (offset * partition_num * 8); @@ -866,20 +1050,114 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } - case Opcode::AArch64_ST1_MXIPXX_H_S: // st1w {zath.s[ws, #imm]}, pg/z, - // [{, xm, LSL #2}] + case Opcode::AArch64_ST1_MXIPXX_H_B: // st1b {zath.b[ws, #imm]}, pg, + // [{, xm}] + // SME + [[fallthrough]]; + case Opcode::AArch64_ST1_MXIPXX_V_B: { // st1b {zatv.b[ws, #imm]}, pg, + // [{, xm}] + // SME + const uint16_t partition_num = VL_bits / 8; + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get(); + + std::vector addresses; + addresses.reserve(partition_num); + + generatePredicatedContiguousAddressBlocks((n + m), partition_num, 1, 1, + pg, addresses); + 
setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_D: // st1d {zath.d[ws, #imm]}, pg, + // [{, xm, lsl #3}] + // SME + [[fallthrough]]; + case Opcode::AArch64_ST1_MXIPXX_V_D: { // st1d {zatv.d[ws, #imm]}, pg, + // [{, xm, lsl #3}] + // SME + const uint16_t partition_num = VL_bits / 64; + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get() << 3; + + std::vector addresses; + addresses.reserve(partition_num); + + generatePredicatedContiguousAddressBlocks((n + m), partition_num, 8, 8, + pg, addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_H: // st1h {zath.h[ws, #imm]}, pg, + // [{, xm, lsl #1}] + // SME + [[fallthrough]]; + case Opcode::AArch64_ST1_MXIPXX_V_H: { // st1h {zatv.h[ws, #imm]}, pg, + // [{, xm, lsl #1}] + // SME + const uint16_t partition_num = VL_bits / 16; + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get() << 1; + + std::vector addresses; + addresses.reserve(partition_num); + + generatePredicatedContiguousAddressBlocks((n + m), partition_num, 2, 2, + pg, addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_Q: // st1q {zath.q[ws]}, pg, [{, + // xm, lsl #4}] + // SME + [[fallthrough]]; + case Opcode::AArch64_ST1_MXIPXX_V_Q: { // st1q {zatv.q[ws]}, pg, + // [{, xm, lsl #4}] + // SME + const uint16_t partition_num = VL_bits / 128; + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get() << 4; + + std::vector addresses; + addresses.reserve(partition_num); + + generatePredicatedContiguousAddressBlocks((n + m), partition_num, 16, + 16, pg, addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_S: // st1w {zath.s[ws, #imm]}, pg/z, + // [{, xm, LSL #2}] + // SME + [[fallthrough]]; case Opcode::AArch64_ST1_MXIPXX_V_S: { // st1w {zatv.s[ws, #imm]}, pg/z, // [{, xm, LSL #2}] // SME const uint16_t partition_num = VL_bits / 32; const uint64_t* pg = - operands[partition_num + 1].getAsVector(); - const uint64_t n = operands[partition_num + 2].get(); + sourceValues_[partition_num + 1].getAsVector(); + const uint64_t n = sourceValues_[partition_num + 2].get(); uint64_t m = 0; - if (metadata.operands[2].mem.index) - m = operands[partition_num + 3].get() << 2; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get() << 2; - std::vector addresses; + std::vector addresses; addresses.reserve(partition_num); generatePredicatedContiguousAddressBlocks((n + m), partition_num, 4, 4, @@ -888,13 +1166,13 @@ span Instruction::generateAddresses() { break; } case Opcode::AArch64_ST1W: { // st1w {zt.s}, pg, [xn, xm, lsl #2] - const uint64_t* p = operands[1].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 32; - const uint64_t base = operands[2].get(); - const uint64_t offset = operands[3].get(); + const uint64_t base = sourceValues_[2].get(); + const uint64_t offset = sourceValues_[3].get(); - 
std::vector addresses; + std::vector addresses; addresses.reserve(partition_num); generatePredicatedContiguousAddressBlocks( @@ -903,13 +1181,13 @@ span Instruction::generateAddresses() { break; } case Opcode::AArch64_ST1W_D: { // st1w {zt.d}, pg, [xn, xm, lsl #2] - const uint64_t* p = operands[1].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 64; - const uint64_t base = operands[2].get(); - const uint64_t offset = operands[3].get(); + const uint64_t base = sourceValues_[2].get(); + const uint64_t offset = sourceValues_[3].get(); - std::vector addresses; + std::vector addresses; addresses.reserve(partition_num); generatePredicatedContiguousAddressBlocks( @@ -919,14 +1197,14 @@ span Instruction::generateAddresses() { } case Opcode::AArch64_ST1W_IMM: { // st1w {zt.s}, pg, [xn{, #imm, mul // vl}] - const uint64_t* p = operands[1].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 32; - const uint64_t base = operands[2].get(); - const uint64_t offset = - static_cast(metadata.operands[2].mem.disp); + const uint64_t base = sourceValues_[2].get(); + const int64_t offset = + static_cast(metadata_.operands[2].mem.disp); - std::vector addresses; + std::vector addresses; addresses.reserve(partition_num); generatePredicatedContiguousAddressBlocks( @@ -936,14 +1214,14 @@ span Instruction::generateAddresses() { break; } case Opcode::AArch64_SST1W_D_IMM: { // st1w {zt.d}, pg, [zn.d{, #imm}] - const uint64_t* p = operands[1].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 64; - const uint64_t* n = operands[2].getAsVector(); - const uint64_t offset = - static_cast(metadata.operands[2].mem.disp); + const uint64_t* n = sourceValues_[2].getAsVector(); + const int64_t offset = + static_cast(metadata_.operands[2].mem.disp); - std::vector addresses; + std::vector addresses; addresses.reserve(partition_num); for (int i = 0; i < partition_num; i++) { @@ -957,14 +1235,14 @@ span Instruction::generateAddresses() { break; } case Opcode::AArch64_SST1W_IMM: { // st1w {zt.s}, pg, [zn.s{, #imm}] - const uint64_t* p = operands[1].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 32; - const uint32_t* n = operands[2].getAsVector(); - const uint64_t offset = static_cast( - static_cast(metadata.operands[2].mem.disp)); + const uint32_t* n = sourceValues_[2].getAsVector(); + const int64_t offset = static_cast( + static_cast(metadata_.operands[2].mem.disp)); - std::vector addresses; + std::vector addresses; addresses.reserve(partition_num); for (int i = 0; i < partition_num; i++) { @@ -978,13 +1256,13 @@ span Instruction::generateAddresses() { break; } case Opcode::AArch64_GLD1D_REAL: { // ld1d {zt.d}, pg/z, [xn, zm.d] - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 64; - const uint64_t base = operands[1].get(); - const uint64_t* offset = operands[2].getAsVector(); + const uint64_t base = sourceValues_[1].get(); + const uint64_t* offset = sourceValues_[2].getAsVector(); - std::vector addresses; + std::vector addresses; addresses.reserve(partition_num); for (int i = 0; i < partition_num; i++) { @@ -999,13 +1277,13 @@ span Instruction::generateAddresses() { } case Opcode::AArch64_GLD1D_SCALED_REAL: { // ld1d {zt.d}, pg/z, [xn, // zm.d, LSL #3] - const uint64_t* p = operands[0].getAsVector(); + 
const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 64; - const uint64_t base = operands[1].get(); - const uint64_t* offset = operands[2].getAsVector(); + const uint64_t base = sourceValues_[1].get(); + const uint64_t* offset = sourceValues_[2].getAsVector(); - std::vector addresses; + std::vector addresses; addresses.reserve(partition_num); for (int i = 0; i < partition_num; i++) { @@ -1020,14 +1298,14 @@ span Instruction::generateAddresses() { } case Opcode::AArch64_GLD1D_IMM_REAL: { // ld1d {zd.d}, pg/z, [zn.d{, // #imm}] - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 64; - const uint64_t* n = operands[1].getAsVector(); - const uint64_t offset = - static_cast(metadata.operands[2].mem.disp); + const uint64_t* n = sourceValues_[1].getAsVector(); + const int64_t offset = + static_cast(metadata_.operands[2].mem.disp); - std::vector addresses; + std::vector addresses; addresses.reserve(partition_num); for (int i = 0; i < partition_num; i++) { @@ -1042,14 +1320,14 @@ span Instruction::generateAddresses() { } case Opcode::AArch64_GLD1SW_D_IMM_REAL: { // ld1sw {zd.d}, pg/z, // [zn.d{, #imm}] - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 64; - const uint64_t* n = operands[1].getAsVector(); - const uint64_t offset = - static_cast(metadata.operands[2].mem.disp); + const uint64_t* n = sourceValues_[1].getAsVector(); + const int64_t offset = + static_cast(metadata_.operands[2].mem.disp); - std::vector addresses; + std::vector addresses; addresses.reserve(partition_num); for (int i = 0; i < partition_num; i++) { @@ -1064,13 +1342,13 @@ span Instruction::generateAddresses() { } case Opcode::AArch64_GLD1W_D_SCALED_REAL: { // ld1w {zd.d}, pg/z, // [, zm.d, lsl #2] - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 64; - const uint64_t n = operands[1].get(); - const uint64_t* m = operands[2].getAsVector(); + const uint64_t n = sourceValues_[1].get(); + const uint64_t* m = sourceValues_[2].getAsVector(); - std::vector addresses; + std::vector addresses; addresses.reserve(partition_num); for (int i = 0; i < partition_num; i++) { @@ -1083,15 +1361,36 @@ span Instruction::generateAddresses() { setMemoryAddresses(addresses); break; } + case Opcode::AArch64_GLD1W_SXTW_REAL: { // ld1w {zd.s}, pg/z, + // [, zm.s, sxtw] + const uint64_t* p = sourceValues_[0].getAsVector(); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t n = sourceValues_[1].get(); + const uint32_t* m = sourceValues_[2].getAsVector(); + + std::vector addresses; + addresses.reserve(partition_num); + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (p[i / 16] & shifted_active) { + uint64_t addr = n + m[i]; + addresses.push_back({addr, 4}); + } + } + setMemoryAddresses(addresses); + break; + } case Opcode::AArch64_SST1D_IMM: { // st1d {zt.d}, pg, [zn.d{, #imm}] - const uint64_t* p = operands[1].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 64; - const uint64_t* n = operands[2].getAsVector(); - const uint64_t offset = - static_cast(metadata.operands[2].mem.disp); + const uint64_t* n = sourceValues_[2].getAsVector(); + const int64_t offset = + static_cast(metadata_.operands[2].mem.disp); - 
std::vector addresses; + std::vector addresses; addresses.reserve(partition_num); for (int i = 0; i < partition_num; i++) { @@ -1105,9 +1404,9 @@ span Instruction::generateAddresses() { break; } case Opcode::AArch64_ST1Fourv2s_POST: { // st1 {vt.2s, vt2.2s, vt3.2s, - // vt4.2s}, [xn|sp], #imm - const uint64_t base = operands[4].get(); - std::vector addresses; + // vt4.2s}, [xn], <#imm|xm> + const uint64_t base = sourceValues_[4].get(); + std::vector addresses; addresses.reserve(4); for (int i = 0; i < 4; i++) { @@ -1116,10 +1415,25 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST1Fourv16b: // st1 {vt.16b, vt2.16b, vt3.16b, + // vt4.16b}, [xn] + [[fallthrough]]; + case Opcode::AArch64_ST1Fourv16b_POST: // st1 {vt.16b, vt2.16b, vt3.16b, + // vt4.16b}, [xn], <#imm|xm> + [[fallthrough]]; + case Opcode::AArch64_ST1Fourv2d: // st1 {vt.2d, vt2.2d, vt3.2d, vt4.2d}, + // [xn] + [[fallthrough]]; + case Opcode::AArch64_ST1Fourv2d_POST: // st1 {vt.2d, vt2.2d, vt3.2d, + // vt4.2d}, [xn], <#imm|xm> + [[fallthrough]]; + case Opcode::AArch64_ST1Fourv4s: // st1 {vt.4s, vt2.4s, vt3.4s, vt4.4s}, + // [xn] + [[fallthrough]]; case Opcode::AArch64_ST1Fourv4s_POST: { // st1 {vt.4s, vt2.4s, vt3.4s, - // vt4.4s}, [xn|sp], #imm - const uint64_t base = operands[4].get(); - std::vector addresses; + // vt4.4s}, [xn], <#imm|xm> + const uint64_t base = sourceValues_[4].get(); + std::vector addresses; addresses.reserve(4); for (int i = 0; i < 4; i++) { @@ -1128,20 +1442,22 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } - case Opcode::AArch64_ST1Twov16b: { // st1 {vt.16b, vt2.16b}, [xn] - const uint64_t base = operands[2].get(); - std::vector addresses; - addresses.reserve(2); - - for (int i = 0; i < 2; i++) { - addresses.push_back({base + (i * 16), 16}); - } - setMemoryAddresses(std::move(addresses)); - break; - } - case Opcode::AArch64_ST1Twov4s: { // st1 {vt.4s, vt2.4s}, [xn] - const uint64_t base = operands[2].get(); - std::vector addresses; + case Opcode::AArch64_ST1Twov16b: // st1 {vt.16b, vt2.16b}, [xn] + [[fallthrough]]; + case Opcode::AArch64_ST1Twov16b_POST: // st1 {vt.16b, vt2.16b}, [xn], + // <#imm|xm> + [[fallthrough]]; + case Opcode::AArch64_ST1Twov2d: // st1 {vt.2d, vt2.2d}, [xn] + [[fallthrough]]; + case Opcode::AArch64_ST1Twov2d_POST: // st1 {vt.2d, vt2.2d}, [xn], + // <#imm|xm> + [[fallthrough]]; + case Opcode::AArch64_ST1Twov4s: // st1 {vt.4s, vt2.4s}, [xn] + [[fallthrough]]; + case Opcode::AArch64_ST1Twov4s_POST: { // st1 {vt.4s, vt2.4s}, [xn], + // <#imm|xm> + const uint64_t base = sourceValues_[2].get(); + std::vector addresses; addresses.reserve(2); for (int i = 0; i < 2; i++) { @@ -1153,31 +1469,31 @@ span Instruction::generateAddresses() { case Opcode::AArch64_ST1i8_POST: [[fallthrough]]; case Opcode::AArch64_ST1i8: { // st1 {vt.b}[index], [xn] - setMemoryAddresses({{operands[1].get(), 1}}); + setMemoryAddresses({{sourceValues_[1].get(), 1}}); break; } case Opcode::AArch64_ST1i16_POST: [[fallthrough]]; case Opcode::AArch64_ST1i16: { // st1 {vt.h}[index], [xn] - setMemoryAddresses({{operands[1].get(), 2}}); + setMemoryAddresses({{sourceValues_[1].get(), 2}}); break; } case Opcode::AArch64_ST1i32_POST: [[fallthrough]]; case Opcode::AArch64_ST1i32: { // st1 {vt.s}[index], [xn] - setMemoryAddresses({{operands[1].get(), 4}}); + setMemoryAddresses({{sourceValues_[1].get(), 4}}); break; } case Opcode::AArch64_ST1i64_POST: [[fallthrough]]; case Opcode::AArch64_ST1i64: { // st1 
{vt.d}[index], [xn] - setMemoryAddresses({{operands[1].get(), 8}}); + setMemoryAddresses({{sourceValues_[1].get(), 8}}); break; } case Opcode::AArch64_ST2Twov4s_POST: { // st2 {vt1.4s, vt2.4s}, [xn], // #imm - const uint64_t base = operands[2].get(); - std::vector addresses; + const uint64_t base = sourceValues_[2].get(); + std::vector addresses; addresses.reserve(2); for (int i = 0; i < 2; i++) { @@ -1188,260 +1504,312 @@ span Instruction::generateAddresses() { break; } case Opcode::AArch64_STLRB: { // stlrb wt, [xn] - setMemoryAddresses({{operands[1].get(), 1}}); + setMemoryAddresses({{sourceValues_[1].get(), 1}}); break; } case Opcode::AArch64_STLRW: { // stlr wt, [xn] - setMemoryAddresses({{operands[1].get(), 4}}); + setMemoryAddresses({{sourceValues_[1].get(), 4}}); break; } case Opcode::AArch64_STLRX: { // stlr xt, [xn] - setMemoryAddresses({{operands[1].get(), 8}}); + setMemoryAddresses({{sourceValues_[1].get(), 8}}); break; } case Opcode::AArch64_STLXRW: { // stlxr ws, wt, [xn] - setMemoryAddresses({{operands[1].get(), 4}}); + setMemoryAddresses({{sourceValues_[1].get(), 4}}); break; } case Opcode::AArch64_STLXRX: { // stlxr ws, xt, [xn] - setMemoryAddresses({{operands[1].get(), 8}}); - break; - } - case Opcode::AArch64_STPDi: // stp dt1, dt2, [xn, #imm] - case Opcode::AArch64_STPDpre: // stp dt1, dt2, [xn, #imm]! - case Opcode::AArch64_STPQi: // stp qt1, qt2, [xn, #imm] - case Opcode::AArch64_STPQpre: // stp qt1, qt2, [xn, #imm]! - case Opcode::AArch64_STPSi: // stp st1, st2, [xn, #imm] - case Opcode::AArch64_STPSpre: // stp st1, st2, [xn, #imm]! - case Opcode::AArch64_STPWi: // stp wt1, wt2, [xn, #imm] - case Opcode::AArch64_STPWpre: // stp wt1, wt2, [xn, #imm]! - case Opcode::AArch64_STPXi: // stp xt1, xt2, [xn, #imm] + setMemoryAddresses({{sourceValues_[1].get(), 8}}); + break; + } + case Opcode::AArch64_STPDi: // stp dt1, dt2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STPDpre: // stp dt1, dt2, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STPQi: // stp qt1, qt2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STPQpre: // stp qt1, qt2, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STPSi: // stp st1, st2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STPSpre: // stp st1, st2, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STPWi: // stp wt1, wt2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STPWpre: // stp wt1, wt2, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STPXi: // stp xt1, xt2, [xn, #imm] + [[fallthrough]]; case Opcode::AArch64_STPXpre: { // stp xt1, xt2, [xn, #imm]! 
- std::vector addresses; + std::vector addresses; generateContiguousAddresses( - operands[2].get() + metadata.operands[2].mem.disp, 2, - dataSize_, addresses); + sourceValues_[2].get() + metadata_.operands[2].mem.disp, + 2, dataSize_, addresses); setMemoryAddresses(addresses); break; } - case Opcode::AArch64_STPDpost: // stp dt1, dt2, [xn], #imm - case Opcode::AArch64_STPQpost: // stp qt1, qt2, [xn], #imm - case Opcode::AArch64_STPSpost: // stp st1, st2, [xn], #imm - case Opcode::AArch64_STPWpost: // stp wt1, wt2, [xn], #imm + case Opcode::AArch64_STPDpost: // stp dt1, dt2, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STPQpost: // stp qt1, qt2, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STPSpost: // stp st1, st2, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STPWpost: // stp wt1, wt2, [xn], #imm + [[fallthrough]]; case Opcode::AArch64_STPXpost: { // stp xt1, xt2, [xn], #imm - std::vector addresses; - generateContiguousAddresses(operands[2].get(), 2, dataSize_, - addresses); + std::vector addresses; + generateContiguousAddresses(sourceValues_[2].get(), 2, + dataSize_, addresses); setMemoryAddresses(addresses); break; } case Opcode::AArch64_STRBBpost: { // strb wd, [xn], #imm - setMemoryAddresses({{operands[1].get(), 1}}); + setMemoryAddresses({{sourceValues_[1].get(), 1}}); break; } case Opcode::AArch64_STRBBpre: { // strb wd, [xn, #imm]! setMemoryAddresses( - {{operands[1].get() + metadata.operands[1].mem.disp, 1}}); + {{sourceValues_[1].get() + metadata_.operands[1].mem.disp, + 1}}); break; } case Opcode::AArch64_STRBBroW: { // strb wd, // [xn, wm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[2].get(), metadata.operands[1]); - setMemoryAddresses({{operands[1].get() + offset, 1}}); + uint64_t offset = extendOffset(sourceValues_[2].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[1].get() + offset, 1}}); break; } case Opcode::AArch64_STRBBroX: { // strb wd, // [xn, xm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[2].get(), metadata.operands[1]); - setMemoryAddresses({{operands[1].get() + offset, 1}}); + uint64_t offset = extendOffset(sourceValues_[2].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[1].get() + offset, 1}}); break; } case Opcode::AArch64_STRBBui: { // strb wd, [xn, #imm] setMemoryAddresses( - {{operands[1].get() + metadata.operands[1].mem.disp, 1}}); + {{sourceValues_[1].get() + metadata_.operands[1].mem.disp, + 1}}); break; } case Opcode::AArch64_STRDroW: { // str dt, [xn, wm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[2].get(), metadata.operands[1]); - setMemoryAddresses({{operands[1].get() + offset, 8}}); + uint64_t offset = extendOffset(sourceValues_[2].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[1].get() + offset, 8}}); break; } case Opcode::AArch64_STRDroX: { // str dt, [xn, xm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[2].get(), metadata.operands[1]); - setMemoryAddresses({{operands[1].get() + offset, 8}}); - break; - } - case Opcode::AArch64_STRBui: // str bt, [xn, #imm] - case Opcode::AArch64_STRBpre: // str bt, [xn, #imm]! - case Opcode::AArch64_STRDui: // str dt, [xn, #imm] - case Opcode::AArch64_STRDpre: // str dt, [xn, #imm]! - case Opcode::AArch64_STRHui: // str ht, [xn, #imm] - case Opcode::AArch64_STRHpre: // str ht, [xn, #imm]! - case Opcode::AArch64_STRQui: // str qt, [xn, #imm] - case Opcode::AArch64_STRQpre: // str qt, [xn, #imm]! 
- case Opcode::AArch64_STRSui: // str st, [xn, #imm] - case Opcode::AArch64_STRSpre: // str st, [xn, #imm]! - case Opcode::AArch64_STRWui: // str wt, [xn, #imm] - case Opcode::AArch64_STRWpre: // str wt, [xn, #imm]! - case Opcode::AArch64_STRXui: // str xt, [xn, #imm] + uint64_t offset = extendOffset(sourceValues_[2].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[1].get() + offset, 8}}); + break; + } + case Opcode::AArch64_STRBui: // str bt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STRBpre: // str bt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STRDui: // str dt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STRDpre: // str dt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STRHui: // str ht, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STRHpre: // str ht, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STRQui: // str qt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STRQpre: // str qt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STRSui: // str st, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STRSpre: // str st, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STRWui: // str wt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STRWpre: // str wt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STRXui: // str xt, [xn, #imm] + [[fallthrough]]; case Opcode::AArch64_STRXpre: { // str xt, [xn, #imm]! - std::vector addresses; + std::vector addresses; generateContiguousAddresses( - operands[1].get() + metadata.operands[1].mem.disp, 1, - dataSize_, addresses); + sourceValues_[1].get() + metadata_.operands[1].mem.disp, + 1, dataSize_, addresses); setMemoryAddresses(addresses); break; } - case Opcode::AArch64_STRBpost: // str bt, [xn], #imm - case Opcode::AArch64_STRDpost: // str dt, [xn], #imm - case Opcode::AArch64_STRHpost: // str ht, [xn], #imm - case Opcode::AArch64_STRQpost: // str qt, [xn], #imm - case Opcode::AArch64_STRSpost: // str st, [xn], #imm - case Opcode::AArch64_STRWpost: // str wt, [xn], #imm + case Opcode::AArch64_STRBpost: // str bt, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STRDpost: // str dt, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STRHpost: // str ht, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STRQpost: // str qt, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STRSpost: // str st, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STRWpost: // str wt, [xn], #imm + [[fallthrough]]; case Opcode::AArch64_STRXpost: { // str xt, [xn], #imm - std::vector addresses; - generateContiguousAddresses(operands[1].get(), 1, dataSize_, - addresses); + std::vector addresses; + generateContiguousAddresses(sourceValues_[1].get(), 1, + dataSize_, addresses); setMemoryAddresses(addresses); break; } case Opcode::AArch64_STRHHpost: { // strh wt, [xn], #imm - setMemoryAddresses({{operands[1].get(), 2}}); + setMemoryAddresses({{sourceValues_[1].get(), 2}}); break; } case Opcode::AArch64_STRHHpre: { // strh wd, [xn, #imm]! 
setMemoryAddresses( - {{operands[1].get() + metadata.operands[1].mem.disp, 2}}); + {{sourceValues_[1].get() + metadata_.operands[1].mem.disp, + 2}}); break; } case Opcode::AArch64_STRHHroW: { // strh wd, // [xn, wm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[2].get(), metadata.operands[1]); - setMemoryAddresses({{operands[1].get() + offset, 2}}); + uint64_t offset = extendOffset(sourceValues_[2].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[1].get() + offset, 2}}); break; } case Opcode::AArch64_STRHHroX: { // strh wd, // [xn, xm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[2].get(), metadata.operands[1]); - setMemoryAddresses({{operands[1].get() + offset, 2}}); + uint64_t offset = extendOffset(sourceValues_[2].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[1].get() + offset, 2}}); break; } case Opcode::AArch64_STRHHui: { // strh wt, [xn, #imm] setMemoryAddresses( - {{operands[1].get() + metadata.operands[1].mem.disp, 2}}); + {{sourceValues_[1].get() + metadata_.operands[1].mem.disp, + 2}}); break; } case Opcode::AArch64_STRQroX: { // str qt, [xn, xm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[2].get(), metadata.operands[1]); - setMemoryAddresses({{operands[1].get() + offset, 16}}); + uint64_t offset = extendOffset(sourceValues_[2].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[1].get() + offset, 16}}); break; } case Opcode::AArch64_STRSroW: { // str st, [xn, wm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[2].get(), metadata.operands[1]); - setMemoryAddresses({{operands[1].get() + offset, 4}}); + uint64_t offset = extendOffset(sourceValues_[2].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[1].get() + offset, 4}}); break; } case Opcode::AArch64_STRSroX: { // str st, [xn, xm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[2].get(), metadata.operands[1]); - setMemoryAddresses({{operands[1].get() + offset, 4}}); + uint64_t offset = extendOffset(sourceValues_[2].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[1].get() + offset, 4}}); break; } case Opcode::AArch64_STRWroW: { // str wd, [xn, wm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[2].get(), metadata.operands[1]); - setMemoryAddresses({{operands[1].get() + offset, 4}}); + uint64_t offset = extendOffset(sourceValues_[2].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[1].get() + offset, 4}}); break; } case Opcode::AArch64_STRWroX: { // str wt, [xn, xm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[2].get(), metadata.operands[1]); - setMemoryAddresses({{operands[1].get() + offset, 4}}); + uint64_t offset = extendOffset(sourceValues_[2].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[1].get() + offset, 4}}); break; } case Opcode::AArch64_STRXroW: { // str xd, [xn, wm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[2].get(), metadata.operands[1]); - setMemoryAddresses({{operands[1].get() + offset, 8}}); + uint64_t offset = extendOffset(sourceValues_[2].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[1].get() + offset, 8}}); break; } case Opcode::AArch64_STRXroX: { // str xt, [xn, xm{, extend {#amount}}] - uint64_t offset = - extendOffset(operands[2].get(), metadata.operands[1]); - setMemoryAddresses({{operands[1].get() + offset, 8}}); + uint64_t offset = extendOffset(sourceValues_[2].get(), + metadata_.operands[1]); 
+ setMemoryAddresses({{sourceValues_[1].get() + offset, 8}}); break; } case Opcode::AArch64_STR_PXI: { // str pt, [xn{, #imm, mul vl}] const uint64_t PL_bits = VL_bits / 8; const uint16_t partition_num = PL_bits / 8; - const uint64_t base = operands[1].get(); - const uint64_t offset = - static_cast(metadata.operands[1].mem.disp); + const uint64_t base = sourceValues_[1].get(); + const int64_t offset = + static_cast(metadata_.operands[1].mem.disp); setMemoryAddresses({base + (offset * partition_num), partition_num}); break; } + case Opcode::AArch64_STR_ZA: { // str za[wv, #imm], [xn|sp{, #imm, mul + // vl}] + // SME + // ZA Row count === current VL in bytes + const uint16_t zaRowCount = VL_bits / 8; + const uint64_t xn = sourceValues_[zaRowCount + 1].get(); + const uint64_t imm = metadata_.operands[1].mem.disp; + setMemoryAddresses({{xn + (imm * zaRowCount), zaRowCount}}); + break; + } case Opcode::AArch64_STR_ZXI: { // str zt, [xn{, #imm, mul vl}] const uint16_t partition_num = VL_bits / 8; - const uint64_t base = operands[1].get(); - const uint64_t offset = - static_cast(metadata.operands[1].mem.disp); + const uint64_t base = sourceValues_[1].get(); + const int64_t offset = + static_cast(metadata_.operands[1].mem.disp); setMemoryAddresses({base + (offset * partition_num), partition_num}); break; } case Opcode::AArch64_STURBBi: { // sturb wd, [xn, #imm] setMemoryAddresses( - {{operands[1].get() + metadata.operands[1].mem.disp, 1}}); + {{sourceValues_[1].get() + metadata_.operands[1].mem.disp, + 1}}); break; } case Opcode::AArch64_STURDi: { // stur dt, [xn, #imm] setMemoryAddresses( - {{operands[1].get() + metadata.operands[1].mem.disp, 8}}); + {{sourceValues_[1].get() + metadata_.operands[1].mem.disp, + 8}}); break; } case Opcode::AArch64_STURHHi: { // sturh wt, [xn, #imm] setMemoryAddresses( - {{operands[1].get() + metadata.operands[1].mem.disp, 2}}); + {{sourceValues_[1].get() + metadata_.operands[1].mem.disp, + 2}}); break; } case Opcode::AArch64_STURQi: { // stur qt, [xn, #imm] setMemoryAddresses( - {{operands[1].get() + metadata.operands[1].mem.disp, + {{sourceValues_[1].get() + metadata_.operands[1].mem.disp, 16}}); break; } case Opcode::AArch64_STURSi: { // stur st, [xn, #imm] setMemoryAddresses( - {{operands[1].get() + metadata.operands[1].mem.disp, 4}}); + {{sourceValues_[1].get() + metadata_.operands[1].mem.disp, + 4}}); break; } case Opcode::AArch64_STURWi: { // stur wt, [xn, #imm] setMemoryAddresses( - {{operands[1].get() + metadata.operands[1].mem.disp, 4}}); + {{sourceValues_[1].get() + metadata_.operands[1].mem.disp, + 4}}); break; } case Opcode::AArch64_STURXi: { // stur xt, [xn, #imm] setMemoryAddresses( - {{operands[1].get() + metadata.operands[1].mem.disp, 8}}); + {{sourceValues_[1].get() + metadata_.operands[1].mem.disp, + 8}}); break; } case Opcode::AArch64_STXRW: { // stxr ws, wt, [xn] - setMemoryAddresses({{operands[1].get(), 4}}); + setMemoryAddresses({{sourceValues_[1].get(), 4}}); break; } case Opcode::AArch64_STXRX: { // stxr ws, xt, [xn] - setMemoryAddresses({{operands[1].get(), 8}}); + setMemoryAddresses({{sourceValues_[1].get(), 8}}); break; } default: diff --git a/src/lib/arch/aarch64/Instruction_decode.cc b/src/lib/arch/aarch64/Instruction_decode.cc index 3c0a4dfee3..3535ce590f 100644 --- a/src/lib/arch/aarch64/Instruction_decode.cc +++ b/src/lib/arch/aarch64/Instruction_decode.cc @@ -1,14 +1,70 @@ -#include "InstructionMetadata.hh" +#include -#define NOT(bits, length) (~bits & (1 << length - 1)) -#define CONCAT(hi, lo, lowLen) ((hi << lowLen) & lo) -#define 
ONES(n) ((1 << (n)) - 1) -#define ROR(x, shift, size) ((x >> shift) | (x << (size - shift))) +#include "InstructionMetadata.hh" namespace simeng { namespace arch { namespace aarch64 { +/************************** + * HELPER DATA STRUCTURES + **************************/ + +static const std::unordered_set logicalOps = { + "and", "bic", "bif", "bit", "bsl", "bcax", "bmop", + "eor", "eon", "mvn", "not", "nand", "nbsl", "nor", + "rax", "xar", "orr", "orq", "orv", "tst", "orn"}; + +static const std::unordered_set cmpOps = { + "ccmn", "cmn", "cmp", "cmpp", "cmpeq", "cmpge", "cmpgt", + "cmphi", "cmphs", "cmple", "cmplo", "cmpls", "cmplt", "cmpne", + "cmptst", "ccmp", "cmeq", "cmge", "cmgt", "cmtst", "cmhi", + "cmhs", "cmla", "cmle", "cmlt", "fac", "facge", "facgt", + "facle", "faclt", "fccmp", "fccmpe", "fcmp", "fcmpe", "fcmuo", + "fcmeq", "fcmge", "fcmgt", "fcmle", "fcmlt", "fcmne"}; + +static const std::unordered_set cvtOps = { + "bfcvt", "bfcvtn", "bfcvtnt", "bf1cvt", "bf1cvtl", "bf1cvtlt", + "bf2cvt", "bf2cvtl", "bf2cvtlt", "fcvt", "fcvtas", "fcvtau", + "fcvtl", "fcvtms", "fcvtmu", "fcvtn", "fcvtns", "fcvtnu", + "fcvtps", "fcvtpu", "fcvtxn", "fcvtzs", "fcvtzu", "fcvtlt", + "fcvtnb", "fcvtnt", "fcvtx", "fcvtxnt", "fcvtzs", "fcvtzu", + "f1cvt", "f1cvtl", "f1cvtlt", "f2cvt", "f2cvtl", "f2cvtlt", + "fjcvtzs", "scvtf", "ucvtf"}; + +static const std::unordered_set divsqrtOps = { + "sdiv", "sdivr", "udiv", "udivr", "fdiv", "fdivr", + "frsqrt", "frsqrte", "frsqrts", "fsqrt", "ursqrte"}; + +static const std::unordered_set mulOps = { + "bfmmla", "bfmul", "bfml", "bfmla", "bfmlalb", "bfmlalt", + "bfmlal", "bfmls", "bfmlslb", "bfmlslt", "bfmlsl", "cmla", + "dot", "bfdot", "bfvdot", "fdot", "fvdot", "fvdotb", + "fvdott", "sdot", "sudot", "suvdot", "udot", "usdot", + "usvdot", "uvdot", "cdot", "fmla", "fmlal", "fmlal2", + "fmlalb", "fmlalt", "fmlallbb", "fmlallbt", "fmlalltb", "fmlalltt", + "fmlall", "fmls", "fmlsl", "fmlsl2", "fmlslb", "fmlslt", + "fmul", "fmulx", "fmad", "fmadd", "fmmla", "fmsb", + "fmsub", "ftmad", "fcmla", "fnm", "fnmad", "fnmla", + "fnmls", "fnmsb", "fnmadd", "fnmsub", "fnmul", "madd", + "maddpt", "mul", "mla", "mlapt", "mls", "mneg", + "msub", "msubpt", "mad", "madpt", "msb", "mop", + "bfmopa", "bfmops", "bmopa", "bmops", "fmopa", "fmops", + "smopa", "smops", "sumopa", "sumops", "umopa", "umops", + "usmopa", "usmops", "pmul", "pmull", "pmull2", "pmullb", + "pmullt", "sml", "smlalb", "smlalt", "smlslb", "smlslt", + "smlal", "smlal2", "smlsl", "smlsl2", "smlall", "smlsll", + "smmla", "smul", "smulh", "smull", "smull2", "smullb", + "smullt", "sqdm", "sqdmlal", "sqdmlal2", "sqdmlsl", "sqdmlsl2", + "sqdmulh", "sqdmull", "sqdmull2", "sqdmlalb", "sqdmlalbt", "sqdmlalt", + "sqdmlslb", "sqdmlslbt", "sqdmlslt", "sqdmullb", "sqdmullt", "sqrd", + "sqrdmlah", "sqrdmlsh", "sqrdmulh", "sqrdcmlah", "sumlall", "smaddl", + "smnegl", "smsubl", "umul", "umulh", "umull", "umull2", + "umullb", "umullt", "uml", "umlal", "umlal2", "umlsl", + "umlsl2", "umlslt", "umlalb", "umlalt", "umlslb", "umlall", + "umlsll", "usmlall", "usmmla", "ummla", "umaddl", "umnegl", + "umsubl"}; + /******************** * HELPER FUNCTIONS *******************/ @@ -17,13 +73,12 @@ namespace aarch64 { constexpr bool bit(uint32_t value, uint8_t start) { return (value >> start) & 1; } + // Extract bits `start` to `start+width` of `value` constexpr uint32_t bits(uint32_t value, uint8_t start, uint8_t width) { return ((value >> start) & ((1 << width) - 1)); } -// Generate a general purpose register identifier with tag `tag` -constexpr 
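These mnemonic tables replace the hard-coded opcode-number ranges that the old classification code later in this file relies on, so identifying an instruction group becomes a set lookup on the disassembled mnemonic rather than a chain of numeric comparisons. A rough, self-contained illustration of that lookup (the table contents here are a small invented subset):

```cpp
#include <iostream>
#include <string>
#include <unordered_set>

// Tiny example subsets of the mnemonic tables above.
static const std::unordered_set<std::string> exampleMulOps = {"mul", "madd", "fmla", "smull"};
static const std::unordered_set<std::string> exampleDivSqrtOps = {"sdiv", "udiv", "fdiv", "fsqrt"};

// Classify an instruction by the mnemonic the disassembler reports for it.
std::string classify(const std::string& mnemonic) {
  if (exampleMulOps.count(mnemonic)) return "multiply";
  if (exampleDivSqrtOps.count(mnemonic)) return "divide-or-sqrt";
  return "other";
}

int main() {
  std::cout << classify("madd") << '\n';   // multiply
  std::cout << classify("fsqrt") << '\n';  // divide-or-sqrt
}
```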
Register genReg(uint16_t tag) { return {RegisterType::GENERAL, tag}; } // Generate a NZCV register identifier constexpr Register nzcvReg() { return {RegisterType::NZCV, 0}; } @@ -34,133 +89,147 @@ constexpr int32_t signExtend(uint32_t value, int currentLength) { return static_cast(value) | (negative ? mask : 0); } -/** Parses the Capstone `arm64_reg` value to generate an architectural register - * representation. +/** Parses the Capstone `aarch64_reg` value to generate an architectural + * register representation. * * WARNING: this conversion is FRAGILE, and relies on the structure of the - * `arm64_reg` enum. Updates to the Capstone library version may cause this to - * break. */ -Register csRegToRegister(arm64_reg reg) { - // Check from top of the range downwards + * `aarch64_reg` enum. Updates to the Capstone library version may cause this to + * break. + * */ +Register csRegToRegister(aarch64_reg reg) { + // Do not need check for AARCH64_REG_Vn as in Capstone, they are aliased as Qn + // (full vector) or Dn (half vector). + // As D and Q registers are also of type RegisterType::VECTOR, the outcome + // will be the same + + // Assert that reg is not a SME tile as these should be passed to + // `getZARowVectors()` + assert(reg != AARCH64_REG_ZA); + assert(!(AARCH64_REG_ZAB0 <= reg && reg <= AARCH64_REG_ZAS3)); + + // AARCH64_REG_ZT0 is a fixed with Table register, reading from the table + // register file. + if (reg == AARCH64_REG_ZT0) { + return {RegisterType::TABLE, 0}; + } + + // AARCH64_REG_Z0 -> +31 are scalable vector registers (Z) registers, reading + // from the vector file + if (AARCH64_REG_Z0 <= reg && reg <= AARCH64_REG_Z31) { + return {RegisterType::VECTOR, static_cast(reg - AARCH64_REG_Z0)}; + } - // ARM64_REG_V0 -> {end} are vector registers, reading from the vector file - if (reg >= ARM64_REG_V0) { - return {RegisterType::VECTOR, static_cast(reg - ARM64_REG_V0)}; + // AARCH64_REG_X0 -> +28 are 64-bit (X) registers, reading from the general + // file. Excludes #29 (FP) and #30 (LR) + if (AARCH64_REG_X0 <= reg && reg <= AARCH64_REG_X28) { + return {RegisterType::GENERAL, static_cast(reg - AARCH64_REG_X0)}; } - // ARM64_REG_ZAB0 -> +31 are tiles of the matrix register (ZA), reading from - // the matrix file. - if (reg >= ARM64_REG_ZAB0) { - // Placeholder value returned as each tile (what the enum represents) - // consists of multiple vectors (rows) - return {RegisterType::MATRIX, 0}; + // AARCH64_REG_W0 -> +30 are 32-bit (W) registers, reading from the general + // file. Excludes #31 (WZR/WSP). + if (AARCH64_REG_W0 <= reg && reg <= AARCH64_REG_W30) { + return {RegisterType::GENERAL, static_cast(reg - AARCH64_REG_W0)}; } - // ARM64_REG_Z0 -> +31 are scalable vector registers (Z) registers, reading - // from the vector file - if (reg >= ARM64_REG_Z0) { - return {RegisterType::VECTOR, static_cast(reg - ARM64_REG_Z0)}; + // AARCH64_REG_Q0 -> +31 are 128-bit registers representing scalar access + // specifiers on the vector registers + if (AARCH64_REG_Q0 <= reg && reg <= AARCH64_REG_Q31) { + return {RegisterType::VECTOR, static_cast(reg - AARCH64_REG_Q0)}; } - // ARM64_REG_X0 -> +28 are 64-bit (X) registers, reading from the general - // file. 
Excludes #29 (FP) and #30 (LR) - if (reg >= ARM64_REG_X0) { - return {RegisterType::GENERAL, static_cast(reg - ARM64_REG_X0)}; + // AARCH64_REG_D0 -> +31 are 64-bit registers representing scalar access + // specifiers on the vector registers + if (AARCH64_REG_D0 <= reg && reg <= AARCH64_REG_D31) { + return {RegisterType::VECTOR, static_cast(reg - AARCH64_REG_D0)}; } - // ARM64_REG_W0 -> +30 are 32-bit (W) registers, reading from the general - // file. Excludes #31 (WZR/WSP). - if (reg >= ARM64_REG_W0) { - return {RegisterType::GENERAL, static_cast(reg - ARM64_REG_W0)}; + // AARCH64_REG_S0 -> +31 are 32-bit registers representing scalar access + // specifiers on the vector registers + if (AARCH64_REG_S0 <= reg && reg <= AARCH64_REG_S31) { + return {RegisterType::VECTOR, static_cast(reg - AARCH64_REG_S0)}; } - // ARM64_REG_Q0 and above are repeated ranges representing scalar access - // specifiers on the vector registers with arrangements Q and S, each - // covering 32 registers - if (reg >= ARM64_REG_Q0) { - return {RegisterType::VECTOR, - static_cast((reg - ARM64_REG_Q0) % 32)}; + // AARCH64_REG_H0 -> +31 are 16-bit registers representing scalar access + // specifiers on the vector registers + if (AARCH64_REG_H0 <= reg && reg <= AARCH64_REG_H31) { + return {RegisterType::VECTOR, static_cast(reg - AARCH64_REG_H0)}; } - // ARM64_REG_P0 -> +15 are 256-bit (P) registers. Excludes #16 (FFR). - if (reg >= ARM64_REG_P0) { - return {RegisterType::PREDICATE, static_cast(reg - ARM64_REG_P0)}; + // AARCH64_REG_B0 -> +31 are 8-bit registers representing scalar access + // specifiers on the vector registers + if (AARCH64_REG_B0 <= reg && reg <= AARCH64_REG_B31) { + return {RegisterType::VECTOR, static_cast(reg - AARCH64_REG_B0)}; } - // ARM64_REG_Q0 and above are repeated ranges representing scalar access - // specifiers on the vector registers with arrangements B, D and H, each - // covering 32 registers - if (reg >= ARM64_REG_B0) { - return {RegisterType::VECTOR, - static_cast((reg - ARM64_REG_B0) % 32)}; + // AARCH64_REG_P0 -> +15 are 256-bit (P) "predicate-as-mask" registers. + // Excludes #16 (FFR). + // AARCH64_REG_PN0 -> +15 are 256-bit (PN) "predicate-as-counter" registers. + // Occupy same registers as (P) predicates but use a different encoding. 
+ if (AARCH64_REG_P0 <= reg && reg <= AARCH64_REG_PN15) { + return {RegisterType::PREDICATE, + static_cast(static_cast(reg - AARCH64_REG_P0) % + 16u)}; } - // ARM64_REG_WZR and _XZR are zero registers, and don't read - if (reg == ARM64_REG_WZR || reg == ARM64_REG_XZR) { - return Instruction::ZERO_REGISTER; + // AARCH64_REG_WZR and _XZR are zero registers, and don't read + if (reg == AARCH64_REG_WZR || reg == AARCH64_REG_XZR) { + return RegisterType::ZERO_REGISTER; } - // ARM64_REG_SP and _WSP are stack pointer registers, stored in r31 of the + // AARCH64_REG_SP and _WSP are stack pointer registers, stored in r31 of the // general file - if (reg == ARM64_REG_SP || reg == ARM64_REG_WSP) { + if (reg == AARCH64_REG_SP || reg == AARCH64_REG_WSP) { return {RegisterType::GENERAL, 31}; } - // ARM64_REG_NZCV is the condition flags register - if (reg == ARM64_REG_NZCV) { + // AARCH64_REG_NZCV is the condition flags register + if (reg == AARCH64_REG_NZCV) { return {RegisterType::NZCV, 0}; } - // ARM64_REG_X29 is the frame pointer, stored in r29 of the general file - if (reg == ARM64_REG_X29) { + // AARCH64_REG_X29 is the frame pointer, stored in r29 of the general file + if (reg == AARCH64_REG_X29) { return {RegisterType::GENERAL, 29}; } - // ARM64_REG_X30 is the link register, stored in r30 of the general file - if (reg == ARM64_REG_X30) { + // AARCH64_REG_X30 is the link register, stored in r30 of the general file + if (reg == AARCH64_REG_X30) { return {RegisterType::GENERAL, 30}; } - if (reg == ARM64_REG_FFR) { + if (reg == AARCH64_REG_FFR) { return {RegisterType::PREDICATE, 16}; } - // The matrix register (ZA) can also be referenced as a whole in some - // instructions. - if (reg == ARM64_REG_ZA) { - // Placeholder value returned as each tile (what the enum represents) - // consists of multiple vectors (rows) - return {RegisterType::MATRIX, 0}; - } - assert(false && "Decoding failed due to unknown register identifier"); return {std::numeric_limits::max(), std::numeric_limits::max()}; } -/** Resturns a full set of rows from the ZA matrix register that make up the +/** Returns a full set of rows from the ZA matrix register that make up the * supplied SME tile register. 
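csRegToRegister leans on Capstone numbering each register class contiguously, so the architectural tag is just the distance from the class's first enum value; getZARowVectors below then expands an SME tile name into the ZA rows that back it. The following stripped-down sketch shows both ideas with invented enum values and an inferred tile-to-row stride; treat it as an illustration of the mapping, not the real implementation.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Simplified stand-ins for Capstone's aarch64_reg values and SimEng's Register.
enum CsReg : uint16_t { REG_X0 = 100, REG_X28 = 128, REG_W0 = 200, REG_W30 = 230,
                        REG_V0 = 300, REG_V31 = 331 };
enum class RegType { GENERAL, VECTOR };
struct Register { RegType type; uint16_t tag; };

// Contiguous enum ranges map to a register file and tag by simple subtraction;
// this is exactly why the conversion is fragile across Capstone versions.
Register toRegister(CsReg reg) {
  if (REG_X0 <= reg && reg <= REG_X28) return {RegType::GENERAL, uint16_t(reg - REG_X0)};
  if (REG_W0 <= reg && reg <= REG_W30) return {RegType::GENERAL, uint16_t(reg - REG_W0)};
  if (REG_V0 <= reg && reg <= REG_V31) return {RegType::VECTOR, uint16_t(reg - REG_V0)};
  assert(false && "unknown register");
  return {RegType::GENERAL, UINT16_MAX};
}

// ZA rows backing one SME tile: with tileTypeCount tiles of a given element width
// (1 for ZAB, 2 for ZAH, 4 for ZAS, 8 for ZAD), tile `base` plausibly owns rows
// base, base + tileTypeCount, ... out of the SVL/8 implemented rows.
std::vector<uint16_t> zaTileRows(uint16_t base, uint16_t tileTypeCount, uint64_t svlBits) {
  std::vector<uint16_t> rows;
  for (uint64_t row = base; row < svlBits / 8; row += tileTypeCount)
    rows.push_back(static_cast<uint16_t>(row));
  return rows;  // e.g. SVL = 512 bits: ZAS1 -> rows 1, 5, 9, ..., 61
}
```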
*/ -std::vector getZARowVectors(arm64_reg reg, const uint64_t SVL_bits) { +std::vector getZARowVectors(aarch64_reg reg, + const uint64_t SVL_bits) { std::vector outRegs; // Get SVL in bytes (will equal total number of implemented ZA rows) uint64_t SVL = SVL_bits / 8; uint8_t base = 0; uint8_t tileTypeCount = 0; - if (reg == ARM64_REG_ZA || reg == ARM64_REG_ZAB0) { + if (reg == AARCH64_REG_ZA || reg == AARCH64_REG_ZAB0) { // Treat ZA as byte tile : ZAB0 represents whole matrix, only 1 tile // Add all rows for this SVL // Don't need to set base as will always be 0 tileTypeCount = 1; - } else if (reg >= ARM64_REG_ZAH0 && reg <= ARM64_REG_ZAH1) { - base = reg - ARM64_REG_ZAH0; + } else if (reg >= AARCH64_REG_ZAH0 && reg <= AARCH64_REG_ZAH1) { + base = reg - AARCH64_REG_ZAH0; tileTypeCount = 2; - } else if (reg >= ARM64_REG_ZAS0 && reg <= ARM64_REG_ZAS3) { - base = reg - ARM64_REG_ZAS0; + } else if (reg >= AARCH64_REG_ZAS0 && reg <= AARCH64_REG_ZAS3) { + base = reg - AARCH64_REG_ZAS0; tileTypeCount = 4; - } else if (reg >= ARM64_REG_ZAD0 && reg <= ARM64_REG_ZAD7) { - base = reg - ARM64_REG_ZAD0; + } else if (reg >= AARCH64_REG_ZAD0 && reg <= AARCH64_REG_ZAD7) { + base = reg - AARCH64_REG_ZAD0; tileTypeCount = 8; - } else if (reg >= ARM64_REG_ZAQ0 && reg <= ARM64_REG_ZAQ15) { - base = reg - ARM64_REG_ZAQ0; + } else if (reg >= AARCH64_REG_ZAQ0 && reg <= AARCH64_REG_ZAQ15) { + base = reg - AARCH64_REG_ZAQ0; tileTypeCount = 16; } @@ -180,195 +249,187 @@ std::vector getZARowVectors(arm64_reg reg, const uint64_t SVL_bits) { * DECODING LOGIC *****************/ void Instruction::decode() { - if (metadata.id == ARM64_INS_INVALID) { + if (metadata_.id == AARCH64_INS_INVALID) { exception_ = InstructionException::EncodingUnallocated; exceptionEncountered_ = true; return; } - // Extract implicit writes - for (size_t i = 0; i < metadata.implicitDestinationCount; i++) { - destinationRegisters.push_back(csRegToRegister( - static_cast(metadata.implicitDestinations[i]))); - destinationRegisterCount++; + // Extract implicit writes, including pre/post index writeback + for (size_t i = 0; i < metadata_.implicitDestinationCount; i++) { + destinationRegisters_[destinationRegisterCount_] = csRegToRegister( + static_cast(metadata_.implicitDestinations[i])); + destinationRegisterCount_++; } + // Extract implicit reads - for (size_t i = 0; i < metadata.implicitSourceCount; i++) { - sourceRegisters.push_back( - csRegToRegister(static_cast(metadata.implicitSources[i]))); - operandsPending++; - sourceRegisterCount++; + for (size_t i = 0; i < metadata_.implicitSourceCount; i++) { + // TODO: Implement FPCR usage properly + // Ignore implicit reading of FPCR + if (static_cast(metadata_.implicitSources[i]) == + AARCH64_REG_FPCR) + continue; + sourceRegisters_[sourceOperandsPending_] = + csRegToRegister(static_cast(metadata_.implicitSources[i])); + sourceRegisterCount_++; + sourceOperandsPending_++; } bool accessesMemory = false; // Extract explicit register accesses - for (size_t i = 0; i < metadata.operandCount; i++) { - const auto& op = metadata.operands[i]; - - if (op.type == ARM64_OP_REG) { // Register operand - if ((op.access & cs_ac_type::CS_AC_WRITE) && op.reg != ARM64_REG_WZR && - op.reg != ARM64_REG_XZR) { - // Determine the data type the instruction operates on based on the - // register operand used - // Belongs to the predicate group if the detsination register is a - // predicate - if (op.reg >= ARM64_REG_V0) { - isVectorData_ = true; - } else if (op.reg >= ARM64_REG_ZAB0 || op.reg == ARM64_REG_ZA) { - isSMEData_ = 
true; - } else if (op.reg >= ARM64_REG_Z0) { - isSVEData_ = true; - } else if (op.reg <= ARM64_REG_S31 && op.reg >= ARM64_REG_Q0) { - isScalarData_ = true; - } else if (op.reg <= ARM64_REG_P15 && op.reg >= ARM64_REG_P0) { - isPredicate_ = true; - } else if (op.reg <= ARM64_REG_H31 && op.reg >= ARM64_REG_B0) { - isScalarData_ = true; - } - - if ((op.reg >= ARM64_REG_ZAB0 && op.reg < ARM64_REG_V0) || - (op.reg == ARM64_REG_ZA)) { - // Add all Matrix register rows as destination operands - std::vector regs = - getZARowVectors(op.reg, architecture_.getStreamingVectorLength()); - for (int i = 0; i < regs.size(); i++) { - destinationRegisters.push_back(regs[i]); - destinationRegisterCount++; - // If WRITE, also need to add to source registers to maintain - // unaltered row values - sourceRegisters.push_back(regs[i]); - sourceRegisterCount++; - operandsPending++; + for (size_t i = 0; i < metadata_.operandCount; i++) { + const auto& op = metadata_.operands[i]; + + if (op.type == AARCH64_OP_REG) { // Register operand + if ((op.access & cs_ac_type::CS_AC_WRITE)) { + if (op.reg != AARCH64_REG_WZR && op.reg != AARCH64_REG_XZR) { + // Determine the data type the instruction operates on based on the + // register operand used + // SME and Predicate based operations use individual op.type + if (op.is_vreg) { + setInstructionType(InsnType::isVectorData); + } else if ((AARCH64_REG_Z0 <= op.reg && op.reg <= AARCH64_REG_Z31) || + op.reg == AARCH64_REG_ZT0) { + // ZT0 is an SME register, but we declare it as an SVE instruction + // due to its 1D format. + setInstructionType(InsnType::isSVEData); + } else if ((op.reg <= AARCH64_REG_S31 && op.reg >= AARCH64_REG_Q0) || + (op.reg <= AARCH64_REG_H31 && op.reg >= AARCH64_REG_B0)) { + setInstructionType(InsnType::isScalarData); } - } else { + // Add register writes to destinations, but skip zero-register // destinations - destinationRegisters.push_back(csRegToRegister(op.reg)); - destinationRegisterCount++; + destinationRegisters_[destinationRegisterCount_] = + csRegToRegister(op.reg); + destinationRegisterCount_++; } } if (op.access & cs_ac_type::CS_AC_READ) { - if ((op.reg >= ARM64_REG_ZAB0 && op.reg < ARM64_REG_V0) || - (op.reg == ARM64_REG_ZA)) { - // Add all Matrix register rows as source operands - std::vector regs = - getZARowVectors(op.reg, architecture_.getStreamingVectorLength()); - for (int i = 0; i < regs.size(); i++) { - sourceRegisters.push_back(regs[i]); - sourceRegisterCount++; - operandsPending++; - } - } else { - // Add register reads to destinations - sourceRegisters.push_back(csRegToRegister(op.reg)); - operandsPending++; - sourceRegisterCount++; + // Add register reads to destinations + sourceRegisters_[sourceRegisterCount_] = csRegToRegister(op.reg); + sourceRegisterCount_++; + sourceOperandsPending_++; + + // Identify shift operands + if (op.shift.type != aarch64_shifter::AARCH64_SFT_INVALID && + op.shift.value > 0) { + setInstructionType(InsnType::isShift); } - if (op.shift.value > 0) isNoShift_ = false; // Identify shift operands } - } else if (op.type == ARM64_OP_MEM) { // Memory operand - accessesMemory = true; - sourceRegisters.push_back(csRegToRegister(op.mem.base)); - operandsPending++; - sourceRegisterCount++; - - if (metadata.writeback) { - // Writeback instructions modify the base address - destinationRegisters.push_back(csRegToRegister(op.mem.base)); - destinationRegisterCount++; + } else if (op.type == AARCH64_OP_MEM) { // Memory operand + // Check base register exists + if (op.mem.base != AARCH64_REG_INVALID) { + accessesMemory = 
true; + sourceRegisters_[sourceRegisterCount_] = csRegToRegister(op.mem.base); + sourceRegisterCount_++; + sourceOperandsPending_++; } if (op.mem.index) { + if (op.mem.index != AARCH64_REG_INVALID) { // Register offset; add to sources - sourceRegisters.push_back(csRegToRegister(op.mem.index)); - operandsPending++; - sourceRegisterCount++; - } - } else if (op.type == ARM64_OP_SME_INDEX) { // SME instruction with index - std::vector regs; - if ((op.sme_index.reg >= ARM64_REG_ZAB0 && - op.sme_index.reg < ARM64_REG_V0) || - (op.sme_index.reg == ARM64_REG_ZA)) { - // Set instruction group - isSMEData_ = true; - regs = getZARowVectors(op.sme_index.reg, - architecture_.getStreamingVectorLength()); - // If WRITE, then also need to add to souce registers to maintain - // un-updated rows - for (int i = 0; i < regs.size(); i++) { - sourceRegisters.push_back(regs[i]); - sourceRegisterCount++; - operandsPending++; - if (op.access & cs_ac_type::CS_AC_WRITE) { - destinationRegisters.push_back(regs[i]); - destinationRegisterCount++; - } + sourceRegisters_[sourceRegisterCount_] = csRegToRegister(op.mem.index); + // Early check for WZR/XZR registers used as scalar index. Allows SME + // instructions to avoid checking all source operands later on. + if (sourceRegisters_[sourceRegisterCount_] == + RegisterType::ZERO_REGISTER) { + sourceValues_[sourceRegisterCount_] = RegisterValue(0, 8); + } else { + sourceOperandsPending_++; + } + sourceRegisterCount_++; + } + } else if (op.type == AARCH64_OP_SME) { + setInstructionType(InsnType::isSMEData); + std::vector regs = getZARowVectors( + op.sme.tile, architecture_.getStreamingVectorLength()); + // Update operands structure sizes + destinationRegisters_.addSMEOperand(regs.size()); + results_.addSMEOperand(regs.size()); + sourceRegisters_.addSMEOperand(regs.size()); + sourceValues_.addSMEOperand(regs.size()); + for (size_t i = 0; i < regs.size(); i++) { + // If READ access, we only need to add SME rows to source registers. + // If WRITE access, then we need to add SME rows to destination + // registers AND source registers. The latter is required to maintain + // any un-updated rows if an SME op specifies only + // one row (or column) to write to.
+ sourceRegisters_[sourceRegisterCount_] = regs[i]; + sourceRegisterCount_++; + sourceOperandsPending_++; if (op.access & cs_ac_type::CS_AC_WRITE) { - destinationRegisters.push_back(csRegToRegister(op.sme_index.reg)); - destinationRegisterCount++; - } else if (op.access & cs_ac_type::CS_AC_READ) { - sourceRegisters.push_back(csRegToRegister(op.sme_index.reg)); - operandsPending++; - sourceRegisterCount++; + destinationRegisters_[destinationRegisterCount_] = regs[i]; + destinationRegisterCount_++; } } - // Register that is base of index will always be a source operand - sourceRegisters.push_back(csRegToRegister(op.sme_index.base)); - operandsPending++; - sourceRegisterCount++; - } else if (op.type == ARM64_OP_REG_MRS) { - int32_t sysRegTag = architecture_.getSystemRegisterTag(op.imm); - if (sysRegTag == -1) { - exceptionEncountered_ = true; - exception_ = InstructionException::UnmappedSysReg; - // Clear any registered operands - sourceRegisterCount = 0; - destinationRegisterCount = 0; - } else { - sourceRegisters.push_back( - {RegisterType::SYSTEM, static_cast(sysRegTag)}); - sourceRegisterCount++; - operandsPending++; + if (op.sme.type == AARCH64_SME_OP_TILE_VEC) { + // SME tile has slice determined by register and immediate. + // Add base register to source operands + sourceRegisters_[sourceRegisterCount_] = + csRegToRegister(op.sme.slice_reg); + sourceRegisterCount_++; + sourceOperandsPending_++; + } + } else if (op.type == AARCH64_OP_PRED) { + if (i == 0) setInstructionType(InsnType::isPredicate); + if (op.access == CS_AC_READ) { + sourceRegisters_[sourceRegisterCount_] = csRegToRegister(op.pred.reg); + sourceRegisterCount_++; + sourceOperandsPending_++; + } + if (op.access == CS_AC_WRITE) { + destinationRegisters_[destinationRegisterCount_] = + csRegToRegister(op.pred.reg); + destinationRegisterCount_++; } - } else if (op.type == ARM64_OP_REG_MSR) { - int32_t sysRegTag = architecture_.getSystemRegisterTag(op.imm); + if (op.pred.vec_select != AARCH64_REG_INVALID) { + sourceRegisters_[sourceRegisterCount_] = + csRegToRegister(op.pred.vec_select); + sourceRegisterCount_++; + sourceOperandsPending_++; + } + } else if (op.type == AARCH64_OP_SYSREG) { + int32_t sysRegTag = + architecture_.getSystemRegisterTag(op.sysop.reg.sysreg); + // Check SYSREG is supported if (sysRegTag == -1) { exceptionEncountered_ = true; exception_ = InstructionException::UnmappedSysReg; - // Clear any registered operands - sourceRegisterCount = 0; - destinationRegisterCount = 0; - } else { - destinationRegisters.push_back( - {RegisterType::SYSTEM, static_cast(sysRegTag)}); - destinationRegisterCount++; + return; } - } else if (op.type == ARM64_OP_SVCR) { - // Updating of SVCR is done via an exception and not via the sysreg file. - // No operands are required for this operation. - // Any access to SVCR other than SMSTART and SMSTOP (i.e. this OP_TYPE) - // will result in an `unmapped system register` exception. + if (op.sysop.sub_type == AARCH64_OP_REG_MRS) { + sourceRegisters_[sourceRegisterCount_] = { + RegisterType::SYSTEM, static_cast(sysRegTag)}; + sourceRegisterCount_++; + sourceOperandsPending_++; + } else if (op.sysop.sub_type == AARCH64_OP_REG_MSR) { + destinationRegisters_[destinationRegisterCount_] = { + RegisterType::SYSTEM, static_cast(sysRegTag)}; + destinationRegisterCount_++; + } + } else if (metadata_.operands[0].type == AARCH64_OP_SYSALIAS && + metadata_.operands[0].sysop.sub_type == AARCH64_OP_SVCR) { + // This case is for instruction alias SMSTART and SMSTOP.
Updating of SVCR + // value is done via an exception so no registers required. } } // Identify branches - for (size_t i = 0; i < metadata.groupCount; i++) { - if (metadata.groups[i] == ARM64_GRP_JUMP) { - isBranch_ = true; + for (size_t i = 0; i < metadata_.groupCount; i++) { + if (metadata_.groups[i] == AARCH64_GRP_JUMP || + metadata_.groups[i] == AARCH64_GRP_CALL || + metadata_.groups[i] == AARCH64_GRP_RET || + metadata_.groups[i] == AARCH64_GRP_BRANCH_RELATIVE) { + setInstructionType(InsnType::isBranch); } } // Identify branch type - if (isBranch_) { - switch (metadata.opcode) { + if (isInstruction(InsnType::isBranch)) { + switch (metadata_.opcode) { case Opcode::AArch64_B: // b label branchType_ = BranchType::Unconditional; - knownOffset_ = metadata.operands[0].imm; + knownOffset_ = metadata_.operands[0].imm; break; case Opcode::AArch64_BR: { // br xn branchType_ = BranchType::Unconditional; @@ -376,18 +437,18 @@ void Instruction::decode() { } case Opcode::AArch64_BL: // bl #imm branchType_ = BranchType::SubroutineCall; - knownOffset_ = metadata.operands[0].imm; + knownOffset_ = metadata_.operands[0].imm; break; case Opcode::AArch64_BLR: { // blr xn branchType_ = BranchType::SubroutineCall; break; } case Opcode::AArch64_Bcc: { // b.cond label - if (metadata.operands[0].imm < 0) + if (metadata_.operands[0].imm < 0) branchType_ = BranchType::LoopClosing; else branchType_ = BranchType::Conditional; - knownOffset_ = metadata.operands[0].imm; + knownOffset_ = metadata_.operands[0].imm; break; } case Opcode::AArch64_CBNZW: // cbnz wn, #imm @@ -397,11 +458,11 @@ void Instruction::decode() { case Opcode::AArch64_CBZW: // cbz wn, #imm [[fallthrough]]; case Opcode::AArch64_CBZX: { // cbz xn, #imm - if (metadata.operands[1].imm < 0) + if (metadata_.operands[1].imm < 0) branchType_ = BranchType::LoopClosing; else branchType_ = BranchType::Conditional; - knownOffset_ = metadata.operands[1].imm; + knownOffset_ = metadata_.operands[1].imm; break; } case Opcode::AArch64_TBNZW: // tbnz wn, #imm, label @@ -411,17 +472,16 @@ void Instruction::decode() { case Opcode::AArch64_TBZW: // tbz wn, #imm, label [[fallthrough]]; case Opcode::AArch64_TBZX: { // tbz xn, #imm, label - if (metadata.operands[2].imm < 0) + if (metadata_.operands[2].imm < 0) branchType_ = BranchType::LoopClosing; else branchType_ = BranchType::Conditional; - knownOffset_ = metadata.operands[2].imm; + knownOffset_ = metadata_.operands[2].imm; break; } - case Opcode::AArch64_RET: { // ret {xr} + case Opcode::AArch64_RET: // ret {xt} branchType_ = BranchType::Return; break; - } default: break; } @@ -430,224 +490,161 @@ void Instruction::decode() { // Identify loads/stores if (accessesMemory) { // Set size of data to be stored if it hasn't already been set - if (!isMicroOp_) dataSize_ = getDataSize(metadata.operands[0]); + if (!isMicroOp_) dataSize_ = getDataSize(metadata_.operands[0]); // Check first operand access to determine if it's a load or store - if (metadata.operands[0].access & CS_AC_WRITE) { - if (metadata.id == ARM64_INS_STXR || metadata.id == ARM64_INS_STLXR) { + if (metadata_.operands[0].access & CS_AC_WRITE) { + if (metadata_.id == AARCH64_INS_STXR || + metadata_.id == AARCH64_INS_STLXR) { // Exceptions to this is load condition are exclusive store with a // success flag as first operand if (microOpcode_ != MicroOpcode::STR_DATA) { - isStoreAddress_ = true; + setInstructionType(InsnType::isStoreAddress); } if (microOpcode_ != MicroOpcode::STR_ADDR) { - isStoreData_ = true; + setInstructionType(InsnType::isStoreData); } } else 
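The branch-type switch above reduces to a few rules: direct branches (B, BL, B.cond, CBZ/CBNZ, TBZ/TBNZ) expose a known immediate offset, BL/BLR are subroutine calls, RET is a return, and a conditional branch with a negative offset is assumed to close a loop since a backwards branch is usually the bottom of one. In sketch form (standalone and simplified, not the SimEng types):

```cpp
#include <cstdint>

enum class BranchType { Unconditional, Conditional, LoopClosing, SubroutineCall, Return };

// Classify a conditional direct branch (b.cond, cbz/cbnz, tbz/tbnz) by the sign of
// its immediate: a backwards target is treated as loop-closing for prediction purposes.
BranchType classifyConditional(int64_t knownOffset) {
  return knownOffset < 0 ? BranchType::LoopClosing : BranchType::Conditional;
}

// Example: classifyConditional(-64) == BranchType::LoopClosing
//          classifyConditional(+32) == BranchType::Conditional
```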
{ - isLoad_ = true; + setInstructionType(InsnType::isLoad); } } else { if (microOpcode_ != MicroOpcode::STR_DATA) { - isStoreAddress_ = true; + setInstructionType(InsnType::isStoreAddress); } if (microOpcode_ != MicroOpcode::STR_ADDR) { - isStoreData_ = true; + setInstructionType(InsnType::isStoreData); } } // LDADD* are considered to be both a load and a store - if (metadata.id >= ARM64_INS_LDADD && metadata.id <= ARM64_INS_LDADDLH) { - isLoad_ = true; + if (Opcode::AArch64_LDADDAB <= metadata_.opcode && + metadata_.opcode <= Opcode::AArch64_LDADDX) { + setInstructionType(InsnType::isLoad); + setInstructionType(InsnType::isStoreData); } // CASAL* are considered to be both a load and a store - if (metadata.opcode == Opcode::AArch64_CASALW || - metadata.opcode == Opcode::AArch64_CASALX) { - isLoad_ = true; + if (Opcode::AArch64_CASALB <= metadata_.opcode && + metadata_.opcode <= Opcode::AArch64_CASALX) { + setInstructionType(InsnType::isLoad); + setInstructionType(InsnType::isStoreData); } - if (isStoreData_) { + if (isInstruction(InsnType::isStoreData)) { // Identify store instruction group - if (ARM64_REG_Z0 <= metadata.operands[0].reg && - metadata.operands[0].reg <= ARM64_REG_Z31) { - isSVEData_ = true; - } else if ((metadata.operands[0].reg <= ARM64_REG_S31 && - metadata.operands[0].reg >= ARM64_REG_Q0) || - (metadata.operands[0].reg <= ARM64_REG_H31 && - metadata.operands[0].reg >= ARM64_REG_B0)) { - isScalarData_ = true; - } else if (metadata.operands[0].reg >= ARM64_REG_V0) { - isVectorData_ = true; - } else if ((metadata.operands[0].reg >= ARM64_REG_ZAB0 && - metadata.operands[0].reg < ARM64_REG_V0) || - metadata.operands[0].reg == ARM64_REG_ZA) { - isSMEData_ = true; + if (AARCH64_REG_Z0 <= metadata_.operands[0].reg && + metadata_.operands[0].reg <= AARCH64_REG_Z31) { + setInstructionType(InsnType::isSVEData); + } else if ((metadata_.operands[0].reg <= AARCH64_REG_S31 && + metadata_.operands[0].reg >= AARCH64_REG_Q0) || + (metadata_.operands[0].reg <= AARCH64_REG_H31 && + metadata_.operands[0].reg >= AARCH64_REG_B0)) { + setInstructionType(InsnType::isScalarData); + } else if (metadata_.operands[0].is_vreg) { + setInstructionType(InsnType::isVectorData); + } else if ((metadata_.operands[0].reg >= AARCH64_REG_ZAB0 && + metadata_.operands[0].reg <= AARCH64_REG_ZT0) || + metadata_.operands[0].reg == AARCH64_REG_ZA) { + setInstructionType(InsnType::isSMEData); } } } else if (microOpcode_ == MicroOpcode::STR_DATA) { // Edge case for identifying store data micro-operation - isStoreData_ = true; - } - if (metadata.opcode == Opcode::AArch64_LDRXl || - metadata.opcode == Opcode::AArch64_LDRSWl) { - // Literal loads aren't flagged as having a memory operand, so these must be - // marked as loads manually - isLoad_ = true; - } - - if ((264 <= metadata.opcode && metadata.opcode <= 267) || // AND - (1063 <= metadata.opcode && metadata.opcode <= 1084) || // AND (pt.2) - (284 <= metadata.opcode && metadata.opcode <= 287) || // BIC - (1167 <= metadata.opcode && metadata.opcode <= 1183) || // BIC (pt.2) - (321 <= metadata.opcode && metadata.opcode <= 324) || // EOR/EON - (1707 <= metadata.opcode && metadata.opcode <= 1736) || // EOR/EON (pt.2) - (771 <= metadata.opcode && metadata.opcode <= 774) || // ORR/ORN - (3748 <= metadata.opcode && metadata.opcode <= 3771)) { // ORR/ORN (pt.2) - isLogical_ = true; - } - - if ((1252 <= metadata.opcode && metadata.opcode <= 1259) || - (1314 <= metadata.opcode && metadata.opcode <= 1501) || - (1778 <= metadata.opcode && metadata.opcode <= 1799) || - (1842 <= 
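Flagging the LDADD* and CASAL* ranges as both loads and stores matches their read-modify-write behaviour: a single access returns the old memory value to a register while an updated value is written back to the same location. Expressed functionally, ignoring atomicity and memory ordering:

```cpp
#include <cstdint>

// ldadd ws, wt, [xn]: wt receives the old value at [xn]; memory receives old + ws.
uint32_t ldadd(uint32_t* addr, uint32_t ws) {
  uint32_t old = *addr;  // load  -> the instruction is marked as a load
  *addr = old + ws;      // store -> the instruction is also marked as store data
  return old;            // the returned (old) value is what lands in wt
}
```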
metadata.opcode && metadata.opcode <= 1969)) { - isCompare_ = true; + setInstructionType(InsnType::isStoreData); + } + if (metadata_.opcode == Opcode::AArch64_LDRXl || + metadata_.opcode == Opcode::AArch64_LDRSWl) { + // Literal loads aren't flagged as having a memory operand, so these must + // be marked as loads manually + setInstructionType(InsnType::isLoad); + } + + // Identify Logical (bitwise) instructions + if (logicalOps.find(metadata_.mnemonic) != logicalOps.end()) { + setInstructionType(InsnType::isLogical); + } + + // Identify comparison insturctions (excluding atomic LD-CMP-STR) + if (cmpOps.find(metadata_.mnemonic) != cmpOps.end()) { + setInstructionType(InsnType::isCompare); // Capture those floating point compare instructions with no destination // register - if (sourceRegisters.size() != 0) { - if (!(isScalarData_ || isVectorData_) && - sourceRegisters[0].type == RegisterType::VECTOR) { - isScalarData_ = true; + if (sourceRegisterCount_ != 0) { + if (!(isInstruction(InsnType::isScalarData) || + isInstruction(InsnType::isVectorData)) && + sourceRegisters_[0].type == RegisterType::VECTOR) { + setInstructionType(InsnType::isScalarData); } } } - if ((347 <= metadata.opcode && metadata.opcode <= 366) || - (1142 <= metadata.opcode && metadata.opcode <= 1146) || - (1976 <= metadata.opcode && metadata.opcode <= 2186) || - (metadata.opcode == 2207) || - (782 <= metadata.opcode && metadata.opcode <= 788) || - (4063 <= metadata.opcode && metadata.opcode <= 4097) || - (898 <= metadata.opcode && metadata.opcode <= 904) || - (5608 <= metadata.opcode && metadata.opcode <= 5642)) { - isConvert_ = true; + // Identify convert instructions + if (cvtOps.find(metadata_.mnemonic) != cvtOps.end()) { + setInstructionType(InsnType::isConvert); // Capture those floating point convert instructions whose destination // register is general purpose - if (!(isScalarData_ || isVectorData_ || isSVEData_)) { - isScalarData_ = true; + if (!(isInstruction(InsnType::isScalarData) || + isInstruction(InsnType::isVectorData) || + isInstruction(InsnType::isSVEData))) { + setInstructionType(InsnType::isScalarData); } } // Identify divide or square root operations - if ((367 <= metadata.opcode && metadata.opcode <= 375) || - (789 <= metadata.opcode && metadata.opcode <= 790) || - (905 <= metadata.opcode && metadata.opcode <= 906) || - (2187 <= metadata.opcode && metadata.opcode <= 2200) || - (4098 <= metadata.opcode && metadata.opcode <= 4103) || - (5644 <= metadata.opcode && metadata.opcode <= 5649) || - (481 <= metadata.opcode && metadata.opcode <= 483) || - (metadata.opcode == 940) || - (2640 <= metadata.opcode && metadata.opcode <= 2661) || - (2665 <= metadata.opcode && metadata.opcode <= 2675) || - (6066 <= metadata.opcode && metadata.opcode <= 6068)) { - isDivideOrSqrt_ = true; + if (divsqrtOps.find(metadata_.mnemonic) != divsqrtOps.end()) { + setInstructionType(InsnType::isDivideOrSqrt); } // Identify multiply operations - if ((433 <= metadata.opcode && metadata.opcode <= 447) || // all MUL variants - (759 <= metadata.opcode && metadata.opcode <= 762) || - (816 <= metadata.opcode && metadata.opcode <= 819) || - (915 <= metadata.opcode && metadata.opcode <= 918) || - (2436 <= metadata.opcode && metadata.opcode <= 2482) || - (2512 <= metadata.opcode && metadata.opcode <= 2514) || - (2702 <= metadata.opcode && metadata.opcode <= 2704) || - (3692 <= metadata.opcode && metadata.opcode <= 3716) || - (3793 <= metadata.opcode && metadata.opcode <= 3805) || - (4352 <= metadata.opcode && metadata.opcode <= 4380) || - 
(4503 <= metadata.opcode && metadata.opcode <= 4543) || - (4625 <= metadata.opcode && metadata.opcode <= 4643) || - (5804 <= metadata.opcode && metadata.opcode <= 5832) || - (2211 <= metadata.opcode && - metadata.opcode <= 2216) || // all MADD/MAD variants - (2494 <= metadata.opcode && metadata.opcode <= 2499) || - (2699 <= metadata.opcode && metadata.opcode <= 2701) || - (3610 <= metadata.opcode && metadata.opcode <= 3615) || - (4227 == metadata.opcode) || (5682 == metadata.opcode) || - (2433 <= metadata.opcode && - metadata.opcode <= 2435) || // all MSUB variants - (2509 <= metadata.opcode && metadata.opcode <= 2511) || - (3690 <= metadata.opcode && metadata.opcode <= 3691) || - (4351 == metadata.opcode) || (5803 == metadata.opcode) || - (424 <= metadata.opcode && metadata.opcode <= 426) || // all MLA variants - (451 <= metadata.opcode && metadata.opcode <= 453) || - (1151 <= metadata.opcode && metadata.opcode <= 1160) || - (1378 <= metadata.opcode && metadata.opcode <= 1383) || - (1914 <= metadata.opcode && metadata.opcode <= 1926) || - (2341 <= metadata.opcode && metadata.opcode <= 2371) || - (2403 <= metadata.opcode && metadata.opcode <= 2404) || - (2500 <= metadata.opcode && metadata.opcode <= 2502) || - (3618 <= metadata.opcode && metadata.opcode <= 3634) || - (4295 <= metadata.opcode && metadata.opcode <= 4314) || - (4335 <= metadata.opcode && metadata.opcode <= 4336) || - (4453 <= metadata.opcode && metadata.opcode <= 4477) || - (4581 <= metadata.opcode && metadata.opcode <= 4605) || - (5749 <= metadata.opcode && metadata.opcode <= 5768) || - (5789 <= metadata.opcode && metadata.opcode <= 5790) || - (6115 <= metadata.opcode && metadata.opcode <= 6116) || - (427 <= metadata.opcode && metadata.opcode <= 429) || // all MLS variants - (454 <= metadata.opcode && metadata.opcode <= 456) || - (2372 <= metadata.opcode && metadata.opcode <= 2402) || - (2503 <= metadata.opcode && metadata.opcode <= 2505) || - (3635 <= metadata.opcode && metadata.opcode <= 3651) || - (4315 <= metadata.opcode && metadata.opcode <= 4334) || - (4478 <= metadata.opcode && metadata.opcode <= 4502) || - (4606 <= metadata.opcode && metadata.opcode <= 4624) || - (5769 <= metadata.opcode && metadata.opcode <= 5788) || - (2430 <= metadata.opcode && - metadata.opcode <= 2432) || // all MSB variants - (2506 <= metadata.opcode && metadata.opcode <= 2508) || - (3682 <= metadata.opcode && metadata.opcode <= 3685) || - (2405 <= metadata.opcode && - metadata.opcode <= 2408) || // all SME FMOPS & FMOPA variants - (4337 <= metadata.opcode && metadata.opcode <= 4340) || - (5391 <= metadata.opcode && metadata.opcode <= 5394) || - (5791 <= metadata.opcode && metadata.opcode <= 5794) || - (6117 <= metadata.opcode && metadata.opcode <= 6120)) { - isMultiply_ = true; + if (mulOps.find(metadata_.mnemonic) != mulOps.end()) { + setInstructionType(InsnType::isMultiply); } // Catch exceptions to the above identifier assignments - // Uncaught preciate assignment due to lacking destination register - if (metadata.opcode == Opcode::AArch64_PTEST_PP) { - isPredicate_ = true; + // Uncaught predicate assignment due to lacking destination register + if (metadata_.opcode == Opcode::AArch64_PTEST_PP) { + setInstructionType(InsnType::isPredicate); } // Uncaught float data assignment for FMOV move to general instructions - if (((430 <= metadata.opcode && metadata.opcode <= 432) || - (2409 <= metadata.opcode && metadata.opcode <= 2429)) && - !(isScalarData_ || isVectorData_)) { - isScalarData_ = true; + if (((Opcode::AArch64_FMOVD0 <= 
metadata_.opcode && + metadata_.opcode <= Opcode::AArch64_FMOVS0) || + (Opcode::AArch64_FMOVDXHighr <= metadata_.opcode && + metadata_.opcode <= Opcode::AArch64_FMOVXHr)) && + !(isInstruction(InsnType::isScalarData) || + isInstruction(InsnType::isVectorData))) { + setInstructionType(InsnType::isScalarData); } // Uncaught vector data assignment for SMOV and UMOV instructions - if ((4341 <= metadata.opcode && metadata.opcode <= 4350) || - (5795 <= metadata.opcode && metadata.opcode <= 5802)) { - isVectorData_ = true; + if ((Opcode::AArch64_SMOVvi16to32 <= metadata_.opcode && + metadata_.opcode <= Opcode::AArch64_SMOVvi8to64_idx0) || + (Opcode::AArch64_UMOVvi16 <= metadata_.opcode && + metadata_.opcode <= Opcode::AArch64_UMOVvi8_idx0)) { + setInstructionType(InsnType::isVectorData); } // Uncaught float data assignment for FCVT convert to general instructions - if ((1976 <= metadata.opcode && metadata.opcode <= 2186) && - !(isScalarData_ || isVectorData_)) { - isScalarData_ = true; - } - - // Allocate enough entries in results vector - results.resize(destinationRegisterCount + 1); - // Allocate enough entries in the operands vector - operands.resize(sourceRegisterCount + 1); - - // Catch zero register references and pre-complete those operands - for (uint16_t i = 0; i < sourceRegisterCount; i++) { - if (sourceRegisters[i] == Instruction::ZERO_REGISTER) { - operands[i] = RegisterValue(0, 8); - operandsPending--; + if ((Opcode::AArch64_FCVTASUWDr <= metadata_.opcode && + metadata_.opcode <= Opcode::AArch64_FCVT_ZPmZ_StoH) && + !(isInstruction(InsnType::isScalarData) || + isInstruction(InsnType::isVectorData))) { + setInstructionType(InsnType::isScalarData); + } + + if (!(isInstruction(InsnType::isSMEData))) { + // Catch zero register references and pre-complete those operands - not + // applicable to SME instructions + for (uint16_t i = 0; i < sourceRegisterCount_; i++) { + if (sourceRegisters_[i] == RegisterType::ZERO_REGISTER) { + sourceValues_[i] = RegisterValue(0, 8); + sourceOperandsPending_--; + } } + } else { + // For SME instructions, resize the following structures to have the + // exact amount of space required + sourceRegisters_.resize(sourceRegisterCount_); + destinationRegisters_.resize(destinationRegisterCount_); + sourceValues_.resize(sourceRegisterCount_); + results_.resize(destinationRegisterCount_); } } diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 8ed60eb6bf..8f4bc38142 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -10,11 +10,9 @@ #include "simeng/arch/aarch64/helpers/conditional.hh" #include "simeng/arch/aarch64/helpers/divide.hh" #include "simeng/arch/aarch64/helpers/float.hh" -#include "simeng/arch/aarch64/helpers/load.hh" #include "simeng/arch/aarch64/helpers/logical.hh" #include "simeng/arch/aarch64/helpers/multiply.hh" #include "simeng/arch/aarch64/helpers/neon.hh" -#include "simeng/arch/aarch64/helpers/store.hh" #include "simeng/arch/aarch64/helpers/sve.hh" namespace simeng { @@ -68,10 +66,10 @@ void Instruction::execute() { assert( canExecute() && "Attempted to execute an instruction before all operands were provided"); - // 0th bit of SVCR register determins if streaming-mode is enabled. - const bool SMenabled = architecture_.getSVCRval() & 1; - // 1st bit of SVCR register determins if ZA register is enabled. - const bool ZAenabled = architecture_.getSVCRval() & 2; + // 0th bit of SVCR register determines if streaming-mode is enabled. 
+ const bool SMenabled = architecture_.isStreamingModeEnabled(); + // 1st bit of SVCR register determines if ZA register is enabled. + const bool ZAenabled = architecture_.isZARegisterEnabled(); // When streaming mode is enabled, the architectural vector length goes from // SVE's VL to SME's SVL. const uint16_t VL_bits = SMenabled ? architecture_.getStreamingVectorLength() @@ -80,451 +78,637 @@ void Instruction::execute() { if (isMicroOp_) { switch (microOpcode_) { case MicroOpcode::LDR_ADDR: { - uint16_t regSize = - (isScalarData_ || isVectorData_ || isSVEData_) ? 256 : 8; + uint16_t regSize = (isInstruction(InsnType::isScalarData) || + isInstruction(InsnType::isVectorData) || + isInstruction(InsnType::isSVEData)) + ? 256 + : 8; for (size_t dest = 0; dest < getDestinationRegisters().size(); dest++) { - results[dest] = memoryData[dest].zeroExtend(dataSize_, regSize); + results_[dest] = memoryData_[dest].zeroExtend(dataSize_, regSize); } break; } case MicroOpcode::OFFSET_IMM: { - results[0] = operands[0].get() + metadata.operands[2].imm; + results_[0] = + sourceValues_[0].get() + metadata_.operands[2].imm; + break; + } + case MicroOpcode::OFFSET_REG: { + results_[0] = + sourceValues_[0].get() + sourceValues_[1].get(); break; } case MicroOpcode::STR_DATA: { setMemoryAddresses({{0, 0}}); - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } default: return executionNYI(); } } else { - switch (metadata.opcode) { + switch (metadata_.opcode) { + case Opcode::AArch64_ADDHA_MPPZ_D: { // addha zada.d, pn/m, pm/m, zn.d + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + const uint64_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t row = 0; row < rowCount; row++) { + const uint64_t* zaRow = sourceValues_[row].getAsVector(); + uint64_t out[32] = {0}; + std::memcpy(out, zaRow, rowCount * sizeof(uint64_t)); + // Slice element is active IFF all of the following conditions hold: + // - Element in 1st source pred corresponding to horizontal + // slice is TRUE + // - Corresponding element in 2nd source pred is TRUE + const uint64_t shifted_active_pn = 1ull << ((row % 8) * 8); + if (pn[row / 8] & shifted_active_pn) { + for (uint16_t elem = 0; elem < rowCount; elem++) { + const uint64_t shifted_active_pm = 1ull << ((elem % 8) * 8); + if (pm[elem / 8] & shifted_active_pm) { + out[elem] = zn[elem]; + } + } + } + results_[row] = {out, 256}; + } + break; + } + case Opcode::AArch64_ADDHA_MPPZ_S: { // addha zada.s, pn/m, pm/m, zn.s + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + const uint32_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t row = 0; row < rowCount; row++) { + const uint32_t* zaRow = sourceValues_[row].getAsVector(); + uint32_t out[64] = {0}; + std::memcpy(out, zaRow, rowCount * sizeof(uint32_t)); + // Slice element is active IFF all of the following conditions hold: + // - Element in 1st source pred corresponding to horizontal + // slice is TRUE + // - Corresponding element in 2nd source pred is 
TRUE + const uint64_t shifted_active_pn = 1ull << ((row % 16) * 4); + if (pn[row / 16] & shifted_active_pn) { + for (uint16_t elem = 0; elem < rowCount; elem++) { + const uint64_t shifted_active_pm = 1ull << ((elem % 16) * 4); + if (pm[elem / 16] & shifted_active_pm) { + out[elem] = zn[elem]; + } + } + } + results_[row] = {out, 256}; + } + break; + } + case Opcode::AArch64_ADDVA_MPPZ_D: { // addva zada.d, pn/m, pm/m, zn.d + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + const uint64_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t row = 0; row < rowCount; row++) { + const uint64_t* zaRow = sourceValues_[row].getAsVector(); + uint64_t out[32] = {0}; + std::memcpy(out, zaRow, rowCount * sizeof(uint64_t)); + // Slice element is active IFF all of the following conditions hold: + // - Corresponding element in 1st source pred is TRUE + // - Element in 2nd source pred corresponding to vertical + // slice is TRUE + const uint64_t shifted_active_pn = 1ull << ((row % 8) * 8); + if (pn[row / 8] & shifted_active_pn) { + // Corresponding slice element is active (i.e. all elements in row). + // Now check if each vertical slice (i.e. each row element) is + // active + for (uint16_t elem = 0; elem < rowCount; elem++) { + const uint64_t shifted_active_pm = 1ull << ((elem % 8) * 8); + if (pm[elem / 8] & shifted_active_pm) { + out[elem] = zn[row]; + } + } + } + results_[row] = {out, 256}; + } + break; + } + case Opcode::AArch64_ADDVA_MPPZ_S: { // addva zada.s, pn/m, pm/m, zn.s + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + const uint32_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t row = 0; row < rowCount; row++) { + const uint32_t* zaRow = sourceValues_[row].getAsVector(); + uint32_t out[64] = {0}; + std::memcpy(out, zaRow, rowCount * sizeof(uint32_t)); + // Slice element is active IFF all of the following conditions hold: + // - Corresponding element in 1st source pred is TRUE + // - Element in 2nd source pred corresponding to vertical + // slice is TRUE + const uint64_t shifted_active_pn = 1ull << ((row % 16) * 4); + if (pn[row / 16] & shifted_active_pn) { + // Corresponding slice element is active (i.e. all elements in row). + // Now check if each vertical slice (i.e. 
each row element) is + // active in 2nd pred + for (uint16_t elem = 0; elem < rowCount; elem++) { + const uint64_t shifted_active_pm = 1ull << ((elem % 16) * 4); + if (pm[elem / 16] & shifted_active_pm) { + out[elem] = zn[row]; + } + } + } + results_[row] = {out, 256}; + } + break; + } case Opcode::AArch64_ADCXr: { // adc xd, xn, xm - auto [result, nzcv] = arithmeticHelp::addCarry_3ops(operands); - results[0] = result; + auto [result, nzcv] = addCarry_3ops(sourceValues_); + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = result; break; } case Opcode::AArch64_ADDPL_XXI: { // addpl xd, xn, #imm - auto x = operands[0].get(); - auto y = static_cast(metadata.operands[2].imm); + auto x = sourceValues_[0].get(); + auto y = static_cast(metadata_.operands[2].imm); // convert PL from VL_bits const uint64_t PL = VL_bits / 64; - results[0] = x + (PL * y); + results_[0] = x + (PL * y); break; } case Opcode::AArch64_ADDPv16i8: { // addp vd.16b, vn.16b, vm.16b - results[0] = neonHelp::vecAddp_3ops(operands); + results_[0] = vecAddp_3ops(sourceValues_); break; } case Opcode::AArch64_ADDPv2i64: { // addp vd.2d, vn.2d, vm.2d - results[0] = neonHelp::vecAddp_3ops(operands); + results_[0] = vecAddp_3ops(sourceValues_); break; } case Opcode::AArch64_ADDPv2i64p: { // addp dd, vn.2d - results[0] = neonHelp::vecSumElems_2ops(operands); + results_[0] = vecSumElems_2ops(sourceValues_); break; } case Opcode::AArch64_ADDPv4i32: { // addp vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecAddp_3ops(operands); + results_[0] = vecAddp_3ops(sourceValues_); break; } case Opcode::AArch64_ADDPv8i16: { // addp vd.8h, vn.8h, vm.8h - results[0] = neonHelp::vecAddp_3ops(operands); + results_[0] = vecAddp_3ops(sourceValues_); break; } case Opcode::AArch64_ADDSWri: { // adds wd, wn, #imm{, shift} auto [result, nzcv] = - arithmeticHelp::addShift_imm(operands, metadata, true); - results[0] = nzcv; - results[1] = {result, 8}; + addShift_imm(sourceValues_, metadata_, true); + results_[0] = nzcv; + results_[1] = {result, 8}; break; } case Opcode::AArch64_ADDSWrs: { // adds wd, wn, wm{, shift} auto [result, nzcv] = - arithmeticHelp::addShift_3ops(operands, metadata, true); - results[0] = nzcv; - results[1] = {result, 8}; + addShift_3ops(sourceValues_, metadata_, true); + results_[0] = nzcv; + results_[1] = {result, 8}; break; } case Opcode::AArch64_ADDSWrx: { // adds wd, wn, wm{, extend {#amount}} auto [result, nzcv] = - arithmeticHelp::addExtend_3ops(operands, metadata, true); - results[0] = nzcv; - results[1] = {result, 8}; + addExtend_3ops(sourceValues_, metadata_, true); + results_[0] = nzcv; + results_[1] = {result, 8}; break; } case Opcode::AArch64_ADDSXri: { // adds xd, xn, #imm{, shift} auto [result, nzcv] = - arithmeticHelp::addShift_imm(operands, metadata, true); - results[0] = nzcv; - results[1] = result; + addShift_imm(sourceValues_, metadata_, true); + results_[0] = nzcv; + results_[1] = result; break; } case Opcode::AArch64_ADDSXrs: { // adds xd, xn, xm{, shift} auto [result, nzcv] = - arithmeticHelp::addShift_3ops(operands, metadata, true); - results[0] = nzcv; - results[1] = result; + addShift_3ops(sourceValues_, metadata_, true); + results_[0] = nzcv; + results_[1] = result; break; } case Opcode::AArch64_ADDSXrx: // adds xd, xn, wm{, extend {#amount}} case Opcode::AArch64_ADDSXrx64: { // adds xd, xn, xm{, extend {#amount}} auto [result, nzcv] = - arithmeticHelp::addExtend_3ops(operands, metadata, true); - results[0] = nzcv; - results[1] = RegisterValue(result, 8); + addExtend_3ops(sourceValues_, 
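The ADDHA/ADDVA helpers above all follow the same pattern: an element of a ZA row is only updated when its governing predicate bits are set, and every other element is copied through from the old row, which is why the rows are sourced as well as written. SVE predicates carry one bit per vector byte packed into 64-bit chunks, so for elements of width w bytes the test is on bit elem*w, which is what the shifted_active_* masks compute. A standalone sketch of both steps (illustrative only, not SimEng's helper API):

```cpp
#include <cstdint>
#include <vector>

// True if element `elem` of width `esizeBytes` (4 for .s, 8 for .d) is active in a
// predicate stored as 64-bit chunks with one bit per vector byte. For 32-bit elements
// this reduces to pred[elem / 16] & (1ull << ((elem % 16) * 4)), matching the
// shifted_active_* expressions used above.
bool isElementActive(const uint64_t* pred, uint64_t elem, uint64_t esizeBytes) {
  const uint64_t bit = elem * esizeBytes;
  return pred[bit / 64] & (1ull << (bit % 64));
}

// Merge-style update of one ZA row: start from the old row so inactive elements are
// preserved, then overwrite only the predicated elements.
std::vector<uint32_t> updateRow(const std::vector<uint32_t>& oldRow,
                                const std::vector<uint32_t>& newData,
                                const uint64_t* pm) {
  std::vector<uint32_t> out = oldRow;
  for (uint64_t elem = 0; elem < out.size(); elem++)
    if (isElementActive(pm, elem, 4)) out[elem] = newData[elem];
  return out;
}
```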
metadata_, true); + results_[0] = nzcv; + results_[1] = RegisterValue(result, 8); break; } case Opcode::AArch64_ADDVL_XXI: { // addvl xd, xn, #imm - auto x = operands[0].get(); - auto y = static_cast(metadata.operands[2].imm); + auto x = sourceValues_[0].get(); + auto y = static_cast(metadata_.operands[2].imm); // convert VL from LEN (number of 128-bits) to bytes const uint64_t VL = VL_bits / 8; - results[0] = x + (VL * y); + results_[0] = x + (VL * y); + break; + } + case Opcode::AArch64_ADDVv4i16v: { // addv hd, vn.4h + results_[0] = vecSumElems_2ops(sourceValues_); + break; + } + case Opcode::AArch64_ADDVv4i32v: { // addv sd, vn.4s + results_[0] = vecSumElems_2ops(sourceValues_); break; } case Opcode::AArch64_ADDVv8i8v: { // addv bd, vn.8b - results[0] = neonHelp::vecSumElems_2ops(operands); + results_[0] = vecSumElems_2ops(sourceValues_); break; } case Opcode::AArch64_ADDWri: { // add wd, wn, #imm{, shift} auto [result, nzcv] = - arithmeticHelp::addShift_imm(operands, metadata, false); - results[0] = {result, 8}; + addShift_imm(sourceValues_, metadata_, false); + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = {result, 8}; break; } case Opcode::AArch64_ADDWrs: { // add wd, wn, wm{, shift #amount} auto [result, nzcv] = - arithmeticHelp::addShift_3ops(operands, metadata, false); - results[0] = {result, 8}; + addShift_3ops(sourceValues_, metadata_, false); + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = {result, 8}; break; } case Opcode::AArch64_ADDWrx: { // add wd, wn, wm{, extend #amount} auto [result, nzcv] = - arithmeticHelp::addExtend_3ops(operands, metadata, false); - results[0] = {result, 8}; + addExtend_3ops(sourceValues_, metadata_, false); + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = {result, 8}; break; } case Opcode::AArch64_ADDXri: { // add xd, xn, #imm{, shift} auto [result, nzcv] = - arithmeticHelp::addShift_imm(operands, metadata, false); - results[0] = result; + addShift_imm(sourceValues_, metadata_, false); + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = result; break; } case Opcode::AArch64_ADDXrs: { // add xd, xn, xm, {shift #amount} auto [result, nzcv] = - arithmeticHelp::addShift_3ops(operands, metadata, false); - results[0] = result; + addShift_3ops(sourceValues_, metadata_, false); + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = result; break; } case Opcode::AArch64_ADDXrx: // add xd, xn, wm{, extend {#amount}} case Opcode::AArch64_ADDXrx64: { // add xd, xn, xm{, extend {#amount}} auto [result, nzcv] = - arithmeticHelp::addExtend_3ops(operands, metadata, false); - results[0] = result; + addExtend_3ops(sourceValues_, metadata_, false); + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = result; + break; + } + case Opcode::AArch64_ADD_ZI_B: { // add zdn.b, zdn.b, imm{, shift} + results_[0] = sveAdd_imm(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_ADD_ZI_D: { // add zdn.d, zdn.d, imm{, shift} + results_[0] = sveAdd_imm(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_ADD_ZI_H: { // add zdn.h zdn.h, imm{, shift} + results_[0] = sveAdd_imm(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_ADD_ZI_S: { // add zdn.s, zdn.s, imm{, shift} + results_[0] = sveAdd_imm(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_ADD_ZPmZ_B: { // add zdn.b, pg/m, zdn.b, zm.b - results[0] = sveHelp::sveAddPredicated_vecs(operands, VL_bits); + results_[0] = 
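The addShift_*/addExtend_* helpers return both the arithmetic result and the NZCV flags so the flag-setting ADDS forms and the plain ADD forms can share one implementation; the plain forms simply discard the flags, hence the (void)nzcv casts added above to silence unused-variable warnings on GCC 7. A minimal sketch of such a helper for 64-bit operands follows; the flag packing shown is this sketch's own convention, not SimEng's representation.

```cpp
#include <cstdint>
#include <utility>

// Returns {result, nzcv} for a 64-bit add; nzcv packed here as N|Z|C|V in bits 3..0.
std::pair<uint64_t, uint8_t> add3ops(uint64_t n, uint64_t m) {
  const uint64_t res = n + m;
  const bool N = res >> 63;                       // negative
  const bool Z = (res == 0);                      // zero
  const bool C = res < n;                         // unsigned carry-out
  const bool V = (~(n ^ m) & (n ^ res)) >> 63;    // signed overflow
  return {res, static_cast<uint8_t>((N << 3) | (Z << 2) | (C << 1) | V)};
}

// An ADDS-style opcode would keep both values; the plain ADD form keeps only .first.
```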
sveAddPredicated_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_ADD_ZPmZ_D: { // add zdn.d, pg/m, zdn.d, zm.d - results[0] = - sveHelp::sveAddPredicated_vecs(operands, VL_bits); + results_[0] = sveAddPredicated_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_ADD_ZPmZ_H: { // add zdn.h, pg/m, zdn.h, zm.h - results[0] = - sveHelp::sveAddPredicated_vecs(operands, VL_bits); + results_[0] = sveAddPredicated_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_ADD_ZPmZ_S: { // add zdn.s, pg/m, zdn.s, zm.s - results[0] = - sveHelp::sveAddPredicated_vecs(operands, VL_bits); + results_[0] = sveAddPredicated_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_ADD_ZZZ_B: { // add zd.b, zn.b, zm.b - results[0] = sveHelp::sveAdd_3ops(operands, VL_bits); + results_[0] = sveAdd_3ops(sourceValues_, VL_bits); break; } case Opcode::AArch64_ADD_ZZZ_D: { // add zd.d, zn.d, zm.d - results[0] = sveHelp::sveAdd_3ops(operands, VL_bits); + results_[0] = sveAdd_3ops(sourceValues_, VL_bits); break; } case Opcode::AArch64_ADD_ZZZ_H: { // add zd.h, zn.h, zm.h - results[0] = sveHelp::sveAdd_3ops(operands, VL_bits); + results_[0] = sveAdd_3ops(sourceValues_, VL_bits); break; } case Opcode::AArch64_ADD_ZZZ_S: { // add zd.s, zn.s, zm.s - results[0] = sveHelp::sveAdd_3ops(operands, VL_bits); + results_[0] = sveAdd_3ops(sourceValues_, VL_bits); break; } case Opcode::AArch64_ADDv16i8: { // add vd.16b, vn.16b, vm.16b - results[0] = neonHelp::vecAdd_3ops(operands); + results_[0] = vecAdd_3ops(sourceValues_); break; } case Opcode::AArch64_ADDv1i64: { // add dd, dn, dm - results[0] = neonHelp::vecAdd_3ops(operands); + results_[0] = vecAdd_3ops(sourceValues_); break; } case Opcode::AArch64_ADDv2i32: { // add vd.2s, vn.2s, vm.2s - results[0] = neonHelp::vecAdd_3ops(operands); + results_[0] = vecAdd_3ops(sourceValues_); break; } case Opcode::AArch64_ADDv2i64: { // add vd.2d, vn.2d, vm.2d - results[0] = neonHelp::vecAdd_3ops(operands); + results_[0] = vecAdd_3ops(sourceValues_); break; } case Opcode::AArch64_ADDv4i16: { // add vd.4h, vn.4h, vm.4h - results[0] = neonHelp::vecAdd_3ops(operands); + results_[0] = vecAdd_3ops(sourceValues_); break; } case Opcode::AArch64_ADDv4i32: { // add vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecAdd_3ops(operands); + results_[0] = vecAdd_3ops(sourceValues_); break; } case Opcode::AArch64_ADDv8i16: { // add vd.8h, vn.8h, vm.8h - results[0] = neonHelp::vecAdd_3ops(operands); + results_[0] = vecAdd_3ops(sourceValues_); break; } case Opcode::AArch64_ADDv8i8: { // add vd.8b, vn.8b, vm.8b - results[0] = neonHelp::vecAdd_3ops(operands); + results_[0] = vecAdd_3ops(sourceValues_); break; } case Opcode::AArch64_ADR: { // adr xd, #imm - results[0] = instructionAddress_ + metadata.operands[1].imm; + results_[0] = instructionAddress_ + metadata_.operands[1].imm; break; } case Opcode::AArch64_ADRP: { // adrp xd, #imm // Clear lowest 12 bits of address and add immediate (already shifted by // decoder) - results[0] = - (instructionAddress_ & ~(0xFFF)) + metadata.operands[1].imm; + results_[0] = + (instructionAddress_ & ~(0xFFF)) + metadata_.operands[1].imm; break; } case Opcode::AArch64_ADR_LSL_ZZZ_D_0: // adr zd.d, [zn.d, zm.d] case Opcode::AArch64_ADR_LSL_ZZZ_D_1: // adr zd.d, [zn.d, zm.d, lsl #1] case Opcode::AArch64_ADR_LSL_ZZZ_D_2: // adr zd.d, [zn.d, zm.d, lsl #2] case Opcode::AArch64_ADR_LSL_ZZZ_D_3: { // adr zd.d, [zn.d, zm.d, lsl #3] - results[0] = sveHelp::sveAdr_packedOffsets(operands, metadata, - VL_bits); + results_[0] = + sveAdr_packedOffsets(sourceValues_, 
metadata_, VL_bits); break; } case Opcode::AArch64_ADR_LSL_ZZZ_S_0: // adr zd.s, [zn.s, zm.s] case Opcode::AArch64_ADR_LSL_ZZZ_S_1: // adr zd.s, [zn.s, zm.s, lsl #1] case Opcode::AArch64_ADR_LSL_ZZZ_S_2: // adr zd.s, [zn.s, zm.s, lsl #2] case Opcode::AArch64_ADR_LSL_ZZZ_S_3: { // adr zd.s, [zn.s, zm.s, lsl #3] - results[0] = sveHelp::sveAdr_packedOffsets(operands, metadata, - VL_bits); + results_[0] = + sveAdr_packedOffsets(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_ANDSWri: { // ands wd, wn, #imm - auto [result, nzcv] = logicalHelp::logicOp_imm( - operands, metadata, true, + auto [result, nzcv] = logicOp_imm( + sourceValues_, metadata_, true, [](uint32_t x, uint32_t y) -> uint32_t { return x & y; }); - results[0] = nzcv; - results[1] = {result, 8}; + results_[0] = nzcv; + results_[1] = {result, 8}; break; } case Opcode::AArch64_ANDSWrs: { // ands wd, wn, wm{, shift #amount} - auto [result, nzcv] = logicalHelp::logicOpShift_3ops( - operands, metadata, true, + auto [result, nzcv] = logicOpShift_3ops( + sourceValues_, metadata_, true, [](uint32_t x, uint32_t y) -> uint32_t { return x & y; }); - results[0] = nzcv; - results[1] = {result, 8}; + results_[0] = nzcv; + results_[1] = {result, 8}; break; } case Opcode::AArch64_ANDSXri: { // ands xd, xn, #imm - auto [result, nzcv] = logicalHelp::logicOp_imm( - operands, metadata, true, + auto [result, nzcv] = logicOp_imm( + sourceValues_, metadata_, true, [](uint64_t x, uint64_t y) -> uint64_t { return x & y; }); - results[0] = nzcv; - results[1] = result; + results_[0] = nzcv; + results_[1] = result; break; } case Opcode::AArch64_ANDSXrs: { // ands xd, xn, xm{, shift #amount} - auto [result, nzcv] = logicalHelp::logicOpShift_3ops( - operands, metadata, true, + auto [result, nzcv] = logicOpShift_3ops( + sourceValues_, metadata_, true, [](uint64_t x, uint64_t y) -> uint64_t { return x & y; }); - results[0] = nzcv; - results[1] = result; + results_[0] = nzcv; + results_[1] = result; break; } case Opcode::AArch64_ANDWri: { // and wd, wn, #imm - auto [result, nzcv] = logicalHelp::logicOp_imm( - operands, metadata, false, + auto [result, nzcv] = logicOp_imm( + sourceValues_, metadata_, false, [](uint32_t x, uint32_t y) -> uint32_t { return x & y; }); - results[0] = {result, 8}; + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = {result, 8}; break; } case Opcode::AArch64_ANDWrs: { // and wd, wn, wm{, shift #amount} - auto [result, nzcv] = logicalHelp::logicOpShift_3ops( - operands, metadata, false, + auto [result, nzcv] = logicOpShift_3ops( + sourceValues_, metadata_, false, [](uint32_t x, uint32_t y) -> uint32_t { return x & y; }); - results[0] = {result, 8}; + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = {result, 8}; break; } case Opcode::AArch64_ANDXri: { // and xd, xn, #imm - auto [result, nzcv] = logicalHelp::logicOp_imm( - operands, metadata, false, + auto [result, nzcv] = logicOp_imm( + sourceValues_, metadata_, false, [](uint64_t x, uint64_t y) -> uint64_t { return x & y; }); - results[0] = result; + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = result; break; } case Opcode::AArch64_ANDXrs: { // and xd, xn, xm{, shift #amount} - auto [result, nzcv] = logicalHelp::logicOpShift_3ops( - operands, metadata, false, + auto [result, nzcv] = logicOpShift_3ops( + sourceValues_, metadata_, false, [](uint64_t x, uint64_t y) -> uint64_t { return x & y; }); - results[0] = result; + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = result; 
break; } case Opcode::AArch64_AND_PPzPP: { // and pd.b, pg/z, pn.b, pm.b - results[0] = sveHelp::sveLogicOp_preds( - operands, VL_bits, + results_[0] = sveLogicOp_preds( + sourceValues_, VL_bits, [](uint64_t x, uint64_t y) -> uint64_t { return x & y; }); break; } case Opcode::AArch64_AND_ZI: { // and zdn, zdn, #imm - const uint64_t* dn = operands[0].getAsVector(); - const uint64_t imm = static_cast(metadata.operands[2].imm); + const uint64_t* dn = sourceValues_[0].getAsVector(); + const uint64_t imm = static_cast(metadata_.operands[2].imm); const uint16_t partition_num = VL_bits / 64; uint64_t out[32] = {0}; for (int i = 0; i < partition_num; i++) { out[i] = dn[i] & imm; } - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_AND_ZPmZ_B: { // and zdn.b, pg/m, zdn.b, zm.b - results[0] = sveHelp::sveLogicOpPredicated_3vecs( - operands, VL_bits, + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, [](uint8_t x, uint8_t y) -> uint8_t { return x & y; }); break; } case Opcode::AArch64_AND_ZPmZ_D: { // and zdn.d, pg/m, zdn.d, zm.d - results[0] = sveHelp::sveLogicOpPredicated_3vecs( - operands, VL_bits, + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, [](uint64_t x, uint64_t y) -> uint64_t { return x & y; }); break; } case Opcode::AArch64_AND_ZPmZ_H: { // and zdn.h, pg/m, zdn.h, zm.h - results[0] = sveHelp::sveLogicOpPredicated_3vecs( - operands, VL_bits, + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, [](uint16_t x, uint16_t y) -> uint16_t { return x & y; }); break; } case Opcode::AArch64_AND_ZPmZ_S: { // and zdn.s, pg/m, zdn.s, zm.s - results[0] = sveHelp::sveLogicOpPredicated_3vecs( - operands, VL_bits, + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, [](uint32_t x, uint32_t y) -> uint32_t { return x & y; }); break; } case Opcode::AArch64_ANDv16i8: { // and vd.16b, vn.16b, vm.16b - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](uint8_t x, uint8_t y) -> uint8_t { return x & y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, + [](uint8_t x, uint8_t y) -> uint8_t { return x & y; }); break; } case Opcode::AArch64_ANDv8i8: { // and vd.8b, vn.8b, vm.8b - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](uint8_t x, uint8_t y) -> uint8_t { return x & y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, + [](uint8_t x, uint8_t y) -> uint8_t { return x & y; }); break; } case Opcode::AArch64_ASRVWr: { // asrv wd, wn, wm - results[0] = {logicalHelp::asrv_3gpr(operands), 8}; + results_[0] = {asrv_3gpr(sourceValues_), 8}; break; } case Opcode::AArch64_ASRVXr: { // asrv xd, xn, xm - results[0] = logicalHelp::asrv_3gpr(operands); + results_[0] = asrv_3gpr(sourceValues_); break; } case Opcode::AArch64_B: { // b label branchTaken_ = true; - branchAddress_ = instructionAddress_ + metadata.operands[0].imm; + branchAddress_ = instructionAddress_ + metadata_.operands[0].imm; break; } case Opcode::AArch64_BFMWri: { // bfm wd, wn, #immr, #imms - results[0] = { - bitmanipHelp::bfm_2imms(operands, metadata, false, false), - 8}; + results_[0] = { + bfm_2imms(sourceValues_, metadata_, false, false), 8}; break; } case Opcode::AArch64_BFMXri: { // bfm xd, xn, #immr, #imms - results[0] = - bitmanipHelp::bfm_2imms(operands, metadata, false, false); + results_[0] = + bfm_2imms(sourceValues_, metadata_, false, false); break; } case Opcode::AArch64_BICSWrs: { // bics wd, wn, wm{, shift #amount} auto [result, nzcv] = - logicalHelp::bicShift_3ops(operands, metadata, true); - results[0] = nzcv; 
- results[1] = {result, 8}; + bicShift_3ops(sourceValues_, metadata_, true); + results_[0] = nzcv; + results_[1] = {result, 8}; break; } case Opcode::AArch64_BICSXrs: { // bics xd, xn, xm{, shift #amount} auto [result, nzcv] = - logicalHelp::bicShift_3ops(operands, metadata, true); - results[0] = nzcv; - results[1] = result; + bicShift_3ops(sourceValues_, metadata_, true); + results_[0] = nzcv; + results_[1] = result; break; } case Opcode::AArch64_BICWrs: { // bic wd, wn, wm{, shift #amount} auto [result, nzcv] = - logicalHelp::bicShift_3ops(operands, metadata, false); - results[0] = {result, 8}; + bicShift_3ops(sourceValues_, metadata_, false); + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = {result, 8}; break; } case Opcode::AArch64_BICXrs: { // bic xd, xn, xm{, shift #amount} auto [result, nzcv] = - logicalHelp::bicShift_3ops(operands, metadata, false); - results[0] = result; + bicShift_3ops(sourceValues_, metadata_, false); + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = result; break; } case Opcode::AArch64_BICv16i8: { // bic vd.16b, vn.16b, vm.16b - results[0] = neonHelp::vecBic_3ops(operands); + results_[0] = vecBic_3ops(sourceValues_); break; } case Opcode::AArch64_BICv4i32: { // bic vd.4s, #imm{, lsl #shift} - results[0] = neonHelp::vecBicShift_imm(operands, metadata); + results_[0] = vecBicShift_imm(sourceValues_, metadata_); break; } case Opcode::AArch64_BICv8i16: { // bic vd.8h, #imm{, lsl #shift} - results[0] = neonHelp::vecBicShift_imm(operands, metadata); + results_[0] = vecBicShift_imm(sourceValues_, metadata_); break; } case Opcode::AArch64_BICv8i8: { // bic vd.8b, vn.8b, vm.8b - results[0] = neonHelp::vecBic_3ops(operands); + results_[0] = vecBic_3ops(sourceValues_); break; } case Opcode::AArch64_BIFv16i8: { // bif vd.16b, vn.16b, vm.16b - results[0] = neonHelp::vecBitwiseInsert<16>(operands, true); + results_[0] = vecBitwiseInsert<16>(sourceValues_, true); break; } case Opcode::AArch64_BITv16i8: { // bit vd.16b, vn.16b, vm.16b - results[0] = neonHelp::vecBitwiseInsert<16>(operands, false); + results_[0] = vecBitwiseInsert<16>(sourceValues_, false); break; } case Opcode::AArch64_BITv8i8: { // bit vd.8b, vn.8b, vm.8b - results[0] = neonHelp::vecBitwiseInsert<8>(operands, false); + results_[0] = vecBitwiseInsert<8>(sourceValues_, false); break; } case Opcode::AArch64_BL: { // bl #imm branchTaken_ = true; - branchAddress_ = instructionAddress_ + metadata.operands[0].imm; - results[0] = static_cast(instructionAddress_ + 4); + branchAddress_ = instructionAddress_ + metadata_.operands[0].imm; + results_[0] = static_cast(instructionAddress_ + 4); break; } case Opcode::AArch64_BLR: { // blr xn branchTaken_ = true; - branchAddress_ = operands[0].get(); - results[0] = static_cast(instructionAddress_ + 4); + branchAddress_ = sourceValues_[0].get(); + results_[0] = static_cast(instructionAddress_ + 4); break; } case Opcode::AArch64_BR: { // br xn branchTaken_ = true; - branchAddress_ = operands[0].get(); + branchAddress_ = sourceValues_[0].get(); break; } case Opcode::AArch64_BRK: { @@ -532,13 +716,13 @@ void Instruction::execute() { break; } case Opcode::AArch64_BSLv16i8: { // bsl vd.16b, vn.16b, vm.16b - results[0] = neonHelp::vecBsl<16>(operands); + results_[0] = vecBsl<16>(sourceValues_); break; } case Opcode::AArch64_Bcc: { // b.cond label - if (AuxFunc::conditionHolds(metadata.cc, operands[0].get())) { + if (conditionHolds(metadata_.cc, sourceValues_[0].get())) { branchTaken_ = true; - branchAddress_ = instructionAddress_ + 
metadata.operands[0].imm; + branchAddress_ = instructionAddress_ + metadata_.operands[0].imm; } else { branchTaken_ = false; branchAddress_ = instructionAddress_ + 4; @@ -547,432 +731,433 @@ void Instruction::execute() { } case Opcode::AArch64_CASALW: { // casal ws, wt, [xn|sp] // LOAD / STORE - const uint32_t s = operands[0].get(); - const uint32_t t = operands[1].get(); - const uint32_t n = memoryData[0].get(); - if (n == s) memoryData[0] = t; + const uint32_t s = sourceValues_[0].get(); + const uint32_t t = sourceValues_[1].get(); + const uint32_t n = memoryData_[0].get(); + if (n == s) memoryData_[0] = t; break; } case Opcode::AArch64_CASALX: { // casal xs, xt, [xn|sp] // LOAD / STORE - const uint64_t s = operands[0].get(); - const uint64_t t = operands[1].get(); - const uint64_t n = memoryData[0].get(); - if (n == s) memoryData[0] = t; + const uint64_t s = sourceValues_[0].get(); + const uint64_t t = sourceValues_[1].get(); + const uint64_t n = memoryData_[0].get(); + if (n == s) memoryData_[0] = t; break; } case Opcode::AArch64_CBNZW: { // cbnz wn, #imm - auto [taken, addr] = conditionalHelp::condBranch_cmpToZero( - operands, metadata, instructionAddress_, + auto [taken, addr] = condBranch_cmpToZero( + sourceValues_, metadata_, instructionAddress_, [](uint32_t x) -> bool { return x != 0; }); branchTaken_ = taken; branchAddress_ = addr; break; } case Opcode::AArch64_CBNZX: { // cbnz xn, #imm - auto [taken, addr] = conditionalHelp::condBranch_cmpToZero( - operands, metadata, instructionAddress_, + auto [taken, addr] = condBranch_cmpToZero( + sourceValues_, metadata_, instructionAddress_, [](uint64_t x) -> bool { return x != 0; }); branchTaken_ = taken; branchAddress_ = addr; break; } case Opcode::AArch64_CBZW: { // cbz wn, #imm - auto [taken, addr] = conditionalHelp::condBranch_cmpToZero( - operands, metadata, instructionAddress_, + auto [taken, addr] = condBranch_cmpToZero( + sourceValues_, metadata_, instructionAddress_, [](uint32_t x) -> bool { return x == 0; }); branchTaken_ = taken; branchAddress_ = addr; break; } case Opcode::AArch64_CBZX: { // cbz xn, #imm - auto [taken, addr] = conditionalHelp::condBranch_cmpToZero( - operands, metadata, instructionAddress_, + auto [taken, addr] = condBranch_cmpToZero( + sourceValues_, metadata_, instructionAddress_, [](uint64_t x) -> bool { return x == 0; }); branchTaken_ = taken; branchAddress_ = addr; break; } case Opcode::AArch64_CCMNWi: { // ccmn wn, #imm, #nzcv, cc - results[0] = conditionalHelp::ccmn_imm(operands, metadata); + results_[0] = ccmn_imm(sourceValues_, metadata_); break; } case Opcode::AArch64_CCMNXi: { // ccmn xn, #imm, #nzcv, cc - results[0] = conditionalHelp::ccmn_imm(operands, metadata); + results_[0] = ccmn_imm(sourceValues_, metadata_); break; } case Opcode::AArch64_CCMPWi: { // ccmp wn, #imm, #nzcv, cc - results[0] = conditionalHelp::ccmp_imm(operands, metadata); + results_[0] = ccmp_imm(sourceValues_, metadata_); break; } case Opcode::AArch64_CCMPWr: { // ccmp wn, wm, #nzcv, cc - results[0] = conditionalHelp::ccmp_reg(operands, metadata); + results_[0] = ccmp_reg(sourceValues_, metadata_); break; } case Opcode::AArch64_CCMPXi: { // ccmp xn, #imm, #nzcv, cc - results[0] = conditionalHelp::ccmp_imm(operands, metadata); + results_[0] = ccmp_imm(sourceValues_, metadata_); break; } case Opcode::AArch64_CCMPXr: { // ccmp xn, xm, #nzcv, cc - results[0] = conditionalHelp::ccmp_reg(operands, metadata); + results_[0] = ccmp_reg(sourceValues_, metadata_); break; } case Opcode::AArch64_CLZXr: { // clz xd, xn - results[0] = 
arithmeticHelp::clz_reg(operands); + results_[0] = clz_reg(sourceValues_); break; } case Opcode::AArch64_CMEQv16i8: { // cmeq vd.16b, vn.16b, vm.16b - results[0] = neonHelp::vecCompare( - operands, false, + results_[0] = vecCompare( + sourceValues_, false, [](uint8_t x, uint8_t y) -> bool { return (x == y); }); break; } case Opcode::AArch64_CMEQv16i8rz: { // cmeq vd.16b, vn.16b, #0 - results[0] = neonHelp::vecCompare( - operands, true, + results_[0] = vecCompare( + sourceValues_, true, [](uint8_t x, uint8_t y) -> bool { return (x == y); }); break; } case Opcode::AArch64_CMEQv4i32: { // cmeq vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecCompare( - operands, false, + results_[0] = vecCompare( + sourceValues_, false, [](uint32_t x, uint32_t y) -> bool { return (x == y); }); break; } case Opcode::AArch64_CMEQv8i8: { // cmeq vd.8b, vn.8b, vm.8b - results[0] = neonHelp::vecCompare( - operands, false, + results_[0] = vecCompare( + sourceValues_, false, [](int8_t x, int8_t y) -> bool { return (x == y); }); break; } case Opcode::AArch64_CMEQv8i8rz: { // cmeq vd.8b, vn.8b, #0 - results[0] = neonHelp::vecCompare( - operands, true, + results_[0] = vecCompare( + sourceValues_, true, [](int8_t x, int8_t y) -> bool { return (x == y); }); break; } case Opcode::AArch64_CMHIv4i32: { // cmhi vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecCompare( - operands, false, + results_[0] = vecCompare( + sourceValues_, false, [](uint32_t x, uint32_t y) -> bool { return (x > y); }); break; } case Opcode::AArch64_CMHSv16i8: { // cmhs vd.16b, vn.16b, vm.16b - results[0] = neonHelp::vecCompare( - operands, false, - [](int8_t x, int8_t y) -> bool { return (x >= y); }); + results_[0] = vecCompare( + sourceValues_, false, + [](uint8_t x, uint8_t y) -> bool { return (x >= y); }); break; } case Opcode::AArch64_CMPEQ_PPzZI_B: { // cmpeq pd.b, pg/z, zn.b, #imm - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, true, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, true, [](uint8_t x, uint8_t y) -> bool { return x == y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPEQ_PPzZI_D: { // cmpeq pd.d, pg/z, zn.d, #imm - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, true, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, true, [](uint64_t x, uint64_t y) -> bool { return x == y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPEQ_PPzZI_H: { // cmpeq pd.h, pg/z, zn.h, #imm - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, true, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, true, [](uint16_t x, uint16_t y) -> bool { return x == y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPEQ_PPzZI_S: { // cmpeq pd.s, pg/z, zn.s, #imm - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, true, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, true, [](uint32_t x, uint32_t y) -> bool { return x == y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPEQ_PPzZZ_B: { // cmpeq pd.b, pg/z, zn.b, zm.b - auto [output, nzcv] = 
sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, false, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, [](uint8_t x, uint8_t y) -> bool { return x == y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPEQ_PPzZZ_D: { // cmpeq pd.d, pg/z, zn.d, zm.d - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, false, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, [](uint64_t x, uint64_t y) -> bool { return x == y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPEQ_PPzZZ_H: { // cmpeq pd.h, pg/z, zn.h, zm.h - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, false, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, [](uint16_t x, uint16_t y) -> bool { return x == y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPEQ_PPzZZ_S: { // cmpeq pd.s, pg/z, zn.s, zm.s - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, false, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, [](uint32_t x, uint32_t y) -> bool { return x == y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPGT_PPzZZ_B: { // cmpgt pd.b, pg/z, zn.b, zm.b - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, false, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, [](int8_t x, int8_t y) -> bool { return x > y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPGT_PPzZZ_D: { // cmpgt pd.d, pg/z, zn.d, zm.d - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, false, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, [](int64_t x, int64_t y) -> bool { return x > y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPGT_PPzZZ_H: { // cmpgt pd.h, pg/z, zn.h, zm.h - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, false, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, [](int16_t x, int16_t y) -> bool { return x > y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPGT_PPzZZ_S: { // cmpgt pd.s, pg/z, zn.s, zm.s - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, false, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, [](int32_t x, int32_t y) -> bool { return x > y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPHI_PPzZZ_B: { // cmphi pd.b, pg/z, zn.b, zm.b - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, false, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, [](uint8_t x, uint8_t y) -> bool { return x > y; }); - results[0] = nzcv; - results[1] = 
output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPHI_PPzZZ_D: { // cmphi pd.d, pg/z, zn.d, zm.d - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, false, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, [](uint64_t x, uint64_t y) -> bool { return x > y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPHI_PPzZZ_H: { // cmphi pd.h, pg/z, zn.h, zm.h - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, false, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, [](uint16_t x, uint16_t y) -> bool { return x > y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPHI_PPzZZ_S: { // cmphi pd.s, pg/z, zn.s, zm.s - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, false, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, [](uint32_t x, uint32_t y) -> bool { return x > y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPNE_PPzZI_B: { // cmpne pd.b, pg/z. zn.b, #imm - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, true, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, true, [](int8_t x, int8_t y) -> bool { return x != y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPNE_PPzZI_D: { // cmpne pd.d, pg/z. zn.d, #imm - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, true, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, true, [](int64_t x, int64_t y) -> bool { return x != y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPNE_PPzZI_H: { // cmpne pd.h, pg/z. zn.h, #imm - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, true, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, true, [](int16_t x, int16_t y) -> bool { return x != y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPNE_PPzZI_S: { // cmpne pd.s, pg/z. 
zn.s, #imm - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, true, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, true, [](int32_t x, int32_t y) -> bool { return x != y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPNE_PPzZZ_B: { // cmpne pd.b, pg/z, zn.b, zm.b - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, false, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, [](int8_t x, int8_t y) -> bool { return x != y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPNE_PPzZZ_D: { // cmpne pd.d, pg/z, zn.d, zm.d - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, false, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, [](int64_t x, int64_t y) -> bool { return x != y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPNE_PPzZZ_H: { // cmpne pd.h, pg/z, zn.h, zm.h - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, false, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, [](int16_t x, int16_t y) -> bool { return x != y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CMPNE_PPzZZ_S: { // cmpne pd.s, pg/z, zn.s, zm.s - auto [output, nzcv] = sveHelp::sveCmpPredicated_toPred( - operands, metadata, VL_bits, false, + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, [](int32_t x, int32_t y) -> bool { return x != y; }); - results[0] = nzcv; - results[1] = output; + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_CNTB_XPiI: { // cntb xd{, pattern{, #imm}} - results[0] = sveHelp::sveCnt_gpr(metadata, VL_bits); + results_[0] = sveCnt_gpr(metadata_, VL_bits); break; } case Opcode::AArch64_CNTD_XPiI: { // cntd xd{, pattern{, #imm}} - results[0] = sveHelp::sveCnt_gpr(metadata, VL_bits); + results_[0] = sveCnt_gpr(metadata_, VL_bits); break; } case Opcode::AArch64_CNTH_XPiI: { // cnth xd{, pattern{, #imm}} - results[0] = sveHelp::sveCnt_gpr(metadata, VL_bits); + results_[0] = sveCnt_gpr(metadata_, VL_bits); break; } case Opcode::AArch64_CNTP_XPP_B: { // cntp xd, pg, pn.b - results[0] = sveHelp::sveCntp(operands, VL_bits); + results_[0] = sveCntp(sourceValues_, VL_bits); break; } case Opcode::AArch64_CNTP_XPP_D: { // cntp xd, pg, pn.d - results[0] = sveHelp::sveCntp(operands, VL_bits); + results_[0] = sveCntp(sourceValues_, VL_bits); break; } case Opcode::AArch64_CNTP_XPP_H: { // cntp xd, pg, pn.h - results[0] = sveHelp::sveCntp(operands, VL_bits); + results_[0] = sveCntp(sourceValues_, VL_bits); break; } case Opcode::AArch64_CNTP_XPP_S: { // cntp xd, pg, pn.s - results[0] = sveHelp::sveCntp(operands, VL_bits); + results_[0] = sveCntp(sourceValues_, VL_bits); break; } case Opcode::AArch64_CNTW_XPiI: { // cntw xd{, pattern{, #imm}} - results[0] = sveHelp::sveCnt_gpr(metadata, VL_bits); + results_[0] = sveCnt_gpr(metadata_, VL_bits); break; } case Opcode::AArch64_CNTv8i8: { // cnt vd.8b, vn.8b - results[0] = neonHelp::vecCountPerByte(operands); + results_[0] = vecCountPerByte(sourceValues_); break; } case 
Opcode::AArch64_CPY_ZPzI_B: { // cpy zd.b, pg/z, #imm{, shift} - results[0] = sveHelp::sveCpy_imm(operands, metadata, VL_bits); + results_[0] = sveCpy_imm(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_CPY_ZPzI_D: { // cpy zd.d, pg/z, #imm{, shift} - results[0] = sveHelp::sveCpy_imm(operands, metadata, VL_bits); + results_[0] = sveCpy_imm(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_CPY_ZPzI_H: { // cpy zd.h, pg/z, #imm{, shift} - results[0] = sveHelp::sveCpy_imm(operands, metadata, VL_bits); + results_[0] = sveCpy_imm(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_CPY_ZPzI_S: { // cpy zd.s, pg/z, #imm{, shift} - results[0] = sveHelp::sveCpy_imm(operands, metadata, VL_bits); + results_[0] = sveCpy_imm(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_DUPi32: { // dup vd, vn.s[index] - results[0] = - neonHelp::vecDup_gprOrIndex(operands, metadata, false); + results_[0] = + vecDup_gprOrIndex(sourceValues_, metadata_, false); break; } case Opcode::AArch64_DUPi64: { // dup vd, vn.d[index] - results[0] = - neonHelp::vecDup_gprOrIndex(operands, metadata, false); + results_[0] = + vecDup_gprOrIndex(sourceValues_, metadata_, false); break; } case Opcode::AArch64_CSELWr: { // csel wd, wn, wm, cc - results[0] = { - conditionalHelp::cs_4ops( - operands, metadata, [](uint32_t x) -> uint32_t { return x; }), + results_[0] = { + cs_4ops(sourceValues_, metadata_, + [](uint32_t x) -> uint32_t { return x; }), 8}; break; } case Opcode::AArch64_CSELXr: { // csel xd, xn, xm, cc - results[0] = conditionalHelp::cs_4ops( - operands, metadata, [](uint64_t x) -> uint64_t { return x; }); + results_[0] = cs_4ops( + sourceValues_, metadata_, [](uint64_t x) -> uint64_t { return x; }); break; } case Opcode::AArch64_CSINCWr: { // csinc wd, wn, wm, cc - results[0] = {conditionalHelp::cs_4ops( - operands, metadata, - [](uint32_t x) -> uint32_t { return x + 1; }), - 8}; + results_[0] = { + cs_4ops(sourceValues_, metadata_, + [](uint32_t x) -> uint32_t { return x + 1; }), + 8}; break; } case Opcode::AArch64_CSINCXr: { // csinc xd, xn, xm, cc - results[0] = conditionalHelp::cs_4ops( - operands, metadata, [](uint64_t x) -> uint64_t { return x + 1; }); + results_[0] = + cs_4ops(sourceValues_, metadata_, + [](uint64_t x) -> uint64_t { return x + 1; }); break; } case Opcode::AArch64_CSINVWr: { // csinv wd, wn, wm, cc - results[0] = { - conditionalHelp::cs_4ops( - operands, metadata, [](uint32_t x) -> uint32_t { return ~x; }), + results_[0] = { + cs_4ops(sourceValues_, metadata_, + [](uint32_t x) -> uint32_t { return ~x; }), 8}; break; } case Opcode::AArch64_CSINVXr: { // csinv xd, xn, xm, cc - results[0] = conditionalHelp::cs_4ops( - operands, metadata, [](uint64_t x) -> uint64_t { return ~x; }); + results_[0] = + cs_4ops(sourceValues_, metadata_, + [](uint64_t x) -> uint64_t { return ~x; }); break; } case Opcode::AArch64_CSNEGWr: { // csneg wd, wn, wm, cc - results[0] = { - conditionalHelp::cs_4ops( - operands, metadata, [](int32_t x) -> int32_t { return -x; }), + results_[0] = { + cs_4ops(sourceValues_, metadata_, + [](int32_t x) -> int32_t { return -x; }), 8}; break; } case Opcode::AArch64_CSNEGXr: { // csneg xd, xn, xm, cc - results[0] = conditionalHelp::cs_4ops( - operands, metadata, [](uint64_t x) -> uint64_t { return -x; }); + results_[0] = + cs_4ops(sourceValues_, metadata_, + [](uint64_t x) -> uint64_t { return -x; }); break; } case Opcode::AArch64_DECB_XPiI: { // decb xdn{, pattern{, MUL #imm}} - results[0] = - sveHelp::sveDec_scalar(operands, 
metadata, VL_bits); + results_[0] = sveDec_scalar(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_DECD_XPiI: { // decd xdn{, pattern{, MUL #imm}} - results[0] = - sveHelp::sveDec_scalar(operands, metadata, VL_bits); + results_[0] = sveDec_scalar(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_DMB: { // dmb option|#imm @@ -980,65 +1165,65 @@ void Instruction::execute() { break; } case Opcode::AArch64_DUPM_ZI: { // dupm zd.t, #imm - const uint64_t imm = static_cast(metadata.operands[1].imm); + const uint64_t imm = static_cast(metadata_.operands[1].imm); uint64_t out[32] = {0}; for (int i = 0; i < (VL_bits / 64); i++) { out[i] = imm; } - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_DUP_ZI_B: { // dup zd.b, #imm{, shift} - results[0] = sveHelp::sveDup_immOrScalar(operands, metadata, - VL_bits, true); + results_[0] = + sveDup_immOrScalar(sourceValues_, metadata_, VL_bits, true); break; } case Opcode::AArch64_DUP_ZI_D: { // dup zd.d, #imm{, shift} - results[0] = sveHelp::sveDup_immOrScalar(operands, metadata, - VL_bits, true); + results_[0] = sveDup_immOrScalar(sourceValues_, metadata_, + VL_bits, true); break; } case Opcode::AArch64_DUP_ZI_H: { // dup zd.h, #imm{, shift} - results[0] = sveHelp::sveDup_immOrScalar(operands, metadata, - VL_bits, true); + results_[0] = sveDup_immOrScalar(sourceValues_, metadata_, + VL_bits, true); break; } case Opcode::AArch64_DUP_ZI_S: { // dup zd.s, #imm{, shift} - results[0] = sveHelp::sveDup_immOrScalar(operands, metadata, - VL_bits, true); + results_[0] = sveDup_immOrScalar(sourceValues_, metadata_, + VL_bits, true); break; } case Opcode::AArch64_DUP_ZR_B: { // dup zd.b, wn - results[0] = sveHelp::sveDup_immOrScalar(operands, metadata, - VL_bits, false); + results_[0] = sveDup_immOrScalar(sourceValues_, metadata_, + VL_bits, false); break; } case Opcode::AArch64_DUP_ZR_D: { // dup zd.d, xn - results[0] = sveHelp::sveDup_immOrScalar(operands, metadata, - VL_bits, false); + results_[0] = sveDup_immOrScalar(sourceValues_, metadata_, + VL_bits, false); break; } case Opcode::AArch64_DUP_ZR_H: { // dup zd.h, wn - results[0] = sveHelp::sveDup_immOrScalar(operands, metadata, - VL_bits, false); + results_[0] = sveDup_immOrScalar(sourceValues_, metadata_, + VL_bits, false); break; } case Opcode::AArch64_DUP_ZR_S: { // dup zd.s, wn - results[0] = sveHelp::sveDup_immOrScalar(operands, metadata, - VL_bits, false); + results_[0] = sveDup_immOrScalar(sourceValues_, metadata_, + VL_bits, false); break; } case Opcode::AArch64_DUP_ZZI_D: { // dup zd.d, zn.d[#imm] - results[0] = - sveHelp::sveDup_vecIndexed(operands, metadata, VL_bits); + results_[0] = + sveDup_vecIndexed(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_DUP_ZZI_Q: { // dup zd.q, zn.q[#imm] // No data-type for quadwords, but as data is just being moved around we // can use uint64_t. 
const uint16_t index = - 2 * static_cast(metadata.operands[1].vector_index); - const uint64_t* n = operands[0].getAsVector(); + 2 * static_cast(metadata_.operands[1].vector_index); + const uint64_t* n = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 128; uint64_t out[32] = {0}; @@ -1051,748 +1236,1091 @@ void Instruction::execute() { out[2 * i + 1] = elementLo; // Copy over lower half of quadword } } - results[0] = out; + results_[0] = out; break; } case Opcode::AArch64_DUP_ZZI_S: { // dup zd.s, zn.s[#imm] - results[0] = - sveHelp::sveDup_vecIndexed(operands, metadata, VL_bits); + results_[0] = + sveDup_vecIndexed(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_DUPv16i8gpr: { // dup vd.16b, wn - results[0] = - neonHelp::vecDup_gprOrIndex(operands, metadata, true); + results_[0] = + vecDup_gprOrIndex(sourceValues_, metadata_, true); break; } case Opcode::AArch64_DUPv2i32gpr: { // dup vd.2s, wn - results[0] = - neonHelp::vecDup_gprOrIndex(operands, metadata, true); + results_[0] = + vecDup_gprOrIndex(sourceValues_, metadata_, true); break; } case Opcode::AArch64_DUPv2i32lane: { // dup vd.2s, vn.s[index] - results[0] = - neonHelp::vecDup_gprOrIndex(operands, metadata, false); + results_[0] = + vecDup_gprOrIndex(sourceValues_, metadata_, false); break; } case Opcode::AArch64_DUPv2i64gpr: { // dup vd.2d, xn - results[0] = - neonHelp::vecDup_gprOrIndex(operands, metadata, true); + results_[0] = + vecDup_gprOrIndex(sourceValues_, metadata_, true); break; } case Opcode::AArch64_DUPv2i64lane: { // dup vd.2d, vn.d[index] - results[0] = - neonHelp::vecDup_gprOrIndex(operands, metadata, false); + results_[0] = + vecDup_gprOrIndex(sourceValues_, metadata_, false); break; } case Opcode::AArch64_DUPv4i16gpr: { // dup vd.4h, wn - results[0] = - neonHelp::vecDup_gprOrIndex(operands, metadata, true); + results_[0] = + vecDup_gprOrIndex(sourceValues_, metadata_, true); break; } case Opcode::AArch64_DUPv4i32gpr: { // dup vd.4s, wn - results[0] = - neonHelp::vecDup_gprOrIndex(operands, metadata, true); + results_[0] = + vecDup_gprOrIndex(sourceValues_, metadata_, true); break; } case Opcode::AArch64_DUPv4i32lane: { // dup vd.4s, vn.s[index] - results[0] = - neonHelp::vecDup_gprOrIndex(operands, metadata, false); + results_[0] = + vecDup_gprOrIndex(sourceValues_, metadata_, false); break; } case Opcode::AArch64_DUPv8i16gpr: { // dup vd.8h, wn - results[0] = - neonHelp::vecDup_gprOrIndex(operands, metadata, true); + results_[0] = + vecDup_gprOrIndex(sourceValues_, metadata_, true); break; } case Opcode::AArch64_EORWri: { // eor wd, wn, #imm - auto [result, nzcv] = logicalHelp::logicOp_imm( - operands, metadata, false, + auto [result, nzcv] = logicOp_imm( + sourceValues_, metadata_, false, [](uint32_t x, uint32_t y) -> uint32_t { return x ^ y; }); - results[0] = {result, 8}; + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = {result, 8}; break; } case Opcode::AArch64_EORWrs: { // eor wd, wn, wm{, shift #imm} - auto [result, nzcv] = logicalHelp::logicOpShift_3ops( - operands, metadata, false, + auto [result, nzcv] = logicOpShift_3ops( + sourceValues_, metadata_, false, [](uint32_t x, uint32_t y) -> uint32_t { return x ^ y; }); - results[0] = {result, 8}; + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = {result, 8}; break; } case Opcode::AArch64_EORXri: { // eor xd, xn, #imm - auto [result, nzcv] = logicalHelp::logicOp_imm( - operands, metadata, false, + auto [result, nzcv] = logicOp_imm( + sourceValues_, metadata_, false, 
[](uint64_t x, uint64_t y) -> uint64_t { return x ^ y; }); - results[0] = result; + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = result; break; } case Opcode::AArch64_EORXrs: { // eor xd, xn, xm{, shift #amount} - auto [result, nzcv] = logicalHelp::logicOpShift_3ops( - operands, metadata, false, + auto [result, nzcv] = logicOpShift_3ops( + sourceValues_, metadata_, false, [](uint64_t x, uint64_t y) -> uint64_t { return x ^ y; }); - results[0] = result; + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = result; break; } case Opcode::AArch64_EOR_PPzPP: { - results[0] = sveHelp::sveLogicOp_preds( - operands, VL_bits, + results_[0] = sveLogicOp_preds( + sourceValues_, VL_bits, [](uint64_t x, uint64_t y) -> uint64_t { return x ^ y; }); break; } case Opcode::AArch64_EOR_ZPmZ_B: { // eor zdn.b, pg/m, zdn.b, zm.b - results[0] = sveHelp::sveLogicOpPredicated_3vecs( - operands, VL_bits, + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, [](uint8_t x, uint8_t y) -> uint8_t { return x ^ y; }); break; } case Opcode::AArch64_EOR_ZPmZ_D: { // eor zdn.d, pg/m, zdn.d, zm.d - results[0] = sveHelp::sveLogicOpPredicated_3vecs( - operands, VL_bits, + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, [](uint64_t x, uint64_t y) -> uint64_t { return x ^ y; }); break; } case Opcode::AArch64_EOR_ZPmZ_H: { // eor zdn.h, pg/m, zdn.h, zm.h - results[0] = sveHelp::sveLogicOpPredicated_3vecs( - operands, VL_bits, + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, [](uint16_t x, uint16_t y) -> uint16_t { return x ^ y; }); break; } case Opcode::AArch64_EOR_ZPmZ_S: { // eor zdn.s, pg/m, zdn.s, zm.s - results[0] = sveHelp::sveLogicOpPredicated_3vecs( - operands, VL_bits, + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, [](uint32_t x, uint32_t y) -> uint32_t { return x ^ y; }); break; } + case Opcode::AArch64_EOR_ZZZ: { // eor zd.d, zn.d, zm.d + results_[0] = sveLogicOpUnPredicated_3vecs( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> uint64_t { return x ^ y; }); + break; + } case Opcode::AArch64_EORv16i8: { // eor vd.16b, vn.16b, vm.16b - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](uint8_t x, uint8_t y) -> uint8_t { return x ^ y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, + [](uint8_t x, uint8_t y) -> uint8_t { return x ^ y; }); break; } case Opcode::AArch64_EORv8i8: { // eor vd.8b, vn.8b, vm.8b - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](uint8_t x, uint8_t y) -> uint8_t { return x ^ y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, + [](uint8_t x, uint8_t y) -> uint8_t { return x ^ y; }); + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_H_B: { // mova zd.b, pg/m, zanh.b[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 8; + const uint8_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + const uint8_t* zaRow = + sourceValues_[2 + sliceNum].getAsVector(); + + uint8_t out[256] = {0}; + for (int elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << (elem % 64); + if (pg[elem / 64] & shifted_active) + out[elem] = zaRow[elem]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; + 
break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_H_D: { // mova zd.d, pg/m, zanh.d[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint64_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + const uint64_t* zaRow = + sourceValues_[2 + sliceNum].getAsVector(); + + uint64_t out[32] = {0}; + for (int elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 8) * 8); + if (pg[elem / 8] & shifted_active) + out[elem] = zaRow[elem]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_H_H: { // mova zd.h, pg/m, zanh.h[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 16; + const uint16_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + const uint16_t* zaRow = + sourceValues_[2 + sliceNum].getAsVector(); + + uint16_t out[128] = {0}; + for (int elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 32) * 2); + if (pg[elem / 32] & shifted_active) + out[elem] = zaRow[elem]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_H_Q: { // mova zd.q, pg/m, zanh.q[ws] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 128; + // Use uint64_t as no 128-bit + const uint64_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + sourceValues_[2 + rowCount].get() % rowCount; + // Use uint64_t as no 128-bit + const uint64_t* zaRow = + sourceValues_[2 + sliceNum].getAsVector(); + + // Use uint64_t as no 128-bit + uint64_t out[32] = {0}; + for (int elem = 0; elem < rowCount; elem++) { + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((elem % 4) * 16); + if (pg[elem / 4] & shifted_active) { + // Need to move two consecutive 64-bit elements + out[2 * elem] = zaRow[2 * elem]; + out[2 * elem + 1] = zaRow[2 * elem + 1]; + } else { + // Need to move two consecutive 64-bit elements + out[2 * elem] = zd[2 * elem]; + out[2 * elem + 1] = zd[2 * elem + 1]; + } + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_H_S: { // mova zd.s, pg/m, zanh.s[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint32_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + const uint32_t* zaRow = + sourceValues_[2 + sliceNum].getAsVector(); + + uint32_t out[64] = {0}; + for (int 
elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 16) * 4); + if (pg[elem / 16] & shifted_active) + out[elem] = zaRow[elem]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_V_B: { // mova zd.b, pg/m, zanv.b[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 8; + const uint8_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + + uint8_t out[256] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << (elem % 64); + if (pg[elem / 64] & shifted_active) + out[elem] = + sourceValues_[2 + elem].getAsVector()[sliceNum]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_V_D: { // mova zd.d, pg/m, zanv.d[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint64_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + + uint64_t out[32] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 8) * 8); + if (pg[elem / 8] & shifted_active) + out[elem] = + sourceValues_[2 + elem].getAsVector()[sliceNum]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_V_H: { // mova zd.h, pg/m, zanv.h[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 16; + const uint16_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + + uint16_t out[128] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 32) * 2); + if (pg[elem / 32] & shifted_active) + out[elem] = + sourceValues_[2 + elem].getAsVector()[sliceNum]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_V_Q: { // mova zd.q, pg/m, zanv.q[ws] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 128; + // Use uint64_t as no 128-bit + const uint64_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + sourceValues_[2 + rowCount].get() % rowCount; + + // Use uint64_t as no 128-bit + uint64_t out[32] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((elem % 4) * 16); + if (pg[elem / 4] & shifted_active) { + // Need to move two consecutive 64-bit 
elements + const uint64_t* zaRow = + sourceValues_[2 + elem].getAsVector(); + out[2 * elem] = zaRow[2 * sliceNum]; + out[2 * elem + 1] = zaRow[2 * sliceNum + 1]; + } else { + // Need to move two consecutive 64-bit elements + out[2 * elem] = zd[2 * elem]; + out[2 * elem + 1] = zd[2 * elem + 1]; + } + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_V_S: { // mova zd.s, pg/m, zanv.s[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint32_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + + uint32_t out[64] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 16) * 4); + if (pg[elem / 16] & shifted_active) + out[elem] = + sourceValues_[2 + elem].getAsVector()[sliceNum]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; break; } case Opcode::AArch64_EXTRWrri: { // extr wd, wn, wm, #lsb - results[0] = { - bitmanipHelp::extrLSB_registers(operands, metadata), 8}; + results_[0] = {extrLSB_registers(sourceValues_, metadata_), + 8}; break; } case Opcode::AArch64_EXTRXrri: { // extr xd, xn, xm, #lsb - results[0] = - bitmanipHelp::extrLSB_registers(operands, metadata); + results_[0] = extrLSB_registers(sourceValues_, metadata_); break; } case Opcode::AArch64_EXTv16i8: { // ext vd.16b, vn.16b, vm.16b, #index - results[0] = - neonHelp::vecExtVecs_index(operands, metadata); + results_[0] = vecExtVecs_index(sourceValues_, metadata_); break; } case Opcode::AArch64_EXTv8i8: { // ext vd.8b, vn.8b, vm.8b, #index - results[0] = neonHelp::vecExtVecs_index(operands, metadata); + results_[0] = vecExtVecs_index(sourceValues_, metadata_); + break; + } + case Opcode::AArch64_FABDv2f64: { // fabd vd.2d, vn.2d, vm.2d + results_[0] = vecFabd(sourceValues_); + break; + } + case Opcode::AArch64_FABDv4f32: { // fabd vd.4s, vn.4s, vm.4s + results_[0] = vecFabd(sourceValues_); break; } case Opcode::AArch64_FABD32: { // fabd sd, sn, sm - results[0] = floatHelp::fabd_3ops(operands); + results_[0] = fabd_3ops(sourceValues_); break; } case Opcode::AArch64_FABD64: { // fabd dd, dn, dm - results[0] = floatHelp::fabd_3ops(operands); + results_[0] = fabd_3ops(sourceValues_); break; } case Opcode::AArch64_FABSDr: { // fabs dd, dn - results[0] = floatHelp::fabs_2ops(operands); + results_[0] = fabs_2ops(sourceValues_); break; } case Opcode::AArch64_FABSSr: { // fabs sd, sn - results[0] = floatHelp::fabs_2ops(operands); + results_[0] = fabs_2ops(sourceValues_); break; } case Opcode::AArch64_FABS_ZPmZ_D: { // fabs zd.d, pg/m, zn.d - results[0] = sveHelp::sveFabsPredicated(operands, VL_bits); + results_[0] = sveFabsPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FABS_ZPmZ_S: { // fabs zd.s, pg/m, zn.s - results[0] = sveHelp::sveFabsPredicated(operands, VL_bits); + results_[0] = sveFabsPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FABSv2f64: { // fabs vd.2d, vn.2d - results[0] = neonHelp::vecFabs_2ops(operands); + results_[0] = vecFabs_2ops(sourceValues_); break; } case Opcode::AArch64_FABSv4f32: { // fabs vd.4s, vn.4s - results[0] = neonHelp::vecFabs_2ops(operands); + results_[0] = vecFabs_2ops(sourceValues_); break; } case Opcode::AArch64_FADDA_VPZ_D: { // 
fadda dd, pg/m, dn, zm.d - results[0] = sveHelp::sveFaddaPredicated(operands, VL_bits); + results_[0] = sveFaddaPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FADDA_VPZ_S: { // fadda sd, pg/m, sn, zm.s - results[0] = sveHelp::sveFaddaPredicated(operands, VL_bits); + results_[0] = sveFaddaPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FADDDrr: { // fadd dd, dn, dm - results[0] = {arithmeticHelp::add_3ops(operands), 256}; + results_[0] = {add_3ops(sourceValues_), 256}; break; } case Opcode::AArch64_FADDPv2f32: { // faddp vd.2s, vn.2s, vm.2s - results[0] = neonHelp::vecAddp_3ops(operands); + results_[0] = vecAddp_3ops(sourceValues_); break; } case Opcode::AArch64_FADDPv2f64: { // faddp vd.2d, vn.2d, vm.2d - results[0] = neonHelp::vecAddp_3ops(operands); + results_[0] = vecAddp_3ops(sourceValues_); break; } case Opcode::AArch64_FADDPv2i32p: { // faddp dd, vn.2s - results[0] = neonHelp::vecSumElems_2ops(operands); + results_[0] = vecSumElems_2ops(sourceValues_); break; } case Opcode::AArch64_FADDPv2i64p: { // faddp dd, vn.2d - results[0] = neonHelp::vecSumElems_2ops(operands); + results_[0] = vecSumElems_2ops(sourceValues_); break; } case Opcode::AArch64_FADDPv4f32: { // faddp vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecAddp_3ops(operands); + results_[0] = vecAddp_3ops(sourceValues_); break; } case Opcode::AArch64_FADDSrr: { // fadd sd, sn, sm - results[0] = {arithmeticHelp::add_3ops(operands), 256}; + results_[0] = {add_3ops(sourceValues_), 256}; break; } case Opcode::AArch64_FADD_ZPmI_D: { // fadd zdn.d, pg/m, zdn.d, const - results[0] = sveHelp::sveAddPredicated_const(operands, metadata, - VL_bits); + results_[0] = + sveAddPredicated_const(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_FADD_ZPmI_S: { // fadd zdn.s, pg/m, zdn.s, const - results[0] = - sveHelp::sveAddPredicated_const(operands, metadata, VL_bits); + results_[0] = + sveAddPredicated_const(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_FADD_ZPmZ_D: { // fadd zdn.d, pg/m, zdn.d, zm.d - results[0] = sveHelp::sveAddPredicated_vecs(operands, VL_bits); + results_[0] = sveAddPredicated_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_FADD_ZPmZ_S: { // fadd zdn.s, pg/m, zdn.s, zm.s - results[0] = sveHelp::sveAddPredicated_vecs(operands, VL_bits); + results_[0] = sveAddPredicated_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_FADD_ZZZ_D: { // fadd zd.d, zn.d, zm.d - results[0] = sveHelp::sveAdd_3ops(operands, VL_bits); + results_[0] = sveAdd_3ops(sourceValues_, VL_bits); break; } case Opcode::AArch64_FADD_ZZZ_S: { // fadd zd.s, zn.s, zm.s - results[0] = sveHelp::sveAdd_3ops(operands, VL_bits); + results_[0] = sveAdd_3ops(sourceValues_, VL_bits); break; } case Opcode::AArch64_FADDv2f32: { // fadd vd.2s, vn.2s, vm.2s - results[0] = neonHelp::vecAdd_3ops(operands); + results_[0] = vecAdd_3ops(sourceValues_); break; } case Opcode::AArch64_FADDv2f64: { // fadd vd.2d, vn.2d, vm.2d - results[0] = neonHelp::vecAdd_3ops(operands); + results_[0] = vecAdd_3ops(sourceValues_); break; } case Opcode::AArch64_FADDv4f32: { // fadd vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecAdd_3ops(operands); + results_[0] = vecAdd_3ops(sourceValues_); break; } case Opcode::AArch64_FCADD_ZPmZ_D: { // fcadd zdn.d, pg/m, zdn.d, zm.d, // #imm - results[0] = - sveHelp::sveFcaddPredicated(operands, metadata, VL_bits); + results_[0] = + sveFcaddPredicated(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_FCCMPDrr: // fccmp sn, sm, #nzcv, cc case 
Opcode::AArch64_FCCMPEDrr: { // fccmpe sn, sm, #nzcv, cc - results[0] = floatHelp::fccmp(operands, metadata); + results_[0] = fccmp(sourceValues_, metadata_); break; } case Opcode::AArch64_FCCMPESrr: { // fccmpe sn, sm, #nzcv, cc - results[0] = floatHelp::fccmp(operands, metadata); + results_[0] = fccmp(sourceValues_, metadata_); break; } case Opcode::AArch64_FCCMPSrr: { // fccmp sn, sm, #nzcv, cc - results[0] = floatHelp::fccmp(operands, metadata); + results_[0] = fccmp(sourceValues_, metadata_); break; } case Opcode::AArch64_FCMEQv2i32rz: { // fcmeq vd.2s, vd.2s, #0.0 - results[0] = neonHelp::vecFCompare( - operands, true, [](float x, float y) -> bool { return x == y; }); + results_[0] = vecFCompare( + sourceValues_, true, + [](float x, float y) -> bool { return x == y; }); break; } case Opcode::AArch64_FCMEQv4i32rz: { // fcmeq vd.4s vn.4s, #0.0 - results[0] = neonHelp::vecFCompare( - operands, true, [](float x, float y) -> bool { return x == y; }); + results_[0] = vecFCompare( + sourceValues_, true, + [](float x, float y) -> bool { return x == y; }); break; } case Opcode::AArch64_FCMGE_PPzZ0_D: { // fcmge pd.d, pg/z, zn.d, #0.0 - results[0] = sveHelp::sveComparePredicated_vecsToPred( - operands, metadata, VL_bits, true, + results_[0] = sveComparePredicated_vecsToPred( + sourceValues_, metadata_, VL_bits, true, [](double x, double y) -> bool { return x >= y; }); break; } case Opcode::AArch64_FCMGE_PPzZ0_S: { // fcmge pd.s, pg/z, zn.s, #0.0 - results[0] = sveHelp::sveComparePredicated_vecsToPred( - operands, metadata, VL_bits, true, + results_[0] = sveComparePredicated_vecsToPred( + sourceValues_, metadata_, VL_bits, true, [](float x, float y) -> bool { return x >= y; }); break; } case Opcode::AArch64_FCMGE_PPzZZ_D: { // fcmge pd.d, pg/z, zn.d, zm.d - results[0] = sveHelp::sveComparePredicated_vecsToPred( - operands, metadata, VL_bits, false, + results_[0] = sveComparePredicated_vecsToPred( + sourceValues_, metadata_, VL_bits, false, [](double x, double y) -> bool { return x >= y; }); break; } case Opcode::AArch64_FCMGE_PPzZZ_S: { // fcmge pd.s, pg/z, zn.s, zm.s - results[0] = sveHelp::sveComparePredicated_vecsToPred( - operands, metadata, VL_bits, false, + results_[0] = sveComparePredicated_vecsToPred( + sourceValues_, metadata_, VL_bits, false, [](float x, float y) -> bool { return x >= y; }); break; } case Opcode::AArch64_FCMGEv2f32: { // fcmge vd.2s, vn.2s, vm.2s - results[0] = neonHelp::vecFCompare( - operands, false, [](float x, float y) -> bool { return x >= y; }); + results_[0] = vecFCompare( + sourceValues_, false, + [](float x, float y) -> bool { return x >= y; }); break; } case Opcode::AArch64_FCMGEv2f64: { // fcmge vd.2d, vn.2d, vm.2d - results[0] = neonHelp::vecFCompare( - operands, false, [](float x, double y) -> bool { return x >= y; }); + results_[0] = vecFCompare( + sourceValues_, false, + [](float x, double y) -> bool { return x >= y; }); break; } case Opcode::AArch64_FCMGEv2i64rz: { // fcmge vd.2d, vn.2d, 0.0 - results[0] = neonHelp::vecFCompare( - operands, true, [](double x, double y) -> bool { return x >= y; }); + results_[0] = vecFCompare( + sourceValues_, true, + [](double x, double y) -> bool { return x >= y; }); break; } case Opcode::AArch64_FCMGEv4f32: { // fcmge vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecFCompare( - operands, false, [](float x, float y) -> bool { return x >= y; }); + results_[0] = vecFCompare( + sourceValues_, false, + [](float x, float y) -> bool { return x >= y; }); break; } case Opcode::AArch64_FCMGEv4i32rz: { // fcmge vd.4s, vn.4s, 0.0 
- results[0] = neonHelp::vecFCompare( - operands, true, [](float x, float y) -> bool { return x >= y; }); + results_[0] = vecFCompare( + sourceValues_, true, + [](float x, float y) -> bool { return x >= y; }); break; } case Opcode::AArch64_FCMGT_PPzZ0_D: { // fcmgt pd.d, pg/z, zn.d, #0.0 - results[0] = sveHelp::sveComparePredicated_vecsToPred( - operands, metadata, VL_bits, true, + results_[0] = sveComparePredicated_vecsToPred( + sourceValues_, metadata_, VL_bits, true, [](double x, double y) -> bool { return x > y; }); break; } case Opcode::AArch64_FCMGT_PPzZ0_S: { // fcmgt pd.s, pg/z, zn.s, #0.0 - results[0] = sveHelp::sveComparePredicated_vecsToPred( - operands, metadata, VL_bits, true, + results_[0] = sveComparePredicated_vecsToPred( + sourceValues_, metadata_, VL_bits, true, [](float x, float y) -> bool { return x > y; }); break; } case Opcode::AArch64_FCMGT_PPzZZ_D: { // fcmgt pd.d, pg/z, zn.d, zm.d - results[0] = sveHelp::sveComparePredicated_vecsToPred( - operands, metadata, VL_bits, false, + results_[0] = sveComparePredicated_vecsToPred( + sourceValues_, metadata_, VL_bits, false, [](double x, double y) -> bool { return x > y; }); break; } case Opcode::AArch64_FCMGT_PPzZZ_S: { // fcmgt pd.s, pg/z, zn.s, zm. - results[0] = sveHelp::sveComparePredicated_vecsToPred( - operands, metadata, VL_bits, false, + results_[0] = sveComparePredicated_vecsToPred( + sourceValues_, metadata_, VL_bits, false, [](float x, float y) -> bool { return x > y; }); break; } case Opcode::AArch64_FCMGTv2i32rz: { // fcmgt vd.2s, vn.2s, #0.0 - results[0] = neonHelp::vecFCompare( - operands, true, [](float x, float y) -> bool { return x > y; }); + results_[0] = vecFCompare( + sourceValues_, true, + [](float x, float y) -> bool { return x > y; }); break; } case Opcode::AArch64_FCMGTv2i64rz: { // fcmgt vd.2d, vn.2d, #0.0 - results[0] = neonHelp::vecFCompare( - operands, true, [](double x, double y) -> bool { return x > y; }); + results_[0] = vecFCompare( + sourceValues_, true, + [](double x, double y) -> bool { return x > y; }); + break; + } + case Opcode::AArch64_FCMGTv2f64: { // fcmgt vd.2d, vn.2d, vm.2d + results_[0] = vecFCompare( + sourceValues_, false, + [](double x, double y) -> bool { return x > y; }); break; } case Opcode::AArch64_FCMGTv4f32: { // fcmgt vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecFCompare( - operands, false, [](float x, float y) -> bool { return x > y; }); + results_[0] = vecFCompare( + sourceValues_, false, + [](float x, float y) -> bool { return x > y; }); break; } case Opcode::AArch64_FCMGTv4i32rz: { // fcmgt vd.4s, vn.4s, #0.0 - results[0] = neonHelp::vecFCompare( - operands, true, [](float x, float y) -> bool { return x > y; }); + results_[0] = vecFCompare( + sourceValues_, true, + [](float x, float y) -> bool { return x > y; }); break; } case Opcode::AArch64_FCMLA_ZPmZZ_D: { // fcmla zda, pg/m, zn, zm, #imm - results[0] = - sveHelp::sveFcmlaPredicated(operands, metadata, VL_bits); + results_[0] = + sveFcmlaPredicated(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_FCMLE_PPzZ0_D: { // fcmle pd.d, pg/z, zn.d, #0.0 - results[0] = sveHelp::sveComparePredicated_vecsToPred( - operands, metadata, VL_bits, true, + results_[0] = sveComparePredicated_vecsToPred( + sourceValues_, metadata_, VL_bits, true, [](double x, double y) -> bool { return x <= y; }); break; } case Opcode::AArch64_FCMLE_PPzZ0_S: { // fcmle pd.s, pg/z, zn.s, #0.0 - results[0] = sveHelp::sveComparePredicated_vecsToPred( - operands, metadata, VL_bits, true, + results_[0] = 
sveComparePredicated_vecsToPred( + sourceValues_, metadata_, VL_bits, true, [](float x, float y) -> bool { return x <= y; }); break; } case Opcode::AArch64_FCMLT_PPzZ0_S: { // fcmlt pd.s, pg/z, zn.s, #0.0 - results[0] = sveHelp::sveComparePredicated_vecsToPred( - operands, metadata, VL_bits, true, + results_[0] = sveComparePredicated_vecsToPred( + sourceValues_, metadata_, VL_bits, true, [](float x, float y) -> bool { return x < y; }); break; } case Opcode::AArch64_FCMLTv2i32rz: { // fcmlt vd.2s, vn.2s, #0.0 - results[0] = neonHelp::vecFCompare( - operands, true, [](float x, float y) -> bool { return x < y; }); + results_[0] = vecFCompare( + sourceValues_, true, + [](float x, float y) -> bool { return x < y; }); break; } case Opcode::AArch64_FCMLTv2i64rz: { // fcmlt vd.2d, vn.2d, #0.0 - results[0] = neonHelp::vecFCompare( - operands, true, [](double x, double y) -> bool { return x < y; }); + results_[0] = vecFCompare( + sourceValues_, true, + [](double x, double y) -> bool { return x < y; }); break; } case Opcode::AArch64_FCMLTv4i32rz: { // fcmlt vd.4s, vn.4s, #0.0 - results[0] = neonHelp::vecFCompare( - operands, true, [](float x, float y) -> bool { return x < y; }); + results_[0] = vecFCompare( + sourceValues_, true, + [](float x, float y) -> bool { return x < y; }); break; } case Opcode::AArch64_FCMPDri: { // fcmp dn, #imm - results[0] = floatHelp::fcmp(operands, true); + results_[0] = fcmp(sourceValues_, true); break; } case Opcode::AArch64_FCMPDrr: { // fcmp dn, dm - results[0] = floatHelp::fcmp(operands, false); + results_[0] = fcmp(sourceValues_, false); break; } case Opcode::AArch64_FCMPEDri: { // fcmpe dn, #imm - results[0] = floatHelp::fcmp(operands, true); + results_[0] = fcmp(sourceValues_, true); break; } case Opcode::AArch64_FCMPEDrr: { // fcmpe dn, dm - results[0] = floatHelp::fcmp(operands, false); + results_[0] = fcmp(sourceValues_, false); break; } case Opcode::AArch64_FCMPESri: { // fcmpe sn, #imm - results[0] = floatHelp::fcmp(operands, true); + results_[0] = fcmp(sourceValues_, true); break; } case Opcode::AArch64_FCMPESrr: { // fcmpe sn, sm - results[0] = floatHelp::fcmp(operands, false); + results_[0] = fcmp(sourceValues_, false); break; } case Opcode::AArch64_FCMPSri: { // fcmp sn, #imm - results[0] = floatHelp::fcmp(operands, true); + results_[0] = fcmp(sourceValues_, true); break; } case Opcode::AArch64_FCMPSrr: { // fcmp sn, sm - results[0] = floatHelp::fcmp(operands, false); + results_[0] = fcmp(sourceValues_, false); break; } case Opcode::AArch64_FCPY_ZPmI_D: { // fcpy zd.d, pg/m, #const - results[0] = sveHelp::sveFcpy_imm(operands, metadata, VL_bits); + results_[0] = sveFcpy_imm(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_FCPY_ZPmI_S: { // fcpy zd.s, pg/m, #const - results[0] = sveHelp::sveFcpy_imm(operands, metadata, VL_bits); + results_[0] = sveFcpy_imm(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_FCSELDrrr: { // fcsel dd, dn, dm, cond - results[0] = { - conditionalHelp::cs_4ops( - operands, metadata, [](double x) -> double { return x; }), - 256}; + results_[0] = {cs_4ops(sourceValues_, metadata_, + [](double x) -> double { return x; }), + 256}; break; } case Opcode::AArch64_FCSELSrrr: { // fcsel sd, sn, sm, cond - results[0] = { - conditionalHelp::cs_4ops(operands, metadata, - [](float x) -> float { return x; }), - 256}; + results_[0] = {cs_4ops(sourceValues_, metadata_, + [](float x) -> float { return x; }), + 256}; break; } case Opcode::AArch64_FCVTASUWDr: { // fcvtas wd, dn - results[0] = 
{static_cast(round(operands[0].get())), - 8}; + results_[0] = { + static_cast(round(sourceValues_[0].get())), 8}; break; } case Opcode::AArch64_FCVTASUXDr: { // fcvtas xd, dn - results[0] = static_cast(round(operands[0].get())); + results_[0] = + static_cast(round(sourceValues_[0].get())); break; } case Opcode::AArch64_FCVTDSr: { // fcvt dd, sn // TODO: Handle NaNs, denorms, and saturation? - results[0] = neonHelp::vecFcvtl(operands, false); + results_[0] = vecFcvtl(sourceValues_, false); break; } case Opcode::AArch64_FCVTLv2i32: { // fcvtl vd.2d, vn.2s - results[0] = neonHelp::vecFcvtl(operands, false); + results_[0] = vecFcvtl(sourceValues_, false); break; } case Opcode::AArch64_FCVTLv4i32: { // fcvtl2 vd.2d, vn.4s - results[0] = neonHelp::vecFcvtl(operands, true); + results_[0] = vecFcvtl(sourceValues_, true); break; } case Opcode::AArch64_FCVTNv2i32: { // fcvtn vd.2s, vn.2d - results[0] = neonHelp::vecFcvtn(operands, false); + results_[0] = vecFcvtn(sourceValues_, false); break; } case Opcode::AArch64_FCVTNv4i32: { // fcvtn2 vd.4s, vn.2d - results[0] = neonHelp::vecFcvtn(operands, true); + results_[0] = vecFcvtn(sourceValues_, true); break; } case Opcode::AArch64_FCVTSDr: { // fcvt sd, dn // TODO: Handle NaNs, denorms, and saturation? - results[0] = neonHelp::vecFcvtl(operands, false); + results_[0] = vecFcvtl(sourceValues_, false); break; } case Opcode::AArch64_FCVTZSUWDr: { // fcvtzs wd, dn // TODO: Handle NaNs, denorms, and saturation - results[0] = { - static_cast(std::trunc(operands[0].get())), 8}; + results_[0] = { + static_cast(std::trunc(sourceValues_[0].get())), + 8}; break; } case Opcode::AArch64_FCVTZSUWSr: { // fcvtzs wd, sn // TODO: Handle NaNs, denorms, and saturation - results[0] = { - static_cast(std::trunc(operands[0].get())), 8}; + results_[0] = { + static_cast(std::trunc(sourceValues_[0].get())), 8}; break; } case Opcode::AArch64_FCVTZSUXDr: { // TODO: Handle NaNs, denorms, and saturation - results[0] = { - static_cast(std::trunc(operands[0].get())), 8}; + results_[0] = { + static_cast(std::trunc(sourceValues_[0].get())), + 8}; break; } case Opcode::AArch64_FCVTZS_ZPmZ_DtoD: { // fcvtzs zd.d, pg/m, zn.d - results[0] = - sveHelp::sveFcvtzsPredicated(operands, VL_bits); + results_[0] = + sveFcvtzsPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FCVTZS_ZPmZ_DtoS: { // fcvtzs zd.s, pg/m, zn.d - results[0] = - sveHelp::sveFcvtzsPredicated(operands, VL_bits); + results_[0] = + sveFcvtzsPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FCVTZS_ZPmZ_StoD: { // fcvtzs zd.d, pg/m, zn.s - results[0] = - sveHelp::sveFcvtzsPredicated(operands, VL_bits); + results_[0] = + sveFcvtzsPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FCVTZS_ZPmZ_StoS: { // fcvtzs zd.s, pg/m, zn.s - results[0] = - sveHelp::sveFcvtzsPredicated(operands, VL_bits); + results_[0] = + sveFcvtzsPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FCVTZSv2f64: { // fcvtzs vd.2d, vn.2d - results[0] = neonHelp::vecFcvtzs(operands); + results_[0] = vecFcvtzs(sourceValues_); break; } case Opcode::AArch64_FCVTZUUWDr: { // fcvtzu wd, dn - // TODO: Handle NaNs, denorms, and saturation - results[0] = { - static_cast(std::trunc(operands[0].get())), 8}; + results_[0] = {fcvtzu_integer(sourceValues_), 8}; break; } case Opcode::AArch64_FCVTZUUWSr: { // fcvtzu wd, sn - // TODO: Handle NaNs, denorms, and saturation - results[0] = { - static_cast(std::trunc(operands[0].get())), 8}; + results_[0] = {fcvtzu_integer(sourceValues_), 8}; break; } case 
Opcode::AArch64_FCVTZUUXDr: { // fcvtzu xd, dn - // TODO: Handle NaNs, denorms, and saturation - results[0] = - static_cast(std::trunc(operands[0].get())); + results_[0] = {fcvtzu_integer(sourceValues_), 8}; break; } case Opcode::AArch64_FCVTZUUXSr: { // fcvtzu xd, sn - // TODO: Handle NaNs, denorms, and saturation - results[0] = static_cast(std::trunc(operands[0].get())); + results_[0] = {fcvtzu_integer(sourceValues_), 8}; break; } case Opcode::AArch64_FCVTZUv1i64: { // fcvtzu dd, dn - // TODO: Handle NaNs, denorms, and saturation - results[0] = { - static_cast(std::trunc(operands[0].get())), 256}; + results_[0] = {fcvtzu_integer(sourceValues_), 256}; break; } case Opcode::AArch64_FCVT_ZPmZ_DtoS: { // fcvt zd.s, pg/m, zn.d - results[0] = - sveHelp::sveFcvtPredicated(operands, VL_bits); + results_[0] = sveFcvtPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FCVT_ZPmZ_StoD: { // fcvt zd.d, pg/m, zn.s - results[0] = - sveHelp::sveFcvtPredicated(operands, VL_bits); + results_[0] = sveFcvtPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FDIVDrr: { // fdiv dd, dn, dm - results[0] = {divideHelp::div_3ops(operands), 256}; + results_[0] = {div_3ops(sourceValues_), 256}; break; } case Opcode::AArch64_FDIVR_ZPmZ_D: { // fdivr zdn.d, pg/m, zdn.d, zm.d - results[0] = sveHelp::sveLogicOpPredicated_3vecs( - operands, VL_bits, - [](double x, double y) -> double { return (y / x); }); + results_[0] = sveFDivPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FDIVR_ZPmZ_S: { // fdivr zdn.s, pg/m, zdn.s, zm.s - results[0] = sveHelp::sveLogicOpPredicated_3vecs( - operands, VL_bits, - [](float x, float y) -> float { return (y / x); }); + results_[0] = sveFDivPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FDIVSrr: { // fdiv sd, sn, sm - results[0] = {divideHelp::div_3ops(operands), 256}; + results_[0] = {div_3ops(sourceValues_), 256}; break; } case Opcode::AArch64_FDIV_ZPmZ_D: { // fdiv zdn.d, pg/m, zdn.d, zm.d - results[0] = sveHelp::sveLogicOpPredicated_3vecs( - operands, VL_bits, - [](double x, double y) -> double { return (x / y); }); + results_[0] = sveFDivPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FDIVv2f64: { // fdiv vd.2d, vn.2d, vm.2d - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](double x, double y) -> double { return x / y; }); + results_[0] = vecFDiv(sourceValues_); break; } case Opcode::AArch64_FDUP_ZI_D: { // fdup zd.d, #imm - results[0] = sveHelp::sveDup_immOrScalar(operands, metadata, - VL_bits, true); + results_[0] = + sveDup_immOrScalar(sourceValues_, metadata_, VL_bits, true); break; } case Opcode::AArch64_FDUP_ZI_S: { // fdup zd.s, #imm - results[0] = sveHelp::sveDup_immOrScalar(operands, metadata, - VL_bits, true); + results_[0] = + sveDup_immOrScalar(sourceValues_, metadata_, VL_bits, true); break; } case Opcode::AArch64_FMADDDrrr: { // fmadd dn, dm, da - results[0] = {multiplyHelp::madd_4ops(operands), 256}; + results_[0] = {madd_4ops(sourceValues_), 256}; break; } case Opcode::AArch64_FMADDSrrr: { // fmadd sn, sm, sa - results[0] = {multiplyHelp::madd_4ops(operands), 256}; + results_[0] = {madd_4ops(sourceValues_), 256}; break; } case Opcode::AArch64_FMAD_ZPmZZ_D: { // fmad zd.d, pg/m, zn.d, zm.d - results[0] = sveHelp::sveFmadPredicated_vecs(operands, VL_bits); + results_[0] = sveFmadPredicated_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_FMAD_ZPmZZ_S: { // fmad zd.s, pg/m, zn.s, zm.s - results[0] = sveHelp::sveFmadPredicated_vecs(operands, VL_bits); + results_[0] = 
sveFmadPredicated_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_FMAXNMDrr: { // fmaxnm dd, dn, dm - results[0] = floatHelp::fmaxnm_3ops(operands); + results_[0] = fmaxnm_3ops(sourceValues_); break; } case Opcode::AArch64_FMAXNMPv2i64p: { // fmaxnmp dd, vd.2d - results[0] = neonHelp::vecMaxnmp_2ops(operands); + results_[0] = vecMaxnmp_2ops(sourceValues_); break; } case Opcode::AArch64_FMAXNMSrr: { // fmaxnm sd, sn, sm - results[0] = floatHelp::fmaxnm_3ops(operands); + results_[0] = fmaxnm_3ops(sourceValues_); break; } case Opcode::AArch64_FMAXNMv2f64: { // fmaxnm vd.2d, vn.2d, vm.2d - results[0] = neonHelp::vecLogicOp_3vecs( - operands, + results_[0] = vecLogicOp_3vecs( + sourceValues_, [](double x, double y) -> double { return std::fmax(x, y); }); break; } case Opcode::AArch64_FMINNMDrr: { // fminnm dd, dn, dm - results[0] = floatHelp::fminnm_3ops(operands); + results_[0] = fminnm_3ops(sourceValues_); break; } case Opcode::AArch64_FMINNMPv2i64p: { // fminnmp dd, vd.2d - results[0] = neonHelp::vecMinv_2ops(operands); + results_[0] = vecMinv_2ops(sourceValues_); break; } case Opcode::AArch64_FMINNMSrr: { // fminnm sd, sn, sm - results[0] = floatHelp::fminnm_3ops(operands); + results_[0] = fminnm_3ops(sourceValues_); break; } case Opcode::AArch64_FMINNMv2f64: { // fminnm vd.2d, vn.2d, vm.2d - results[0] = neonHelp::vecLogicOp_3vecs( - operands, + results_[0] = vecLogicOp_3vecs( + sourceValues_, [](double x, double y) -> double { return std::fmin(x, y); }); break; } case Opcode::AArch64_FMLA_ZPmZZ_D: { // fmla zd.d, pg/m, zn.d, zm.d - results[0] = sveHelp::sveMlaPredicated_vecs(operands, VL_bits); + results_[0] = sveMlaPredicated_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_FMLA_ZPmZZ_S: { // fmla zd.s, pg/m, zn.s, zm.s - results[0] = sveHelp::sveMlaPredicated_vecs(operands, VL_bits); + results_[0] = sveMlaPredicated_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_FMLAv2f32: { // fmla vd.2s, vn.2s, vm.2s - results[0] = neonHelp::vecFmla_3vecs(operands); + results_[0] = vecFmla_3vecs(sourceValues_); break; } case Opcode::AArch64_FMLA_ZZZI_D: { // fmla zda.d, zn.d, zm.d[index] - results[0] = - sveHelp::sveMlaIndexed_vecs(operands, metadata, VL_bits); + results_[0] = + sveMlaIndexed_vecs(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_FMLA_ZZZI_S: { // fmla zda.s, zn.s, zm.s[index] - results[0] = - sveHelp::sveMlaIndexed_vecs(operands, metadata, VL_bits); + results_[0] = + sveMlaIndexed_vecs(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_FMLAv2f64: { // fmla vd.2d, vn.2d, vm.2d - results[0] = neonHelp::vecFmla_3vecs(operands); + results_[0] = vecFmla_3vecs(sourceValues_); break; } case Opcode::AArch64_FMLAv2i32_indexed: { // fmla vd.2s, vn.2s, // vm.2s[index] - results[0] = - neonHelp::vecFmlaIndexed_3vecs(operands, metadata); + results_[0] = vecFmlaIndexed_3vecs(sourceValues_, metadata_); break; } case Opcode::AArch64_FMLAv2i64_indexed: { // fmla vd.2d, vn.2d, // vm.d[index] - results[0] = - neonHelp::vecFmlaIndexed_3vecs(operands, metadata); + results_[0] = vecFmlaIndexed_3vecs(sourceValues_, metadata_); break; } case Opcode::AArch64_FMLAv4f32: { // fmla vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecFmla_3vecs(operands); + results_[0] = vecFmla_3vecs(sourceValues_); break; } case Opcode::AArch64_FMLAv4i32_indexed: { // fmla vd.4s, vn.4s, // vm.s[index] - results[0] = - neonHelp::vecFmlaIndexed_3vecs(operands, metadata); + results_[0] = vecFmlaIndexed_3vecs(sourceValues_, metadata_); break; } case 
Opcode::AArch64_FMLS_ZPmZZ_D: { // fmls zd.d, pg/m, zn.d, zm.d - results[0] = sveHelp::sveFmlsPredicated_vecs(operands, VL_bits); + results_[0] = sveFmlsPredicated_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_FMLS_ZPmZZ_S: { // fmls zd.s, pg/m, zn.s, zm.s - results[0] = sveHelp::sveFmlsPredicated_vecs(operands, VL_bits); + results_[0] = sveFmlsPredicated_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_FMLSv2f64: { // fmls vd.2d, vn.2d, vm.2d - results[0] = neonHelp::vecFmls_3vecs(operands); + results_[0] = vecFmls_3vecs(sourceValues_); break; } case Opcode::AArch64_FMLSv2i64_indexed: { - results[0] = - neonHelp::vecFmlsIndexed_3vecs(operands, metadata); + results_[0] = vecFmlsIndexed_3vecs(sourceValues_, metadata_); break; } case Opcode::AArch64_FMLSv4f32: { // fmls vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecFmls_3vecs(operands); + results_[0] = vecFmls_3vecs(sourceValues_); break; } case Opcode::AArch64_FMLSv4i32_indexed: { // fmls vd.4s, vn.4s, // vm.s[index] - results[0] = - neonHelp::vecFmlsIndexed_3vecs(operands, metadata); + results_[0] = vecFmlsIndexed_3vecs(sourceValues_, metadata_); + break; + } + case Opcode::AArch64_FMOPA_MPPZZ_D: { // fmopa zada.d, pn/m, pm/m, zn.d, + // zm.d + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + const double* zn = sourceValues_[rowCount + 2].getAsVector(); + const double* zm = sourceValues_[rowCount + 3].getAsVector(); + + // zn is row, zm is col + for (int row = 0; row < rowCount; row++) { + double outRow[32] = {0}; + uint64_t shifted_active_row = 1ull << ((row % 8) * 8); + const double* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < rowCount; col++) { + double zadaElem = zadaRow[col]; + uint64_t shifted_active_col = 1ull << ((col % 8) * 8); + if ((pm[col / 8] & shifted_active_col) && + (pn[row / 8] & shifted_active_row)) + outRow[col] = zadaElem + (zn[row] * zm[col]); + else + outRow[col] = zadaElem; + } + results_[row] = {outRow, 256}; + } break; } case Opcode::AArch64_FMOPA_MPPZZ_S: { // fmopa zada.s, pn/m, pm/m, zn.s, @@ -1803,16 +2331,17 @@ void Instruction::execute() { if (!ZAenabled) return ZAdisabled(); const uint16_t rowCount = VL_bits / 32; - const uint64_t* pn = operands[rowCount].getAsVector(); - const uint64_t* pm = operands[rowCount + 1].getAsVector(); - const float* zn = operands[rowCount + 2].getAsVector(); - const float* zm = operands[rowCount + 3].getAsVector(); + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + const float* zn = sourceValues_[rowCount + 2].getAsVector(); + const float* zm = sourceValues_[rowCount + 3].getAsVector(); // zn is row, zm is col for (int row = 0; row < rowCount; row++) { float outRow[64] = {0}; uint64_t shifted_active_row = 1ull << ((row % 16) * 4); - const float* zadaRow = operands[row].getAsVector(); + const float* zadaRow = sourceValues_[row].getAsVector(); for (int col = 0; col < rowCount; col++) { float zadaElem = zadaRow[col]; uint64_t shifted_active_col = 1ull << ((col % 16) * 4); @@ -1822,379 +2351,436 @@ void Instruction::execute() { else outRow[col] = zadaElem; } - results[row] = {outRow, 256}; + results_[row] = {outRow, 256}; } break; } - case Opcode::AArch64_FMOVDXHighr: { // fmov xd, 
vn.d[1] - results[0] = operands[0].getAsVector()[1]; - break; - } - case Opcode::AArch64_FMOVDXr: { // fmov xd, dn - results[0] = operands[0].get(); - break; - } - case Opcode::AArch64_FMOVDi: { // fmov dn, #imm - results[0] = {metadata.operands[1].fp, 256}; - break; - } - case Opcode::AArch64_FMOVDr: { // fmov dd, dn - results[0] = {operands[0].get(), 256}; + case Opcode::AArch64_FMOPS_MPPZZ_D: { // fmops zada.d, pn/m, pm/m, zn.d, + // zm.d + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + const double* zn = sourceValues_[rowCount + 2].getAsVector(); + const double* zm = sourceValues_[rowCount + 3].getAsVector(); + + // zn is row, zm is col + for (int row = 0; row < rowCount; row++) { + double outRow[32] = {0}; + uint64_t shifted_active_row = 1ull << ((row % 8) * 8); + const double* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < rowCount; col++) { + double zadaElem = zadaRow[col]; + uint64_t shifted_active_col = 1ull << ((col % 8) * 8); + if ((pm[col / 8] & shifted_active_col) && + (pn[row / 8] & shifted_active_row)) + outRow[col] = zadaElem - (zn[row] * zm[col]); + else + outRow[col] = zadaElem; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_FMOPS_MPPZZ_S: { // fmops zada.s, pn/m, pm/m, zn.s, + // zm.s + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + const float* zn = sourceValues_[rowCount + 2].getAsVector(); + const float* zm = sourceValues_[rowCount + 3].getAsVector(); + + // zn is row, zm is col + for (int row = 0; row < rowCount; row++) { + float outRow[64] = {0}; + uint64_t shifted_active_row = 1ull << ((row % 16) * 4); + const float* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < rowCount; col++) { + float zadaElem = zadaRow[col]; + uint64_t shifted_active_col = 1ull << ((col % 16) * 4); + if ((pm[col / 16] & shifted_active_col) && + (pn[row / 16] & shifted_active_row)) + outRow[col] = zadaElem - (zn[row] * zm[col]); + else + outRow[col] = zadaElem; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_FMOVDXHighr: { // fmov xd, vn.d[1] + results_[0] = sourceValues_[0].getAsVector()[1]; + break; + } + case Opcode::AArch64_FMOVDXr: { // fmov xd, dn + results_[0] = sourceValues_[0].get(); + break; + } + case Opcode::AArch64_FMOVDi: { // fmov dn, #imm + results_[0] = {metadata_.operands[1].fp, 256}; + break; + } + case Opcode::AArch64_FMOVDr: { // fmov dd, dn + results_[0] = {sourceValues_[0].get(), 256}; break; } case Opcode::AArch64_FMOVSWr: { // fmov wd, sn - results[0] = {operands[0].get(), 8}; + results_[0] = {sourceValues_[0].get(), 8}; break; } case Opcode::AArch64_FMOVSi: { // fmov sn, #imm - results[0] = {static_cast(metadata.operands[1].fp), 256}; + results_[0] = {static_cast(metadata_.operands[1].fp), 256}; break; } case Opcode::AArch64_FMOVSr: { // fmov sd, sn - results[0] = {operands[0].get(), 256}; + results_[0] = {sourceValues_[0].get(), 256}; break; } case Opcode::AArch64_FMOVWSr: { // fmov sd, wn - results[0] = {operands[0].get(), 
256}; + results_[0] = {sourceValues_[0].get(), 256}; break; } case Opcode::AArch64_FMOVXDHighr: { // fmov vd.d[1], xn - double out[2] = {operands[0].get(), operands[1].get()}; - results[0] = {out, 256}; + double out[2] = {sourceValues_[0].get(), + sourceValues_[1].get()}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_FMOVXDr: { // fmov dd, xn - results[0] = {operands[0].get(), 256}; + results_[0] = {sourceValues_[0].get(), 256}; break; } case Opcode::AArch64_FMOVv2f32_ns: { // fmov vd.2s, #imm - results[0] = neonHelp::vecMovi_imm(metadata); + results_[0] = vecMovi_imm(metadata_); break; } case Opcode::AArch64_FMOVv2f64_ns: { // fmov vd.2d, #imm - results[0] = neonHelp::vecMovi_imm(metadata); + results_[0] = vecMovi_imm(metadata_); break; } case Opcode::AArch64_FMOVv4f32_ns: { // fmov vd.4s, #imm - results[0] = neonHelp::vecMovi_imm(metadata); + results_[0] = vecMovi_imm(metadata_); break; } case Opcode::AArch64_FMSB_ZPmZZ_D: { // fmsb zd.d, pg/m, zn.d, zm.d - results[0] = sveHelp::sveFmsbPredicated_vecs(operands, VL_bits); + results_[0] = sveFmsbPredicated_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_FMSB_ZPmZZ_S: { // fmsb zd.s, pg/m, zn.s, zm.s - results[0] = sveHelp::sveFmsbPredicated_vecs(operands, VL_bits); + results_[0] = sveFmsbPredicated_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_FMSUBDrrr: { // fmsub dn, dm, da - results[0] = {multiplyHelp::msub_4ops(operands), 256}; + results_[0] = {msub_4ops(sourceValues_), 256}; break; } case Opcode::AArch64_FMSUBSrrr: { // fmsub sn, sm, sa - results[0] = {multiplyHelp::msub_4ops(operands), 256}; + results_[0] = {msub_4ops(sourceValues_), 256}; break; } case Opcode::AArch64_FMULDrr: { // fmul dd, dn, dm - results[0] = {multiplyHelp::mul_3ops(operands), 256}; + results_[0] = {mul_3ops(sourceValues_), 256}; break; } case Opcode::AArch64_FMULSrr: { // fmul sd, sn, sm - results[0] = {multiplyHelp::mul_3ops(operands), 256}; + results_[0] = {mul_3ops(sourceValues_), 256}; break; } case Opcode::AArch64_FMUL_ZPmI_D: { // fmul zd.d, pg/m, zn.d, #imm - results[0] = sveHelp::sveMulPredicated(operands, metadata, - VL_bits, true); + results_[0] = + sveMulPredicated(sourceValues_, metadata_, VL_bits, true); break; } case Opcode::AArch64_FMUL_ZPmI_S: { // fmul zd.s, pg/m, zn.s, #imm - results[0] = - sveHelp::sveMulPredicated(operands, metadata, VL_bits, true); + results_[0] = + sveMulPredicated(sourceValues_, metadata_, VL_bits, true); break; } case Opcode::AArch64_FMUL_ZPmZ_D: { // fmul zdn.d, pg/m, zdn.d, zm.d - results[0] = sveHelp::sveMulPredicated(operands, metadata, - VL_bits, false); + results_[0] = + sveMulPredicated(sourceValues_, metadata_, VL_bits, false); break; } case Opcode::AArch64_FMUL_ZPmZ_S: { // fmul zdn.s, pg/m, zdn.s, zm.s - results[0] = sveHelp::sveMulPredicated(operands, metadata, - VL_bits, false); + results_[0] = + sveMulPredicated(sourceValues_, metadata_, VL_bits, false); break; } case Opcode::AArch64_FMUL_ZZZ_D: { // fmul zd.d, zn.d, zm.d - results[0] = sveHelp::sveFmul_3ops(operands, VL_bits); + results_[0] = sveFmul_3ops(sourceValues_, VL_bits); break; } case Opcode::AArch64_FMUL_ZZZ_S: { // fmul zd.s, zn.s, zm.s - results[0] = sveHelp::sveFmul_3ops(operands, VL_bits); + results_[0] = sveFmul_3ops(sourceValues_, VL_bits); break; } case Opcode::AArch64_FMULv1i32_indexed: { // fmul sd, sn, vm.s[index] - results[0] = - neonHelp::vecFmulIndexed_vecs(operands, metadata); + results_[0] = vecFmulIndexed_vecs(sourceValues_, metadata_); break; } case Opcode::AArch64_FMULv1i64_indexed: { // fmul 
dd, dn, vm.d[index] - results[0] = - neonHelp::vecFmulIndexed_vecs(operands, metadata); + results_[0] = vecFmulIndexed_vecs(sourceValues_, metadata_); break; } case Opcode::AArch64_FMULv2f32: { // fmul vd.2s, vn.2s, vm.2s - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](float x, float y) -> float { return x * y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, [](float x, float y) -> float { return x * y; }); break; } case Opcode::AArch64_FMULv2f64: { // fmul vd.2d, vn.2d, vm.2d - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](double x, double y) -> double { return x * y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, [](double x, double y) -> double { return x * y; }); break; } case Opcode::AArch64_FMULv2i32_indexed: { // fmul vd.2s, vn.2s, // vm.s[index] - results[0] = - neonHelp::vecFmulIndexed_vecs(operands, metadata); + results_[0] = vecFmulIndexed_vecs(sourceValues_, metadata_); break; } case Opcode::AArch64_FMULv2i64_indexed: { // fmul vd.2d, vn.2d, // vm.d[index] - results[0] = - neonHelp::vecFmulIndexed_vecs(operands, metadata); + results_[0] = vecFmulIndexed_vecs(sourceValues_, metadata_); break; } case Opcode::AArch64_FMULv4f32: { // fmul vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](float x, float y) -> float { return x * y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, [](float x, float y) -> float { return x * y; }); break; } case Opcode::AArch64_FMULv4i32_indexed: { // fmul vd.4s, vn.4s, // vm.s[index] - results[0] = - neonHelp::vecFmulIndexed_vecs(operands, metadata); + results_[0] = vecFmulIndexed_vecs(sourceValues_, metadata_); break; } case Opcode::AArch64_FNEGDr: { // fneg dd, dn - results[0] = {-operands[0].get(), 256}; + results_[0] = {-sourceValues_[0].get(), 256}; break; } case Opcode::AArch64_FNEGSr: { // fneg sd, sn - results[0] = {-operands[0].get(), 256}; + results_[0] = {-sourceValues_[0].get(), 256}; break; } case Opcode::AArch64_FNEG_ZPmZ_D: { // fneg zd.d, pg/m, zn.d - results[0] = sveHelp::sveFnegPredicated(operands, VL_bits); + results_[0] = sveFnegPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FNEG_ZPmZ_S: { // fneg zd.s, pg/m, zn.s - results[0] = sveHelp::sveFnegPredicated(operands, VL_bits); + results_[0] = sveFnegPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FNEGv2f64: { // fneg vd.2d, vn.2d - results[0] = neonHelp::vecFneg_2ops(operands); + results_[0] = vecFneg_2ops(sourceValues_); break; } case Opcode::AArch64_FNEGv4f32: { // fneg vd.4s, vn.4s - results[0] = neonHelp::vecFneg_2ops(operands); + results_[0] = vecFneg_2ops(sourceValues_); break; } case Opcode::AArch64_FNMADDDrrr: { // fnmadd dd, dn, dm, da - results[0] = floatHelp::fnmadd_4ops(operands); + results_[0] = fnmadd_4ops(sourceValues_); break; } case Opcode::AArch64_FNMADDSrrr: { // fnmadd sd, sn, sm, sa - results[0] = floatHelp::fnmadd_4ops(operands); + results_[0] = fnmadd_4ops(sourceValues_); break; } case Opcode::AArch64_FNMLS_ZPmZZ_D: { // fnmls zd.d, pg/m, zn.d, zm.d - results[0] = sveHelp::sveFnmlsPredicated(operands, VL_bits); + results_[0] = sveFnmlsPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FNMLS_ZPmZZ_S: { // fnmls zd.s, pg/m, zn.s, zm.s - results[0] = sveHelp::sveFnmlsPredicated(operands, VL_bits); + results_[0] = sveFnmlsPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FNMSB_ZPmZZ_D: { // fnmsb zdn.d, pg/m, zm.d, za.d - results[0] = sveHelp::sveFnmsbPredicated(operands, VL_bits); + results_[0] = sveFnmsbPredicated(sourceValues_, 
VL_bits); break; } case Opcode::AArch64_FNMSB_ZPmZZ_S: { // fnmsb zdn.s, pg/m, zm.s, za.s - results[0] = sveHelp::sveFnmsbPredicated(operands, VL_bits); + results_[0] = sveFnmsbPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FNMSUBDrrr: { // fnmsub dd, dn, dm, da - results[0] = floatHelp::fnmsub_4ops(operands); + results_[0] = fnmsub_4ops(sourceValues_); break; } case Opcode::AArch64_FNMSUBSrrr: { // fnmsub sd, sn, sm, sa - results[0] = floatHelp::fnmsub_4ops(operands); + results_[0] = fnmsub_4ops(sourceValues_); break; } case Opcode::AArch64_FNMULDrr: { // fnmul dd, dn, dm - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](double x, double y) -> double { return -(x * y); }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, + [](double x, double y) -> double { return -(x * y); }); break; } case Opcode::AArch64_FNMULSrr: { // fnmul sd, sn, sm - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](float x, float y) -> float { return -(x * y); }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, [](float x, float y) -> float { return -(x * y); }); break; } case Opcode::AArch64_FRINTADr: { // frinta dd, dn - results[0] = {round(operands[0].get()), 256}; + results_[0] = {round(sourceValues_[0].get()), 256}; break; } case Opcode::AArch64_FRINTN_ZPmZ_D: { // frintn zd.d, pg/m, zn.d - results[0] = - sveHelp::sveFrintnPredicated(operands, VL_bits); + results_[0] = sveFrintnPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FRINTN_ZPmZ_S: { // frintn zd.s, pg/m, zn.s - results[0] = - sveHelp::sveFrintnPredicated(operands, VL_bits); + results_[0] = sveFrintnPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_FRINTPDr: { // frintp dd, dn - results[0] = floatHelp::frintpScalar_2ops(operands); + results_[0] = frintpScalar_2ops(sourceValues_); break; } case Opcode::AArch64_FRINTPSr: { // frintp sd, sn - results[0] = floatHelp::frintpScalar_2ops(operands); + results_[0] = frintpScalar_2ops(sourceValues_); break; } case Opcode::AArch64_FRSQRTEv1i32: { // frsqrte sd, sn - results[0] = neonHelp::vecFrsqrte_2ops(operands); + results_[0] = vecFrsqrte_2ops(sourceValues_); break; } case Opcode::AArch64_FRSQRTEv1i64: { // frsqrte dd, dn - results[0] = neonHelp::vecFrsqrte_2ops(operands); + results_[0] = vecFrsqrte_2ops(sourceValues_); break; } case Opcode::AArch64_FRSQRTEv2f32: { // frsqrte vd.2s, vn.2s - results[0] = neonHelp::vecFrsqrte_2ops(operands); + results_[0] = vecFrsqrte_2ops(sourceValues_); break; } case Opcode::AArch64_FRSQRTEv2f64: { // frsqrte vd.2d, vn.2d - results[0] = neonHelp::vecFrsqrte_2ops(operands); + results_[0] = vecFrsqrte_2ops(sourceValues_); break; } case Opcode::AArch64_FRSQRTEv4f32: { // frsqrte vd.4s, vn.4s - results[0] = neonHelp::vecFrsqrte_2ops(operands); + results_[0] = vecFrsqrte_2ops(sourceValues_); break; } case Opcode::AArch64_FRSQRTS32: { // frsqrts sd, sn, sm - results[0] = neonHelp::vecFrsqrts_3ops(operands); + results_[0] = vecFrsqrts_3ops(sourceValues_); break; } case Opcode::AArch64_FRSQRTS64: { // frsqrts dd, dn, dm - results[0] = neonHelp::vecFrsqrts_3ops(operands); + results_[0] = vecFrsqrts_3ops(sourceValues_); break; } case Opcode::AArch64_FRSQRTSv2f32: { // frsqrts vd.2s, vn.2s, vn.2s - results[0] = neonHelp::vecFrsqrts_3ops(operands); + results_[0] = vecFrsqrts_3ops(sourceValues_); break; } case Opcode::AArch64_FRSQRTSv2f64: { // frsqrts vd.2d, vn.2d, vm.2d - results[0] = neonHelp::vecFrsqrts_3ops(operands); + results_[0] = vecFrsqrts_3ops(sourceValues_); break; } case Opcode::AArch64_FRSQRTSv4f32: { // 
frsqrts vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecFrsqrts_3ops(operands); + results_[0] = vecFrsqrts_3ops(sourceValues_); break; } case Opcode::AArch64_FSQRTDr: { // fsqrt dd, dn - results[0] = {::sqrt(operands[0].get()), 256}; + results_[0] = {::sqrt(sourceValues_[0].get()), 256}; break; } case Opcode::AArch64_FSQRTSr: { // fsqrt sd, sn - results[0] = {::sqrtf(operands[0].get()), 256}; + results_[0] = {::sqrtf(sourceValues_[0].get()), 256}; break; } case Opcode::AArch64_FSQRT_ZPmZ_D: { // fsqrt zd.d, pg/m, zn.d - results[0] = - sveHelp::sveFsqrtPredicated_2vecs(operands, VL_bits); + results_[0] = sveFsqrtPredicated_2vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_FSQRT_ZPmZ_S: { // fsqrt zd.s, pg/m, zn.s - results[0] = - sveHelp::sveFsqrtPredicated_2vecs(operands, VL_bits); + results_[0] = sveFsqrtPredicated_2vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_FSQRTv2f64: { // fsqrt vd.2d, vn.2d - results[0] = neonHelp::vecFsqrt_2ops(operands); + results_[0] = vecFsqrt_2ops(sourceValues_); break; } case Opcode::AArch64_FSQRTv4f32: { // fsqrt vd.4s, vn.4s - results[0] = neonHelp::vecFsqrt_2ops(operands); + results_[0] = vecFsqrt_2ops(sourceValues_); break; } case Opcode::AArch64_FSUBDrr: { // fsub dd, dn, dm - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](double x, double y) -> double { return x - y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, [](double x, double y) -> double { return x - y; }); break; } case Opcode::AArch64_FSUBR_ZPmZ_D: { // fsubr zdn.d, pg/m, zdn.d, zm.d - results[0] = - sveHelp::sveSubrPredicated_3vecs(operands, VL_bits); + results_[0] = sveSubrPredicated_3vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_FSUBR_ZPmZ_S: { // fsubr zdn.s, pg/m, zdn.s, zm.s - results[0] = sveHelp::sveSubrPredicated_3vecs(operands, VL_bits); + results_[0] = sveSubrPredicated_3vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_FSUBSrr: { // fsub ss, sn, sm - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](double x, double y) -> double { return x - y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, [](double x, double y) -> double { return x - y; }); break; } case Opcode::AArch64_FSUB_ZPmI_D: { // fsub zdn.d, pg/m, zdn.d, #imm - results[0] = - sveHelp::sveSubPredicated_imm(operands, metadata, VL_bits); + results_[0] = + sveSubPredicated_imm(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_FSUB_ZPmI_S: { // fsub zdn.s, pg/m, zdn.s, #imm - results[0] = - sveHelp::sveSubPredicated_imm(operands, metadata, VL_bits); + results_[0] = + sveSubPredicated_imm(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_FSUB_ZPmZ_D: { // fsub zdn.d, pg/m, zdn.d, zm.d - results[0] = sveHelp::sveLogicOpPredicated_3vecs( - operands, VL_bits, + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, [](double x, double y) -> double { return x - y; }); break; } case Opcode::AArch64_FSUB_ZPmZ_S: { // fsub zdn.s, pg/m, zdn.s, zm.s - results[0] = sveHelp::sveLogicOpPredicated_3vecs( - operands, VL_bits, [](float x, float y) -> float { return x - y; }); + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, + [](float x, float y) -> float { return x - y; }); break; } case Opcode::AArch64_FSUB_ZZZ_D: { // fsub zd.d, zn.d, zm.d - results[0] = sveHelp::sveSub_3vecs(operands, VL_bits); + results_[0] = sveSub_3vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_FSUB_ZZZ_S: { // fsub zd.s, zn.s, zm.s - results[0] = sveHelp::sveSub_3vecs(operands, VL_bits); + results_[0] = 
sveSub_3vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_FSUBv2f32: { - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](float x, float y) -> float { return x - y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, [](float x, float y) -> float { return x - y; }); break; } case Opcode::AArch64_FSUBv2f64: { // fsub vd.2d, vn.2d, vm.2d - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](double x, double y) -> double { return x - y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, [](double x, double y) -> double { return x - y; }); break; } case Opcode::AArch64_FSUBv4f32: { // fsub vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](float x, float y) -> float { return x - y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, [](float x, float y) -> float { return x - y; }); break; } case Opcode::AArch64_GLD1D_IMM_REAL: { // ld1d {zd.d}, pg/z, [zn.d{, // #imm}] // LOAD - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 64; uint64_t out[32] = {0}; @@ -2202,11 +2788,11 @@ void Instruction::execute() { for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (p[i / 8] & shifted_active) { - out[i] = memoryData[index].get(); + out[i] = memoryData_[index].get(); index++; } } - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_GLD1D_REAL: // ld1d {zt.d}, pg/z, [xn, zm.d] @@ -2215,7 +2801,7 @@ void Instruction::execute() { case Opcode::AArch64_GLD1D_SCALED_REAL: { // ld1d {zt.d}, pg/z, [xn, // zm.d, LSL #3] // LOAD - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 64; uint16_t index = 0; @@ -2224,18 +2810,18 @@ void Instruction::execute() { for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (p[i / 8] & shifted_active) { - out[i] = memoryData[index].get(); + out[i] = memoryData_[index].get(); index++; } } - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_GLD1SW_D_IMM_REAL: { // ld1sw {zd.d}, pg/z, [zn.d{, // #imm}] // LOAD - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 64; int64_t out[32] = {0}; @@ -2243,17 +2829,17 @@ void Instruction::execute() { for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (p[i / 8] & shifted_active) { - out[i] = static_cast(memoryData[index].get()); + out[i] = static_cast(memoryData_[index].get()); index++; } } - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_GLD1W_D_SCALED_REAL: { // ld1w {zd.d}, pg/z, // [, zm.d, lsl #2] // LOAD - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 64; uint64_t out[32] = {0}; @@ -2261,11 +2847,29 @@ void Instruction::execute() { for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (p[i / 8] & shifted_active) { - out[i] = static_cast(memoryData[index].get()); + out[i] = static_cast(memoryData_[index].get()); + index++; + } + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_GLD1W_SXTW_REAL: { // ld1w {zd.s}, pg/z, + // [, zm.s, sxtw] + // LOAD + const uint64_t* p = sourceValues_[0].getAsVector(); + + const uint16_t partition_num = 
VL_bits / 32; + uint32_t out[64] = {0}; + uint16_t index = 0; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (p[i / 16] & shifted_active) { + out[i] = memoryData_[index].get(); index++; } } - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_HINT: { // nop|yield|wfe|wfi|etc... @@ -2273,234 +2877,812 @@ void Instruction::execute() { break; } case Opcode::AArch64_INCB_XPiI: { // incb xdn{, pattern{, #imm}} - results[0] = - sveHelp::sveInc_gprImm(operands, metadata, VL_bits); + results_[0] = sveInc_gprImm(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_INCD_XPiI: { // incd xdn{, pattern{, #imm}} - results[0] = - sveHelp::sveInc_gprImm(operands, metadata, VL_bits); + results_[0] = sveInc_gprImm(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_INCD_ZPiI: { // incd zdn.d{, pattern{, #imm}} - results[0] = sveHelp::sveInc_imm(operands, metadata, VL_bits); + results_[0] = sveInc_imm(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_INCH_XPiI: { // inch xdn{, pattern{, #imm}} - results[0] = - sveHelp::sveInc_gprImm(operands, metadata, VL_bits); + results_[0] = sveInc_gprImm(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_INCH_ZPiI: { // inch zdn.h{, pattern{, #imm}} - results[0] = sveHelp::sveInc_imm(operands, metadata, VL_bits); + results_[0] = sveInc_imm(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_INCP_XP_B: { // incp xdn, pm.b - results[0] = sveHelp::sveIncp_gpr(operands, VL_bits); + results_[0] = sveIncp_gpr(sourceValues_, VL_bits); break; } case Opcode::AArch64_INCP_XP_D: { // incp xdn, pm.d - results[0] = sveHelp::sveIncp_gpr(operands, VL_bits); + results_[0] = sveIncp_gpr(sourceValues_, VL_bits); break; } case Opcode::AArch64_INCP_XP_H: { // incp xdn, pm.h - results[0] = sveHelp::sveIncp_gpr(operands, VL_bits); + results_[0] = sveIncp_gpr(sourceValues_, VL_bits); break; } case Opcode::AArch64_INCP_XP_S: { // incp xdn, pm.s - results[0] = sveHelp::sveIncp_gpr(operands, VL_bits); + results_[0] = sveIncp_gpr(sourceValues_, VL_bits); break; } case Opcode::AArch64_INCW_XPiI: { // incw xdn{, pattern{, #imm}} - results[0] = - sveHelp::sveInc_gprImm(operands, metadata, VL_bits); + results_[0] = sveInc_gprImm(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_INCW_ZPiI: { // incw zdn.s{, pattern{, #imm}} - results[0] = sveHelp::sveInc_imm(operands, metadata, VL_bits); + results_[0] = sveInc_imm(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_INDEX_II_B: { // index zd.b, #imm, #imm - results[0] = - sveHelp::sveIndex(operands, metadata, VL_bits, true, true); + results_[0] = + sveIndex(sourceValues_, metadata_, VL_bits, true, true); break; } case Opcode::AArch64_INDEX_II_D: { // index zd.d, #imm, #imm - results[0] = - sveHelp::sveIndex(operands, metadata, VL_bits, true, true); + results_[0] = + sveIndex(sourceValues_, metadata_, VL_bits, true, true); break; } case Opcode::AArch64_INDEX_II_H: { // index zd.h, #imm, #imm - results[0] = - sveHelp::sveIndex(operands, metadata, VL_bits, true, true); + results_[0] = + sveIndex(sourceValues_, metadata_, VL_bits, true, true); break; } case Opcode::AArch64_INDEX_II_S: { // index zd.s, #imm, #imm - results[0] = - sveHelp::sveIndex(operands, metadata, VL_bits, true, true); + results_[0] = + sveIndex(sourceValues_, metadata_, VL_bits, true, true); break; } case Opcode::AArch64_INDEX_IR_B: { // index zd.b, #imm, wn - results[0] = sveHelp::sveIndex(operands, 
metadata, - VL_bits, true, false); + results_[0] = sveIndex(sourceValues_, metadata_, + VL_bits, true, false); break; } case Opcode::AArch64_INDEX_IR_D: { // index zd.d, #imm, xn - results[0] = sveHelp::sveIndex(operands, metadata, - VL_bits, true, false); + results_[0] = sveIndex(sourceValues_, metadata_, + VL_bits, true, false); break; } case Opcode::AArch64_INDEX_IR_H: { // index zd.h, #imm, wn - results[0] = sveHelp::sveIndex(operands, metadata, - VL_bits, true, false); + results_[0] = sveIndex(sourceValues_, metadata_, + VL_bits, true, false); break; } case Opcode::AArch64_INDEX_IR_S: { // index zd.s, #imm, wn - results[0] = sveHelp::sveIndex(operands, metadata, - VL_bits, true, false); + results_[0] = sveIndex(sourceValues_, metadata_, + VL_bits, true, false); break; } case Opcode::AArch64_INDEX_RI_B: { // index zd.b, wn, #imm - results[0] = sveHelp::sveIndex(operands, metadata, - VL_bits, false, true); + results_[0] = sveIndex(sourceValues_, metadata_, + VL_bits, false, true); break; } case Opcode::AArch64_INDEX_RI_D: { // index zd.d, xn, #imm - results[0] = sveHelp::sveIndex(operands, metadata, - VL_bits, false, true); + results_[0] = sveIndex(sourceValues_, metadata_, + VL_bits, false, true); break; } case Opcode::AArch64_INDEX_RI_H: { // index zd.h, wn, #imm - results[0] = sveHelp::sveIndex(operands, metadata, - VL_bits, false, true); + results_[0] = sveIndex(sourceValues_, metadata_, + VL_bits, false, true); break; } case Opcode::AArch64_INDEX_RI_S: { // index zd.s, wn, #imm - results[0] = sveHelp::sveIndex(operands, metadata, - VL_bits, false, true); + results_[0] = sveIndex(sourceValues_, metadata_, + VL_bits, false, true); break; } case Opcode::AArch64_INDEX_RR_B: { // index zd.b, wn, wm - results[0] = sveHelp::sveIndex(operands, metadata, - VL_bits, false, false); + results_[0] = sveIndex(sourceValues_, metadata_, + VL_bits, false, false); break; } case Opcode::AArch64_INDEX_RR_D: { // index zd.d, xn, xm - results[0] = sveHelp::sveIndex(operands, metadata, - VL_bits, false, false); + results_[0] = sveIndex(sourceValues_, metadata_, + VL_bits, false, false); break; } case Opcode::AArch64_INDEX_RR_H: { // index zd.h, wn, wm - results[0] = sveHelp::sveIndex(operands, metadata, - VL_bits, false, false); + results_[0] = sveIndex(sourceValues_, metadata_, + VL_bits, false, false); break; } case Opcode::AArch64_INDEX_RR_S: { // index zd.s, wn, wm - results[0] = sveHelp::sveIndex(operands, metadata, - VL_bits, false, false); + results_[0] = sveIndex(sourceValues_, metadata_, + VL_bits, false, false); + break; + } + + case Opcode::AArch64_INSERT_MXIPZ_H_B: { // mova zadh.b[ws, #imm], pg/m, + // zn.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 8; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint8_t* zaRow = sourceValues_[sliceNum].getAsVector(); + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint8_t* zn = sourceValues_[rowCount + 2].getAsVector(); + + uint8_t out[256] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << (elem % 64); + if (pg[elem / 64] & shifted_active) + out[elem] = zn[elem]; + else + out[elem] = zaRow[elem]; + } + // Need to update whole za tile + for (uint16_t row = 0; row < rowCount; row++) { + results_[row] = + (row == sliceNum) ? 
            RegisterValue(out, 256) : sourceValues_[row];
+        }
+        break;
+      }
+      case Opcode::AArch64_INSERT_MXIPZ_H_D: {  // mova zadh.d[ws, #imm], pg/m,
+                                                 // zn.d
+        // SME
+        // Check core is in correct context mode (check SM first)
+        if (!SMenabled) return SMdisabled();
+        if (!ZAenabled) return ZAdisabled();
+
+        const uint16_t rowCount = VL_bits / 64;
+        const uint32_t sliceNum =
+            (sourceValues_[rowCount].get<uint32_t>() +
+             static_cast<uint32_t>(metadata_.operands[0].sme.slice_offset.imm)) %
+            rowCount;
+        const uint64_t* zaRow = sourceValues_[sliceNum].getAsVector<uint64_t>();
+        const uint64_t* pg =
+            sourceValues_[rowCount + 1].getAsVector<uint64_t>();
+        const uint64_t* zn =
+            sourceValues_[rowCount + 2].getAsVector<uint64_t>();
+
+        uint64_t out[32] = {0};
+        for (uint16_t elem = 0; elem < rowCount; elem++) {
+          uint64_t shifted_active = 1ull << ((elem % 8) * 8);
+          if (pg[elem / 8] & shifted_active)
+            out[elem] = zn[elem];
+          else
+            out[elem] = zaRow[elem];
+        }
+        // Need to update whole za tile
+        for (uint16_t row = 0; row < rowCount; row++) {
+          results_[row] =
+              (row == sliceNum) ? RegisterValue(out, 256) : sourceValues_[row];
+        }
+        break;
+      }
+      case Opcode::AArch64_INSERT_MXIPZ_H_H: {  // mova zadh.h[ws, #imm], pg/m,
+                                                 // zn.h
+        // SME
+        // Check core is in correct context mode (check SM first)
+        if (!SMenabled) return SMdisabled();
+        if (!ZAenabled) return ZAdisabled();
+
+        const uint16_t rowCount = VL_bits / 16;
+        const uint32_t sliceNum =
+            (sourceValues_[rowCount].get<uint32_t>() +
+             static_cast<uint32_t>(metadata_.operands[0].sme.slice_offset.imm)) %
+            rowCount;
+        const uint16_t* zaRow = sourceValues_[sliceNum].getAsVector<uint16_t>();
+        const uint64_t* pg =
+            sourceValues_[rowCount + 1].getAsVector<uint64_t>();
+        const uint16_t* zn =
+            sourceValues_[rowCount + 2].getAsVector<uint16_t>();
+
+        uint16_t out[128] = {0};
+        for (uint16_t elem = 0; elem < rowCount; elem++) {
+          uint64_t shifted_active = 1ull << ((elem % 32) * 2);
+          if (pg[elem / 32] & shifted_active)
+            out[elem] = zn[elem];
+          else
+            out[elem] = zaRow[elem];
+        }
+        // Need to update whole za tile
+        for (uint16_t row = 0; row < rowCount; row++) {
+          results_[row] =
+              (row == sliceNum) ? RegisterValue(out, 256) : sourceValues_[row];
+        }
+        break;
+      }
+      case Opcode::AArch64_INSERT_MXIPZ_H_Q: {  // mova zadh.q[ws], pg/m, zn.q
+        // SME
+        // Check core is in correct context mode (check SM first)
+        if (!SMenabled) return SMdisabled();
+        if (!ZAenabled) return ZAdisabled();
+
+        const uint16_t rowCount = VL_bits / 128;
+        const uint32_t sliceNum =
+            sourceValues_[rowCount].get<uint32_t>() % rowCount;
+        // Use uint64_t in place of 128-bit
+        const uint64_t* zaRow = sourceValues_[sliceNum].getAsVector<uint64_t>();
+
+        const uint64_t* pg =
+            sourceValues_[rowCount + 1].getAsVector<uint64_t>();
+        // Use uint64_t in place of 128-bit
+        const uint64_t* zn =
+            sourceValues_[rowCount + 2].getAsVector<uint64_t>();
+
+        // Use uint64_t in place of 128-bit
+        uint64_t out[32] = {0};
+        for (uint16_t elem = 0; elem < rowCount; elem++) {
+          // For 128-bit there are 16-bit for each active element
+          uint64_t shifted_active = 1ull << ((elem % 4) * 16);
+          if (pg[elem / 4] & shifted_active) {
+            // Need to move two consecutive 64-bit elements
+            out[(2 * elem)] = zn[(2 * elem)];
+            out[(2 * elem + 1)] = zn[(2 * elem + 1)];
+          } else {
+            // Need to move two consecutive 64-bit elements
+            out[(2 * elem)] = zaRow[(2 * elem)];
+            out[(2 * elem + 1)] = zaRow[(2 * elem + 1)];
+          }
+        }
+        // Need to update whole za tile
+        for (uint16_t row = 0; row < rowCount; row++) {
+          results_[row] =
+              (row == sliceNum) ?
RegisterValue(out, 256) : sourceValues_[row]; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_H_S: { // mova zadh.s[ws, #imm], pg/m, + // zn.s + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint32_t* zaRow = sourceValues_[sliceNum].getAsVector(); + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint32_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + uint32_t out[64] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 16) * 4); + if (pg[elem / 16] & shifted_active) + out[elem] = zn[elem]; + else + out[elem] = zaRow[elem]; + } + // Need to update whole za tile + for (uint16_t row = 0; row < rowCount; row++) { + results_[row] = + (row == sliceNum) ? RegisterValue(out, 256) : sourceValues_[row]; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_V_B: { // mova zadv.b[ws, #imm], pg/m, + // zn.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 8; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint8_t* zn = sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t i = 0; i < rowCount; i++) { + const uint8_t* row = sourceValues_[i].getAsVector(); + uint8_t out[256] = {0}; + memcpy(out, row, rowCount * sizeof(uint8_t)); + uint64_t shifted_active = 1ull << (i % 64); + if (pg[i / 64] & shifted_active) out[sliceNum] = zn[i]; + results_[i] = {out, 256}; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_V_D: { // mova zadv.d[ws, #imm], pg/m, + // zn.d + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint64_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t i = 0; i < rowCount; i++) { + const uint64_t* row = sourceValues_[i].getAsVector(); + uint64_t out[32] = {0}; + memcpy(out, row, rowCount * sizeof(uint64_t)); + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (pg[i / 8] & shifted_active) out[sliceNum] = zn[i]; + results_[i] = {out, 256}; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_V_H: { // mova zadv.h[ws, #imm], pg/m, + // zn.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 16; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint16_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t i = 0; i < rowCount; i++) { + const uint16_t* row = sourceValues_[i].getAsVector(); + uint16_t out[128] = {0}; + memcpy(out, row, 
rowCount * sizeof(uint16_t)); + uint64_t shifted_active = 1ull << ((i % 32) * 2); + if (pg[i / 32] & shifted_active) out[sliceNum] = zn[i]; + results_[i] = {out, 256}; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_V_Q: { // mova zadv.q[ws], pg/m, zn.q + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 128; + const uint32_t sliceNum = + sourceValues_[rowCount].get() % rowCount; + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + // Use uint64_t in place of 128-bit + const uint64_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t i = 0; i < rowCount; i++) { + // Use uint64_t in place of 128-bit + const uint64_t* row = sourceValues_[i].getAsVector(); + uint64_t out[32] = {0}; + // *2 in memcpy as need 128-bit elements but using uint64_t + memcpy(out, row, rowCount * sizeof(uint64_t) * 2); + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((i % 4) * 16); + if (pg[i / 4] & shifted_active) { + // Need to move two consecutive 64-bit elements + out[2 * sliceNum] = zn[2 * i]; + out[2 * sliceNum + 1] = zn[2 * i + 1]; + } + results_[i] = {out, 256}; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_V_S: { // mova zadv.s[ws, #imm], pg/m, + // zn.s + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint32_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t i = 0; i < rowCount; i++) { + const uint32_t* row = sourceValues_[i].getAsVector(); + uint32_t out[64] = {0}; + memcpy(out, row, rowCount * sizeof(uint32_t)); + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (pg[i / 16] & shifted_active) out[sliceNum] = zn[i]; + results_[i] = {out, 256}; + } break; } case Opcode::AArch64_INSvi16gpr: { // ins vd.h[index], wn - results[0] = neonHelp::vecInsIndex_gpr(operands, - metadata); + results_[0] = + vecInsIndex_gpr(sourceValues_, metadata_); break; } case Opcode::AArch64_INSvi32gpr: { // ins vd.s[index], wn - results[0] = neonHelp::vecInsIndex_gpr(operands, - metadata); + results_[0] = + vecInsIndex_gpr(sourceValues_, metadata_); break; } case Opcode::AArch64_INSvi32lane: { // ins vd.s[index1], vn.s[index2] - results[0] = neonHelp::vecIns_2Index(operands, metadata); + results_[0] = vecIns_2Index(sourceValues_, metadata_); break; } case Opcode::AArch64_INSvi64gpr: { // ins vd.d[index], xn - results[0] = neonHelp::vecInsIndex_gpr(operands, - metadata); + results_[0] = + vecInsIndex_gpr(sourceValues_, metadata_); break; } case Opcode::AArch64_INSvi64lane: { // ins vd.d[index1], vn.d[index2] - results[0] = neonHelp::vecIns_2Index(operands, metadata); + results_[0] = vecIns_2Index(sourceValues_, metadata_); break; } case Opcode::AArch64_INSvi8gpr: { // ins vd.b[index], wn - results[0] = neonHelp::vecInsIndex_gpr(operands, - metadata); + results_[0] = + vecInsIndex_gpr(sourceValues_, metadata_); + break; + } + case Opcode::AArch64_LD1_MXIPXX_H_B: { // ld1b {zath.b[ws, #imm]}, pg/z, + // [{, xm}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t 
partition_num = VL_bits / 8; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint16_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint8_t* data = memoryData_[0].getAsVector(); + + uint8_t out[256] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << (i % 64); + if (pg[i / 64] & shifted_active) { + out[i] = data[i]; + } else { + out[i] = 0; + } + } + + // All Slice vectors are added to results[] so need to update the + // correct one + for (uint16_t i = 0; i < partition_num; i++) { + results_[i] = sourceValues_[i]; + } + results_[sliceNum] = {out, 256}; + break; + } + case Opcode::AArch64_LD1_MXIPXX_H_D: { // ld1d {zath.d[ws, #imm]}, pg/z, + // [{, xm, lsl #3}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 64; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint16_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint64_t* data = memoryData_[0].getAsVector(); + + uint64_t out[32] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (pg[i / 8] & shifted_active) { + out[i] = data[i]; + } else { + out[i] = 0; + } + } + + // All Slice vectors are added to results[] so need to update the + // correct one + for (uint16_t i = 0; i < partition_num; i++) { + results_[i] = sourceValues_[i]; + } + results_[sliceNum] = {out, 256}; + break; + } + case Opcode::AArch64_LD1_MXIPXX_H_H: { // ld1h {zath.h[ws, #imm]}, pg/z, + // [{, xm, LSL #1}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 16; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint16_t* data = memoryData_[0].getAsVector(); + + uint16_t out[128] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 32) * 2); + if (pg[i / 32] & shifted_active) { + out[i] = data[i]; + } else { + out[i] = 0; + } + } + + // All Slice vectors are added to results[] so need to update the + // correct one + for (uint16_t i = 0; i < partition_num; i++) { + results_[i] = sourceValues_[i]; + } + results_[sliceNum] = {out, 256}; + break; + } + case Opcode::AArch64_LD1_MXIPXX_H_Q: { // ld1q {zath.q[ws]}, pg/z, + // [{, xm, LSL #4}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 128; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = ws % partition_num; + // Use uint64_t as no 128-bit type + const uint64_t* data = memoryData_[0].getAsVector(); + + // Use uint64_t as no 128-bit type + uint64_t out[32] = {0}; + for (int i = 0; i < partition_num; i++) { + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((i % 4) * 16); + if (pg[i / 4] & shifted_active) { + // As using uint64_t need to modify 2 elements + out[2 * i] = data[2 * i]; + out[2 * i + 1] 
= data[2 * i + 1]; + } else { + out[2 * i] = 0; + out[2 * i + 1] = 0; + } + } + + // All Slice vectors are added to results[] so need to update the + // correct one + for (uint16_t i = 0; i < partition_num; i++) { + results_[i] = sourceValues_[i]; + } + results_[sliceNum] = {out, 256}; + break; + } + case Opcode::AArch64_LD1_MXIPXX_H_S: { // ld1w {zath.s[ws, #imm]}, pg/z, + // [{, xm, LSL #2}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 32; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint32_t* data = memoryData_[0].getAsVector(); + + uint32_t out[64] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (pg[i / 16] & shifted_active) { + out[i] = data[i]; + } else { + out[i] = 0; + } + } + + // All Slice vectors are added to results[] so need to update the + // correct one + for (uint16_t i = 0; i < partition_num; i++) { + results_[i] = sourceValues_[i]; + } + results_[sliceNum] = {out, 256}; + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_B: { // ld1b {zatv.b[ws, #imm]}, pg/z, + // [{, xm}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 8; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint8_t* data = memoryData_[0].getAsVector(); + + for (int i = 0; i < partition_num; i++) { + const uint8_t* row = sourceValues_[i].getAsVector(); + uint8_t out[256] = {0}; + memcpy(out, row, partition_num * sizeof(uint8_t)); + uint64_t shifted_active = 1ull << (i % 64); + if (pg[i / 64] & shifted_active) { + out[sliceNum] = data[i]; + } + results_[i] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_D: { // ld1d {zatv.d[ws, #imm]}, pg/z, + // [{, xm, lsl #3}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 64; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint64_t* data = memoryData_[0].getAsVector(); + + for (int i = 0; i < partition_num; i++) { + const uint64_t* row = sourceValues_[i].getAsVector(); + uint64_t out[32] = {0}; + memcpy(out, row, partition_num * sizeof(uint64_t)); + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (pg[i / 8] & shifted_active) { + out[sliceNum] = data[i]; + } + results_[i] = RegisterValue(out, 256); + } break; } - case Opcode::AArch64_LD1_MXIPXX_H_S: { // ld1w {zath.s[ws, #imm]}, pg/z, - // [{, xm, LSL #2}] + case Opcode::AArch64_LD1_MXIPXX_V_H: { // ld1h {zatv.h[ws, #imm]}, pg/z, + // [{, xm, lsl #1}] // SME, LOAD - if (!ZAenabled) { - // Not in right context mode. 
Raise exception - return ZAdisabled(); - } - const uint16_t partition_num = VL_bits / 32; - const uint32_t ws = operands[partition_num].get(); + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 16; + const uint32_t ws = sourceValues_[partition_num].get(); const uint64_t* pg = - operands[partition_num + 1].getAsVector(); + sourceValues_[partition_num + 1].getAsVector(); const uint32_t sliceNum = - (ws + metadata.operands[0].sme_index.disp) % partition_num; - const uint32_t* data = memoryData[0].getAsVector(); + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint16_t* data = memoryData_[0].getAsVector(); - uint32_t out[64] = {0}; for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % 16) * 4); - if (pg[i / 16] & shifted_active) { - out[i] = data[i]; - } else { - out[i] = 0; + const uint16_t* row = sourceValues_[i].getAsVector(); + uint16_t out[128] = {0}; + memcpy(out, row, partition_num * sizeof(uint16_t)); + uint64_t shifted_active = 1ull << ((i % 32) * 2); + if (pg[i / 32] & shifted_active) { + out[sliceNum] = data[i]; } + results_[i] = RegisterValue(out, 256); } + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_Q: { // ld1q {zatv.q[ws]}, pg/z, + // [{, xm, lsl #4}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 128; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = ws % partition_num; + // Using uint64_t as no 128-bit data type + const uint64_t* data = memoryData_[0].getAsVector(); - // All Slice vectors are added to results[] so need to update the - // correct one for (int i = 0; i < partition_num; i++) { - if (i == sliceNum) - results[i] = {out, 256}; - else - // Maintain un-updated rows. - results[i] = operands[i]; + // Using uint64_t as no 128-bit data type + const uint64_t* row = sourceValues_[i].getAsVector(); + uint64_t out[32] = {0}; + // *2 in memcpy as need 128-bit but using uint64_t + memcpy(out, row, partition_num * sizeof(uint64_t) * 2); + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((i % 4) * 16); + if (pg[i / 4] & shifted_active) { + // As using uint64_t need to modify 2 elements + out[2 * sliceNum] = data[2 * i]; + out[2 * sliceNum + 1] = data[2 * i + 1]; + } + results_[i] = RegisterValue(out, 256); } break; } case Opcode::AArch64_LD1_MXIPXX_V_S: { // ld1w {zatv.s[ws, #imm]}, pg/z, // [{, xm, LSL #2}] // SME, LOAD - if (!ZAenabled) { - // Not in right context mode. 
Raise exception - return ZAdisabled(); - } + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + const uint16_t partition_num = VL_bits / 32; - const uint32_t ws = operands[partition_num].get(); + const uint32_t ws = sourceValues_[partition_num].get(); const uint64_t* pg = - operands[partition_num + 1].getAsVector(); + sourceValues_[partition_num + 1].getAsVector(); const uint32_t sliceNum = - (ws + metadata.operands[0].sme_index.disp) % partition_num; - const uint32_t* data = memoryData[0].getAsVector(); + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint32_t* data = memoryData_[0].getAsVector(); for (int i = 0; i < partition_num; i++) { - uint32_t* row = - const_cast(operands[i].getAsVector()); + const uint32_t* row = sourceValues_[i].getAsVector(); + uint32_t out[64] = {0}; + memcpy(out, row, partition_num * sizeof(uint32_t)); uint64_t shifted_active = 1ull << ((i % 16) * 4); if (pg[i / 16] & shifted_active) { - row[sliceNum] = data[i]; - } else { - row[sliceNum] = 0; + out[sliceNum] = data[i]; } - results[i] = RegisterValue(reinterpret_cast(row), 256); + results_[i] = RegisterValue(out, 256); } break; } case Opcode::AArch64_LD1B: { // ld1b {zt.b}, pg/z, [xn, xm] // LOAD - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 8; - const uint8_t* data = memoryData[0].getAsVector(); + const uint8_t* data = memoryData_[0].getAsVector(); + + uint8_t out[256] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << (i % 64); + if (p[i / 64] & shifted_active) { + out[i] = data[i]; + } else { + out[i] = 0; + } + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_LD1B_IMM: { // ld1b {zt.b}, pg/z, [xn{, #imm, + // mul vl}] + // LOAD + const uint64_t* p = sourceValues_[0].getAsVector(); + const uint16_t partition_num = VL_bits / 8; + const uint8_t* data = memoryData_[0].getAsVector(); uint8_t out[256] = {0}; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << (i % 64); @@ -2510,15 +3692,15 @@ void Instruction::execute() { out[i] = 0; } } - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_LD1D: { // ld1d {zt.d}, pg/z, [xn, xm, lsl #3] // LOAD - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 64; - const uint64_t* data = memoryData[0].getAsVector(); + const uint64_t* data = memoryData_[0].getAsVector(); uint64_t out[32] = {0}; for (int i = 0; i < partition_num; i++) { @@ -2529,16 +3711,16 @@ void Instruction::execute() { out[i] = 0; } } - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } - case Opcode::AArch64_LD1D_IMM_REAL: { // ld1d {zt.d}, pg/z, [xn{, #imm, - // mul vl}] + case Opcode::AArch64_LD1D_IMM: { // ld1d {zt.d}, pg/z, [xn{, #imm, + // mul vl}] // LOAD - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 64; - const uint64_t* data = memoryData[0].getAsVector(); + const uint64_t* data = memoryData_[0].getAsVector(); uint64_t out[32] = {0}; for (int i = 0; i < partition_num; i++) { @@ -2549,15 +3731,15 @@ void Instruction::execute() { out[i] = 0; } } - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_LD1H: { // ld1h {zt.h}, pg/z, [xn, xm, lsl #1] // LOAD - const uint64_t* p = operands[0].getAsVector(); + 
const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 16; - const uint16_t* data = memoryData[0].getAsVector(); + const uint16_t* data = memoryData_[0].getAsVector(); uint16_t out[128] = {0}; for (int i = 0; i < partition_num; i++) { @@ -2568,16 +3750,21 @@ void Instruction::execute() { out[i] = 0; } } - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_LD1Onev16b: { // ld1 {vt.16b} [xn] - results[0] = memoryData[0].zeroExtend(memoryData[0].size(), 256); + results_[0] = memoryData_[0].zeroExtend(memoryData_[0].size(), 256); break; } - case Opcode::AArch64_LD1Onev16b_POST: { // ld1 {vt.16b}, [xn], #imm - results[0] = memoryData[0].zeroExtend(memoryData[0].size(), 256); - results[1] = operands[0].get() + metadata.operands[2].imm; + case Opcode::AArch64_LD1Onev16b_POST: { // ld1 {vt.16b}, [xn], <#imm|xm> + // if #imm post-index, value can only be 16 + const uint64_t postIndex = + (metadata_.operands[2].type == AARCH64_OP_REG) + ? sourceValues_[1].get() + : 16; + results_[0] = sourceValues_[0].get() + postIndex; + results_[1] = memoryData_[0].zeroExtend(memoryData_[0].size(), 256); break; } case Opcode::AArch64_LD1RD_IMM: { // ld1rd {zt.d}, pg/z, [xn, #imm] @@ -2587,7 +3774,7 @@ void Instruction::execute() { uint16_t index = 0; // Check if any lanes are active, otherwise set all to 0 and break early bool active = false; - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); for (int i = 0; i < 4; i++) { if (p[i] != 0) { active = true; @@ -2596,7 +3783,7 @@ void Instruction::execute() { } if (active) { - uint64_t data = memoryData[0].get(); + uint64_t data = memoryData_[0].get(); for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = p[index / 8] & 1ull << ((index % 8) * 8); out[i] = shifted_active ? 
data : 0; @@ -2604,15 +3791,15 @@ void Instruction::execute() { } } - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_LD1RQ_D_IMM: { // ld1rqd {zd.d}, pg/z, [xn{, #imm}] // LOAD - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 64; uint64_t out[32] = {0}; - const uint64_t* data = memoryData[0].getAsVector(); + const uint64_t* data = memoryData_[0].getAsVector(); // Get mini-vector (quadword) uint64_t mini[2] = {0}; @@ -2628,15 +3815,41 @@ void Instruction::execute() { out[2 * i] = mini[0]; out[(2 * i) + 1] = mini[1]; } - results[0] = {out, 256}; + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_LD1RQ_W: { // ld1rqw {zd.s}, pg/z, [xn, xm, lsl #2] + // LOAD + const uint64_t* p = sourceValues_[0].getAsVector(); + const uint16_t partition_num = VL_bits / 32; + uint32_t out[64] = {0}; + const uint32_t* data = memoryData_[0].getAsVector(); + + // Get mini-vector (quadword) + uint32_t mini[4] = {0}; + for (int i = 0; i < 4; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (p[i / 16] & shifted_active) { + mini[i] = data[i]; + } + } + + // Duplicate mini-vector into output vector + for (int i = 0; i < (partition_num / 4); i++) { + out[4 * i] = mini[0]; + out[(4 * i) + 1] = mini[1]; + out[(4 * i) + 2] = mini[2]; + out[(4 * i) + 3] = mini[3]; + } + results_[0] = {out, 256}; break; } case Opcode::AArch64_LD1RQ_W_IMM: { // ld1rqw {zd.s}, pg/z, [xn{, #imm}] // LOAD - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 32; uint32_t out[64] = {0}; - const uint32_t* data = memoryData[0].getAsVector(); + const uint32_t* data = memoryData_[0].getAsVector(); // Get mini-vector (quadword) uint32_t mini[4] = {0}; @@ -2654,7 +3867,7 @@ void Instruction::execute() { out[(4 * i) + 2] = mini[2]; out[(4 * i) + 3] = mini[3]; } - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_LD1RW_IMM: { // ld1rw {zt.s}, pg/z, [xn, #imm] @@ -2664,7 +3877,7 @@ void Instruction::execute() { // Check if any lanes are active, otherwise set all to 0 and break early bool active = false; - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); for (int i = 0; i < 4; i++) { if (p[i] != 0) { active = true; @@ -2672,159 +3885,218 @@ void Instruction::execute() { } } if (active) { - uint32_t data = memoryData[0].get(); + uint32_t data = memoryData_[0].get(); for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = p[i / 16] & 1ull << ((i % 16) * 4); out[i] = shifted_active ? 
data : 0; } } - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_LD1Rv16b: { // ld1r {vt.16b}, [xn] // LOAD - uint8_t val = memoryData[0].get(); + uint8_t val = memoryData_[0].get(); uint8_t out[16] = {val, val, val, val, val, val, val, val, val, val, val, val, val, val, val, val}; - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_LD1Rv16b_POST: { // ld1r {vt.16b}, [xn], #imm // LOAD - uint8_t val = memoryData[0].get(); + uint8_t val = memoryData_[0].get(); uint8_t out[16] = {val, val, val, val, val, val, val, val, val, val, val, val, val, val, val, val}; - results[0] = {out, 256}; - results[1] = operands[0].get() + metadata.operands[2].imm; + results_[0] = + sourceValues_[0].get() + metadata_.operands[1].mem.disp; + results_[1] = {out, 256}; break; } case Opcode::AArch64_LD1Rv1d: { // ld1r {vt.1d}, [xn] // LOAD - uint64_t val = memoryData[0].get(); + uint64_t val = memoryData_[0].get(); uint64_t out[2] = {val, 0}; - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_LD1Rv1d_POST: { // ld1r {vt.1d}, [xn], #imm // LOAD - uint64_t val = memoryData[0].get(); + uint64_t val = memoryData_[0].get(); uint64_t out[2] = {val, 0}; - results[0] = {out, 256}; - results[1] = operands[0].get() + metadata.operands[2].imm; + results_[0] = + sourceValues_[0].get() + metadata_.operands[1].mem.disp; + results_[1] = {out, 256}; break; } case Opcode::AArch64_LD1Rv2d: { // ld1r {vt.2d}, [xn] // LOAD - uint64_t val = memoryData[0].get(); + uint64_t val = memoryData_[0].get(); uint64_t out[2] = {val, val}; - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_LD1Rv2d_POST: { // ld1r {vt.2d}, [xn], #imm // LOAD - uint64_t val = memoryData[0].get(); + uint64_t val = memoryData_[0].get(); uint64_t out[2] = {val, val}; - results[0] = {out, 256}; - results[1] = operands[0].get() + metadata.operands[2].imm; + results_[0] = + sourceValues_[0].get() + metadata_.operands[1].mem.disp; + results_[1] = {out, 256}; break; } case Opcode::AArch64_LD1Rv2s: { // ld1r {vt.2s}, [xn] // LOAD - uint32_t val = memoryData[0].get(); + uint32_t val = memoryData_[0].get(); uint32_t out[4] = {val, val, 0, 0}; - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_LD1Rv2s_POST: { // ld1r {vt.2s}, [xn], #imm // LOAD - uint32_t val = memoryData[0].get(); + uint32_t val = memoryData_[0].get(); uint32_t out[4] = {val, val, 0, 0}; - results[0] = {out, 256}; - results[1] = operands[0].get() + metadata.operands[2].imm; + results_[0] = + sourceValues_[0].get() + metadata_.operands[1].mem.disp; + results_[1] = {out, 256}; break; } case Opcode::AArch64_LD1Rv4h: { // ld1r {vt.4h}, [xn] // LOAD - uint16_t val = memoryData[0].get(); + uint16_t val = memoryData_[0].get(); uint16_t out[8] = {val, val, val, val, 0, 0, 0, 0}; - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_LD1Rv4h_POST: { // ld1r {vt.4h}, [xn], #imm // LOAD - uint16_t val = memoryData[0].get(); + uint16_t val = memoryData_[0].get(); uint16_t out[8] = {val, val, val, val, 0, 0, 0, 0}; - results[0] = {out, 256}; - results[1] = operands[0].get() + metadata.operands[2].imm; + results_[0] = + sourceValues_[0].get() + metadata_.operands[1].mem.disp; + results_[1] = {out, 256}; break; } case Opcode::AArch64_LD1Rv4s: { // ld1r {vt.4s}, [xn] // LOAD - uint32_t val = memoryData[0].get(); + uint32_t val = memoryData_[0].get(); uint32_t out[4] = {val, val, val, val}; - results[0] = {out, 256}; + results_[0] 
= {out, 256}; break; } case Opcode::AArch64_LD1Rv4s_POST: { // ld1r {vt.4s}, [xn], #imm // LOAD - uint32_t val = memoryData[0].get(); + uint32_t val = memoryData_[0].get(); uint32_t out[4] = {val, val, val, val}; - results[0] = {out, 256}; - results[1] = operands[0].get() + metadata.operands[2].imm; + results_[0] = + sourceValues_[0].get() + metadata_.operands[1].mem.disp; + results_[1] = {out, 256}; break; } case Opcode::AArch64_LD1Rv8b: { // ld1r {vt.8b}, [xn] // LOAD - uint8_t val = memoryData[0].get(); + uint8_t val = memoryData_[0].get(); uint8_t out[16] = {val, val, val, val, val, val, val, val, 0, 0, 0, 0, 0, 0, 0, 0}; - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_LD1Rv8b_POST: { // ld1r {vt.8b}, [xn], #imm // LOAD - uint8_t val = memoryData[0].get(); + uint8_t val = memoryData_[0].get(); uint8_t out[16] = {val, val, val, val, val, val, val, val, 0, 0, 0, 0, 0, 0, 0, 0}; - results[0] = {out, 256}; - results[1] = operands[0].get() + metadata.operands[2].imm; + results_[0] = + sourceValues_[0].get() + metadata_.operands[1].mem.disp; + results_[1] = {out, 256}; break; } case Opcode::AArch64_LD1Rv8h: { // ld1r {vt.8h}, [xn] // LOAD - uint16_t val = memoryData[0].get(); + uint16_t val = memoryData_[0].get(); uint16_t out[8] = {val, val, val, val, val, val, val, val}; - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_LD1Rv8h_POST: { // ld1r {vt.8h}, [xn], #imm // LOAD - uint16_t val = memoryData[0].get(); + uint16_t val = memoryData_[0].get(); uint16_t out[8] = {val, val, val, val, val, val, val, val}; - results[0] = {out, 256}; - results[1] = operands[0].get() + metadata.operands[2].imm; + results_[0] = + sourceValues_[0].get() + metadata_.operands[1].mem.disp; + results_[1] = {out, 256}; + break; + } + case Opcode::AArch64_LD1Fourv16b: // ld1 {vt1.16b, vt2.16b, vt3.16b, + // vt4.16b}, [xn] + [[fallthrough]]; + case Opcode::AArch64_LD1Fourv2d: // ld1 {vt1.2d, vt2.2d, vt3.2d, vt4.2d}, + // [xn] + [[fallthrough]]; + case Opcode::AArch64_LD1Fourv4s: { // ld1 {vt1.4s, vt2.4s, vt3.4s, + // vt4.4s}, [xn] + // LOAD + results_[0] = memoryData_[0].zeroExtend(memoryData_[0].size(), 256); + results_[1] = memoryData_[1].zeroExtend(memoryData_[1].size(), 256); + results_[2] = memoryData_[2].zeroExtend(memoryData_[2].size(), 256); + results_[3] = memoryData_[3].zeroExtend(memoryData_[3].size(), 256); + break; + } + case Opcode::AArch64_LD1Fourv16b_POST: // ld1 {vt1.16b, vt2.16b, vt3.16b, + // vt4.16b}, [xn], <#imm|xm> + [[fallthrough]]; + case Opcode::AArch64_LD1Fourv2d_POST: // ld1 {vt1.2d, vt2.2d, vt3.2d, + // vt4.2d}, [xn], <#imm|xm> + [[fallthrough]]; + case Opcode::AArch64_LD1Fourv4s_POST: { // ld1 {vt1.4s, vt2.4s, vt3.4s, + // vt4.4s}, [xn], <#imm|xm> + // LOAD + // if #imm post-index, value can only be 64 + const uint64_t postIndex = + (metadata_.operands[5].type == AARCH64_OP_REG) + ? 
sourceValues_[1].get() + : 64; + results_[0] = sourceValues_[0].get() + postIndex; + results_[1] = memoryData_[0].zeroExtend(memoryData_[0].size(), 256); + results_[2] = memoryData_[1].zeroExtend(memoryData_[1].size(), 256); + results_[3] = memoryData_[2].zeroExtend(memoryData_[2].size(), 256); + results_[4] = memoryData_[3].zeroExtend(memoryData_[3].size(), 256); break; } - case Opcode::AArch64_LD1Twov16b: { // ld1 {vt1.16b, vt2.16b}, [xn] + case Opcode::AArch64_LD1Twov16b: // ld1 {vt1.16b, vt2.16b}, [xn] + [[fallthrough]]; + case Opcode::AArch64_LD1Twov2d: // ld1 {vt1.2d, vt2.2d}, [xn] + [[fallthrough]]; + case Opcode::AArch64_LD1Twov4s: { // ld1 {vt1.4s, vt2.4s}, [xn] // LOAD - results[0] = memoryData[0].zeroExtend(memoryData[0].size(), 256); - results[1] = memoryData[1].zeroExtend(memoryData[1].size(), 256); + results_[0] = memoryData_[0].zeroExtend(memoryData_[0].size(), 256); + results_[1] = memoryData_[1].zeroExtend(memoryData_[1].size(), 256); break; } - case Opcode::AArch64_LD1Twov16b_POST: { // ld1 {vt1.16b, vt2.16b}, [xn], - // #imm + case Opcode::AArch64_LD1Twov16b_POST: // ld1 {vt1.16b, vt2.16b}, [xn], + // <#imm|xm> + [[fallthrough]]; + case Opcode::AArch64_LD1Twov2d_POST: // ld1 {vt1.2d, vt2.2d}, [xn], + // <#imm|xm> + [[fallthrough]]; + case Opcode::AArch64_LD1Twov4s_POST: { // ld1 {vt1.4s, vt2.4s}, [xn], + // <#imm|xm> // LOAD - results[0] = memoryData[0].zeroExtend(memoryData[0].size(), 256); - results[1] = memoryData[1].zeroExtend(memoryData[1].size(), 256); - results[2] = operands[0].get() + metadata.operands[3].imm; + // if #imm post-index, value can only be 32 + const uint64_t postIndex = + (metadata_.operands[3].type == AARCH64_OP_REG) + ? sourceValues_[1].get() + : 32; + results_[0] = sourceValues_[0].get() + postIndex; + results_[1] = memoryData_[0].zeroExtend(memoryData_[0].size(), 256); + results_[2] = memoryData_[1].zeroExtend(memoryData_[1].size(), 256); break; } case Opcode::AArch64_LD1W: { // ld1w {zt.s}, pg/z, [xn, xm, lsl #2] // LOAD - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 32; - const uint32_t* data = memoryData[0].getAsVector(); + const uint32_t* data = memoryData_[0].getAsVector(); uint32_t out[64] = {0}; for (int i = 0; i < partition_num; i++) { @@ -2835,16 +4107,16 @@ void Instruction::execute() { out[i] = 0; } } - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } - case Opcode::AArch64_LD1W_IMM_REAL: { // ld1w {zt.s}, pg/z, [xn{, #imm, - // mul vl}] + case Opcode::AArch64_LD1W_IMM: { // ld1w {zt.s}, pg/z, [xn{, #imm, + // mul vl}] // LOAD - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 32; - const uint32_t* data = memoryData[0].getAsVector(); + const uint32_t* data = memoryData_[0].getAsVector(); uint32_t out[64] = {0}; for (int i = 0; i < partition_num; i++) { @@ -2855,41 +4127,46 @@ void Instruction::execute() { out[i] = 0; } } - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_LD1i32: { // ld1 {vt.s}[index], [xn] // LOAD - const int index = metadata.operands[0].vector_index; - const uint32_t* vt = operands[0].getAsVector(); + const int index = metadata_.operands[0].vector_index; + const uint32_t* vt = sourceValues_[0].getAsVector(); uint32_t out[4]; for (int i = 0; i < 4; i++) { - out[i] = (i == index) ? memoryData[0].get() : vt[i]; + out[i] = (i == index) ? 
memoryData_[0].get() : vt[i]; } - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_LD1i64: { // ld1 {vt.d}[index], [xn] // LOAD - const int index = metadata.operands[0].vector_index; - const uint64_t* vt = operands[0].getAsVector(); + const int index = metadata_.operands[0].vector_index; + const uint64_t* vt = sourceValues_[0].getAsVector(); uint64_t out[2]; for (int i = 0; i < 2; i++) { - out[i] = (i == index) ? memoryData[0].get() : vt[i]; + out[i] = (i == index) ? memoryData_[0].get() : vt[i]; } - results[0] = {out, 256}; + results_[0] = {out, 256}; break; } case Opcode::AArch64_LD1i64_POST: { // ld1 {vt.d}[index], [xn], #8 // LOAD - const int index = metadata.operands[0].vector_index; - const uint64_t* vt = operands[0].getAsVector(); + const int index = metadata_.operands[0].vector_index; + const uint64_t* vt = sourceValues_[0].getAsVector(); uint64_t out[2]; for (int i = 0; i < 2; i++) { - out[i] = (i == index) ? memoryData[0].get() : vt[i]; + out[i] = (i == index) ? memoryData_[0].get() : vt[i]; } - results[0] = {out, 256}; - results[1] = operands[1].get() + metadata.operands[2].imm; + // If post index is #imm, it can only be 8 + const uint64_t postIndex = + (metadata_.operands[2].type == AARCH64_OP_REG) + ? sourceValues_[1].get() + : 8; + results_[0] = sourceValues_[1].get() + postIndex; + results_[1] = {out, 256}; break; } case Opcode::AArch64_LD2D: // ld2d {zt1.d, zt2.d}, pg/z, [, xm, @@ -2897,11 +4174,11 @@ void Instruction::execute() { case Opcode::AArch64_LD2D_IMM: { // ld2d {zt1.d, zt2.d}, pg/z, [{, // #imm, mul vl}] // LOAD - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 64; std::vector data = { - memoryData[0].getAsVector(), - memoryData[1].getAsVector()}; + memoryData_[0].getAsVector(), + memoryData_[1].getAsVector()}; uint64_t out[2][32] = {{0}, {0}}; for (int i = 0; i < partition_num; i++) { @@ -2916,45 +4193,45 @@ void Instruction::execute() { } } - for (int i = 0; i < 2; i++) results[i] = {out[i], 256}; + for (int i = 0; i < 2; i++) results_[i] = {out[i], 256}; break; } case Opcode::AArch64_LD2Twov4s: { // ld2 {vt1.4s, vt2.4s} [xn] - const float* region1 = memoryData[0].getAsVector(); - const float* region2 = memoryData[1].getAsVector(); + const float* region1 = memoryData_[0].getAsVector(); + const float* region2 = memoryData_[1].getAsVector(); // LD2 multistruct uses de-interleaving float t1[4] = {region1[0], region1[2], region2[0], region2[2]}; float t2[4] = {region1[1], region1[3], region2[1], region2[3]}; - results[0] = {t1, 256}; - results[1] = {t2, 256}; + results_[0] = {t1, 256}; + results_[1] = {t2, 256}; break; } case Opcode::AArch64_LD2Twov4s_POST: { // ld2 {vt1.4s, vt2.4s}, [xn], - // #imm + // // LOAD - const float* region1 = memoryData[0].getAsVector(); - const float* region2 = memoryData[1].getAsVector(); + const float* region1 = memoryData_[0].getAsVector(); + const float* region2 = memoryData_[1].getAsVector(); float t1[4] = {region1[0], region1[2], region2[0], region2[2]}; float t2[4] = {region1[1], region1[3], region2[1], region2[3]}; - results[0] = {t1, 256}; - results[1] = {t2, 256}; - uint64_t offset = 32; - if (metadata.operandCount == 4) { - offset = operands[3].get(); - } - results[2] = operands[2].get() + offset; + // #imm can only be 32 + const uint64_t offset = (metadata_.operands[3].type == AARCH64_OP_REG) + ? 
sourceValues_[1].get() + : 32; + results_[0] = sourceValues_[0].get() + offset; + results_[1] = {t1, 256}; + results_[2] = {t2, 256}; break; } case Opcode::AArch64_LD3D_IMM: { // ld3d {zt1.d, zt2.d, zt3.d}, pg/z, // [xn|sp{, #imm, MUL VL}] // LOAD - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 64; std::vector data = { - memoryData[0].getAsVector(), - memoryData[1].getAsVector(), - memoryData[2].getAsVector()}; + memoryData_[0].getAsVector(), + memoryData_[1].getAsVector(), + memoryData_[2].getAsVector()}; uint64_t out[3][32] = {{0}, {0}, {0}}; for (int i = 0; i < partition_num; i++) { @@ -2969,19 +4246,19 @@ void Instruction::execute() { } } - for (int i = 0; i < 3; i++) results[i] = {out[i], 256}; + for (int i = 0; i < 3; i++) results_[i] = {out[i], 256}; break; } case Opcode::AArch64_LD4D_IMM: { // ld4d {zt1.d, zt2.d, zt3.d, zt4.d}, // pg/z, [xn|sp{, #imm, MUL VL}] // LOAD - const uint64_t* p = operands[0].getAsVector(); + const uint64_t* p = sourceValues_[0].getAsVector(); const uint16_t partition_num = VL_bits / 64; std::vector data = { - memoryData[0].getAsVector(), - memoryData[1].getAsVector(), - memoryData[2].getAsVector(), - memoryData[3].getAsVector()}; + memoryData_[0].getAsVector(), + memoryData_[1].getAsVector(), + memoryData_[2].getAsVector(), + memoryData_[3].getAsVector()}; uint64_t out[4][32] = {{0}, {0}, {0}, {0}}; for (int i = 0; i < partition_num; i++) { @@ -2996,7 +4273,7 @@ void Instruction::execute() { } } - for (int i = 0; i < 4; i++) results[i] = {out[i], 256}; + for (int i = 0; i < 4; i++) results_[i] = {out[i], 256}; break; } case Opcode::AArch64_LDADDLW: // ldaddl ws, wt, [xn] @@ -3004,40 +4281,41 @@ void Instruction::execute() { [[fallthrough]]; case Opcode::AArch64_LDADDW: { // ldadd ws, wt, [xn] // LOAD - results[0] = memoryData[0].zeroExtend(4, 8); - memoryData[0] = RegisterValue( - memoryData[0].get() + operands[0].get(), 4); + results_[0] = memoryData_[0].zeroExtend(4, 8); + memoryData_[0] = RegisterValue( + memoryData_[0].get() + sourceValues_[0].get(), + 4); break; } case Opcode::AArch64_LDARB: { // ldarb wt, [xn] // LOAD - results[0] = memoryData[0].zeroExtend(1, 8); + results_[0] = memoryData_[0].zeroExtend(1, 8); break; } case Opcode::AArch64_LDARW: { // ldar wt, [xn] // LOAD - results[0] = memoryData[0].zeroExtend(4, 8); + results_[0] = memoryData_[0].zeroExtend(4, 8); break; } case Opcode::AArch64_LDARX: { // ldar xt, [xn] // LOAD - results[0] = memoryData[0]; + results_[0] = memoryData_[0]; break; } case Opcode::AArch64_LDAXRW: { // ldaxr wd, [xn] // LOAD - results[0] = memoryData[0].zeroExtend(4, 8); + results_[0] = memoryData_[0].zeroExtend(4, 8); break; } case Opcode::AArch64_LDAXRX: { // ldaxr xd, [xn] // LOAD - results[0] = memoryData[0]; + results_[0] = memoryData_[0]; break; } case Opcode::AArch64_LDNPSi: { // ldnp st1, st2, [xn, #imm] // LOAD - results[0] = memoryData[0].zeroExtend(4, 256); - results[1] = memoryData[1].zeroExtend(4, 256); + results_[0] = memoryData_[0].zeroExtend(4, 256); + results_[1] = memoryData_[1].zeroExtend(4, 256); break; } case Opcode::AArch64_LDPDi: // ldp dt1, dt2, [xn, #imm] @@ -3045,10 +4323,13 @@ void Instruction::execute() { case Opcode::AArch64_LDPSi: // ldp st1, st2, [xn, #imm] case Opcode::AArch64_LDPWi: // ldp wt1, wt2, [xn, #imm] case Opcode::AArch64_LDPXi: { // ldp xt1, xt2, [xn, #imm] - uint16_t regSize = - (isScalarData_ || isVectorData_ || isSVEData_) ? 
256 : 8; - results[0] = memoryData[0].zeroExtend(dataSize_, regSize); - results[1] = memoryData[1].zeroExtend(dataSize_, regSize); + uint16_t regSize = (isInstruction(InsnType::isScalarData) || + isInstruction(InsnType::isVectorData) || + isInstruction(InsnType::isSVEData)) + ? 256 + : 8; + results_[0] = memoryData_[0].zeroExtend(dataSize_, regSize); + results_[1] = memoryData_[1].zeroExtend(dataSize_, regSize); break; } case Opcode::AArch64_LDPDpost: // ldp dt1, dt2, [xn], #imm @@ -3056,11 +4337,15 @@ void Instruction::execute() { case Opcode::AArch64_LDPSpost: // ldp st1, st2, [xn], #imm case Opcode::AArch64_LDPWpost: // ldp wt1, wt2, [xn], #imm case Opcode::AArch64_LDPXpost: { // ldp xt1, xt2, [xn], #imm - uint16_t regSize = - (isScalarData_ || isVectorData_ || isSVEData_) ? 256 : 8; - results[0] = memoryData[0].zeroExtend(dataSize_, regSize); - results[1] = memoryData[1].zeroExtend(dataSize_, regSize); - results[2] = operands[0].get() + metadata.operands[3].imm; + uint16_t regSize = (isInstruction(InsnType::isScalarData) || + isInstruction(InsnType::isVectorData) || + isInstruction(InsnType::isSVEData)) + ? 256 + : 8; + results_[1] = memoryData_[0].zeroExtend(dataSize_, regSize); + results_[2] = memoryData_[1].zeroExtend(dataSize_, regSize); + results_[0] = + sourceValues_[0].get() + metadata_.operands[3].imm; break; } case Opcode::AArch64_LDPDpre: // ldp dt1, dt2, [xn, #imm]! @@ -3068,48 +4353,52 @@ void Instruction::execute() { case Opcode::AArch64_LDPSpre: // ldp st1, st2, [xn, #imm]! case Opcode::AArch64_LDPWpre: // ldp wt1, wt2, [xn, #imm]! case Opcode::AArch64_LDPXpre: { // ldp xt1, xt2, [xn, #imm]! - uint16_t regSize = - (isScalarData_ || isVectorData_ || isSVEData_) ? 256 : 8; - results[0] = memoryData[0].zeroExtend(dataSize_, regSize); - results[1] = memoryData[1].zeroExtend(dataSize_, regSize); - results[2] = - operands[0].get() + metadata.operands[2].mem.disp; + uint16_t regSize = (isInstruction(InsnType::isScalarData) || + isInstruction(InsnType::isVectorData) || + isInstruction(InsnType::isSVEData)) + ? 256 + : 8; + results_[1] = memoryData_[0].zeroExtend(dataSize_, regSize); + results_[2] = memoryData_[1].zeroExtend(dataSize_, regSize); + results_[0] = + sourceValues_[0].get() + metadata_.operands[2].mem.disp; break; } case Opcode::AArch64_LDPSWi: { // ldpsw xt1, xt2, [xn {, #imm}] // LOAD - results[0] = memoryData[0].zeroExtend(4, 8); - results[1] = memoryData[1].zeroExtend(4, 8); + results_[0] = memoryData_[0].zeroExtend(4, 8); + results_[1] = memoryData_[1].zeroExtend(4, 8); break; } case Opcode::AArch64_LDRBBpost: { // ldrb wt, [xn], #imm // LOAD - results[0] = memoryData[0].zeroExtend(1, 8); - results[1] = operands[0].get() + metadata.operands[2].imm; + results_[1] = memoryData_[0].zeroExtend(1, 8); + results_[0] = + sourceValues_[0].get() + metadata_.operands[2].imm; break; } case Opcode::AArch64_LDRBBpre: { // ldrb wt, [xn, #imm]! 
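As a reviewer aside: the post-indexed ("[xn], #imm") and pre-indexed ("[xn, #imm]!") scalar load cases above all share one writeback pattern, and this change also moves the updated base address into `results_[0]` with the loaded data following it. Below is a minimal sketch of the addressing arithmetic only, using hypothetical `base`/`disp` names rather than SimEng's actual operand layout; the arithmetic itself is unchanged by this diff, only the result ordering differs.

```cpp
#include <cstdint>

// Illustrative only: post-index vs pre-index addressing for a load.
// 'base' and 'disp' are stand-ins for the base register value and the
// decoded displacement; SimEng's RegisterValue API is not assumed.
uint64_t postIndexAccess(uint64_t base, int64_t disp, uint64_t& writeback) {
  writeback = base + disp;  // base register is updated after the access
  return base;              // the memory access uses the original base
}

uint64_t preIndexAccess(uint64_t base, int64_t disp, uint64_t& writeback) {
  writeback = base + disp;  // base register is updated first
  return writeback;         // the memory access uses the updated address
}
```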
// LOAD - results[0] = memoryData[0].zeroExtend(1, 8); - results[1] = - operands[0].get() + metadata.operands[1].mem.disp; + results_[1] = memoryData_[0].zeroExtend(1, 8); + results_[0] = + sourceValues_[0].get() + metadata_.operands[1].mem.disp; break; } case Opcode::AArch64_LDRBBroW: { // ldrb wt, // [xn, wm{, extend {#amount}}] // LOAD - results[0] = memoryData[0].zeroExtend(1, 8); + results_[0] = memoryData_[0].zeroExtend(1, 8); break; } case Opcode::AArch64_LDRBBroX: { // ldrb wt, // [xn, xm{, extend {#amount}}] // LOAD - results[0] = memoryData[0].zeroExtend(1, 8); + results_[0] = memoryData_[0].zeroExtend(1, 8); break; } case Opcode::AArch64_LDRBBui: { // ldrb wt, [xn, #imm] // LOAD - results[0] = memoryData[0].zeroExtend(1, 8); + results_[0] = memoryData_[0].zeroExtend(1, 8); break; } case Opcode::AArch64_LDRBui: // ldr bt, [xn, #imm] @@ -3119,9 +4408,12 @@ void Instruction::execute() { case Opcode::AArch64_LDRSui: // ldr st, [xn, #imm] case Opcode::AArch64_LDRWui: // ldr wt, [xn, #imm] case Opcode::AArch64_LDRXui: { // ldr xt, [xn, #imm] - uint16_t regSize = - (isScalarData_ || isVectorData_ || isSVEData_) ? 256 : 8; - results[0] = memoryData[0].zeroExtend(dataSize_, regSize); + uint16_t regSize = (isInstruction(InsnType::isScalarData) || + isInstruction(InsnType::isVectorData) || + isInstruction(InsnType::isSVEData)) + ? 256 + : 8; + results_[0] = memoryData_[0].zeroExtend(dataSize_, regSize); break; } case Opcode::AArch64_LDRBpost: // ldr bt, [xn], #imm @@ -3131,10 +4423,14 @@ void Instruction::execute() { case Opcode::AArch64_LDRSpost: // ldr st, [xn], #imm case Opcode::AArch64_LDRWpost: // ldr wt, [xn], #imm case Opcode::AArch64_LDRXpost: { // ldr xt, [xn], #imm - uint16_t regSize = - (isScalarData_ || isVectorData_ || isSVEData_) ? 256 : 8; - results[0] = memoryData[0].zeroExtend(dataSize_, regSize); - results[1] = operands[0].get() + metadata.operands[2].imm; + uint16_t regSize = (isInstruction(InsnType::isScalarData) || + isInstruction(InsnType::isVectorData) || + isInstruction(InsnType::isSVEData)) + ? 256 + : 8; + results_[1] = memoryData_[0].zeroExtend(dataSize_, regSize); + results_[0] = + sourceValues_[0].get() + metadata_.operands[2].imm; break; } case Opcode::AArch64_LDRBpre: // ldr bt, [xn, #imm]! @@ -3144,178 +4440,186 @@ void Instruction::execute() { case Opcode::AArch64_LDRSpre: // ldr st, [xn, #imm]! case Opcode::AArch64_LDRWpre: // ldr wt, [xn, #imm]! case Opcode::AArch64_LDRXpre: { // ldr xt, [xn, #imm]! - uint16_t regSize = - (isScalarData_ || isVectorData_ || isSVEData_) ? 256 : 8; - results[0] = memoryData[0].zeroExtend(dataSize_, regSize); - results[1] = - operands[0].get() + metadata.operands[1].mem.disp; + uint16_t regSize = (isInstruction(InsnType::isScalarData) || + isInstruction(InsnType::isVectorData) || + isInstruction(InsnType::isSVEData)) + ? 
256 + : 8; + results_[1] = memoryData_[0].zeroExtend(dataSize_, regSize); + results_[0] = + sourceValues_[0].get() + metadata_.operands[1].mem.disp; break; } case Opcode::AArch64_LDRDroW: { // ldr dt, [xn, wm, {extend {#amount}}] // LOAD - results[0] = memoryData[0].zeroExtend(memoryAddresses[0].size, 256); + results_[0] = memoryData_[0].zeroExtend(memoryAddresses_[0].size, 256); break; } case Opcode::AArch64_LDRDroX: { // ldr dt, [xn, xm, {extend {#amount}}] // LOAD - results[0] = memoryData[0].zeroExtend(memoryAddresses[0].size, 256); + results_[0] = memoryData_[0].zeroExtend(memoryAddresses_[0].size, 256); break; } case Opcode::AArch64_LDRHHpost: { // ldrh wt, [xn], #imm // LOAD - results[0] = memoryData[0].zeroExtend(2, 8); - results[1] = operands[0].get() + metadata.operands[2].imm; + results_[1] = memoryData_[0].zeroExtend(2, 8); + results_[0] = + sourceValues_[0].get() + metadata_.operands[2].imm; break; } case Opcode::AArch64_LDRHHpre: { // ldrh wt, [xn, #imm]! // LOAD - results[0] = memoryData[0].zeroExtend(2, 8); - results[1] = - operands[0].get() + metadata.operands[1].mem.disp; + results_[1] = memoryData_[0].zeroExtend(2, 8); + results_[0] = + sourceValues_[0].get() + metadata_.operands[1].mem.disp; break; } case Opcode::AArch64_LDRHHroW: { // ldrh wt, [xn, wm, {extend {#amount}}] // LOAD - results[0] = memoryData[0].zeroExtend(2, 8); + results_[0] = memoryData_[0].zeroExtend(2, 8); break; } case Opcode::AArch64_LDRHHroX: { // ldrh wt, [xn, xm, {extend {#amount}}] // LOAD - results[0] = memoryData[0].zeroExtend(2, 8); + results_[0] = memoryData_[0].zeroExtend(2, 8); break; } case Opcode::AArch64_LDRHHui: { // ldrh wt, [xn, #imm] // LOAD - results[0] = memoryData[0].zeroExtend(2, 8); + results_[0] = memoryData_[0].zeroExtend(2, 8); break; } case Opcode::AArch64_LDRQroX: { // ldr qt, [xn, xm, {extend {#amount}}] // LOAD - results[0] = memoryData[0].zeroExtend(16, 256); + results_[0] = memoryData_[0].zeroExtend(16, 256); break; } case Opcode::AArch64_LDRSBWroX: { // ldrsb wt, [xn, xm{, extend // {#amount}}] // LOAD - results[0] = - RegisterValue(static_cast(memoryData[0].get()), 4) + results_[0] = + RegisterValue(static_cast(memoryData_[0].get()), 4) .zeroExtend(4, 8); break; } case Opcode::AArch64_LDRSBWui: { // ldrsb wt, [xn, #imm] // LOAD - results[0] = - RegisterValue(static_cast(memoryData[0].get())) + results_[0] = + RegisterValue(static_cast(memoryData_[0].get())) .zeroExtend(4, 8); break; } case Opcode::AArch64_LDRSBXui: { // ldrsb xt, [xn, #imm] // LOAD - results[0] = static_cast(memoryData[0].get()); + results_[0] = static_cast(memoryData_[0].get()); break; } case Opcode::AArch64_LDRSHWroW: { // ldrsh wt, [xn, wm{, extend // {#amount}}] // LOAD - results[0] = - RegisterValue(static_cast(memoryData[0].get()), 4) + results_[0] = + RegisterValue(static_cast(memoryData_[0].get()), + 4) .zeroExtend(4, 8); break; } case Opcode::AArch64_LDRSHWroX: { // ldrsh wt, [xn, xm{, extend // {#amount}}] // LOAD - results[0] = - RegisterValue(static_cast(memoryData[0].get()), 4) + results_[0] = + RegisterValue(static_cast(memoryData_[0].get()), + 4) .zeroExtend(4, 8); break; } case Opcode::AArch64_LDRSHWui: { // ldrsh wt, [xn, #imm] // LOAD - results[0] = - RegisterValue(static_cast(memoryData[0].get()), 4) + results_[0] = + RegisterValue(static_cast(memoryData_[0].get()), + 4) .zeroExtend(4, 8); break; } case Opcode::AArch64_LDRSHXroW: { // ldrsh xt, [xn, wm{, extend // {#amount}}] // LOAD - results[0] = static_cast(memoryData[0].get()); + results_[0] = static_cast(memoryData_[0].get()); 
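The sign-extending load cases around this point (ldrsb/ldrsh/ldrsw) repeat one idiom: sign-extend the narrow memory value to the destination width, then zero-extend W-form results into the 64-bit register slot. A standalone sketch of the halfword case follows; it deliberately avoids SimEng's RegisterValue API and only restates the architectural semantics.

```cpp
#include <cstdint>

// Illustrative only: ldrsh wt, [...] applied to a raw 16-bit memory value.
// The halfword is sign-extended to 32 bits and the upper 32 bits of the
// X register are then cleared, matching the W-form writeback.
uint64_t ldrshToWReg(uint16_t mem) {
  const int32_t signExtended = static_cast<int32_t>(static_cast<int16_t>(mem));
  return static_cast<uint64_t>(static_cast<uint32_t>(signExtended));
}

// Illustrative only: ldrsh xt, [...] keeps the full 64-bit sign extension.
uint64_t ldrshToXReg(uint16_t mem) {
  return static_cast<uint64_t>(
      static_cast<int64_t>(static_cast<int16_t>(mem)));
}
```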
break; } case Opcode::AArch64_LDRSHXroX: { // ldrsh xt, [xn, xm{, extend // {#amount}}] // LOAD - results[0] = static_cast(memoryData[0].get()); + results_[0] = static_cast(memoryData_[0].get()); break; } case Opcode::AArch64_LDRSHXui: { // ldrsh xt, [xn, #imm] // LOAD - results[0] = static_cast(memoryData[0].get()); + results_[0] = static_cast(memoryData_[0].get()); break; } case Opcode::AArch64_LDRSWl: { // ldrsw xt, #imm // LOAD - results[0] = memoryData[0].zeroExtend(4, 8); + results_[0] = memoryData_[0].zeroExtend(4, 8); break; } case Opcode::AArch64_LDRSWpost: { // ldrsw xt, [xn], #simm // LOAD - results[0] = static_cast(memoryData[0].get()); - results[1] = operands[0].get() + metadata.operands[2].imm; + results_[1] = static_cast(memoryData_[0].get()); + results_[0] = + sourceValues_[0].get() + metadata_.operands[2].imm; break; } case Opcode::AArch64_LDRSWroX: { // ldrsw xt, [xn, xm{, extend // {#amount}}] // LOAD - results[0] = static_cast(memoryData[0].get()); + results_[0] = static_cast(memoryData_[0].get()); break; } case Opcode::AArch64_LDRSWui: { // ldrsw xt, [xn{, #pimm}] // LOAD - results[0] = static_cast(memoryData[0].get()); + results_[0] = static_cast(memoryData_[0].get()); break; } case Opcode::AArch64_LDRSroW: { // ldr st, [xn, wm, {extend {#amount}}] // LOAD - results[0] = memoryData[0].zeroExtend(4, 256); + results_[0] = memoryData_[0].zeroExtend(4, 256); break; } case Opcode::AArch64_LDRSroX: { // ldr st, [xn, xm, {extend {#amount}}] // LOAD - results[0] = memoryData[0].zeroExtend(4, 256); + results_[0] = memoryData_[0].zeroExtend(4, 256); break; } case Opcode::AArch64_LDRWroW: { // ldr wt, [xn, wm, {extend {#amount}}] // LOAD - results[0] = memoryData[0].zeroExtend(4, 8); + results_[0] = memoryData_[0].zeroExtend(4, 8); break; } case Opcode::AArch64_LDRWroX: { // ldr wt, [xn, xm, {extend {#amount}}] // LOAD - results[0] = memoryData[0].zeroExtend(4, 8); + results_[0] = memoryData_[0].zeroExtend(4, 8); break; } case Opcode::AArch64_LDRXl: { // ldr xt, #imm // LOAD - results[0] = memoryData[0]; + results_[0] = memoryData_[0]; break; } case Opcode::AArch64_LDRXroW: { // ldr xt, [xn, wn{, extend {#amount}}] // LOAD - results[0] = memoryData[0]; + results_[0] = memoryData_[0]; break; } case Opcode::AArch64_LDRXroX: { // ldr xt, [xn, xn{, extend {#amount}}] // LOAD - results[0] = memoryData[0]; + results_[0] = memoryData_[0]; break; } case Opcode::AArch64_LDR_PXI: { // ldr pt, [xn{, #imm, mul vl}] // LOAD const uint64_t PL_bits = VL_bits / 8; const uint16_t partition_num = PL_bits / 8; - const uint8_t* memData = memoryData[0].getAsVector(); + const uint8_t* memData = memoryData_[0].getAsVector(); uint64_t out[4] = {0}; for (int i = 0; i < partition_num; i++) { @@ -3324,232 +4628,256 @@ void Instruction::execute() { out[i / 8] |= (data & (1 << j)) ? 
1ull << ((j + (i * 8)) % 64) : 0; } } - results[0] = out; + results_[0] = out; break; } case Opcode::AArch64_LDR_ZXI: { // ldr zt, [xn{, #imm, mul vl}] // LOAD const uint16_t partition_num = VL_bits / 8; uint8_t out[256] = {0}; - const uint8_t* data = memoryData[0].getAsVector(); + const uint8_t* data = memoryData_[0].getAsVector(); for (int i = 0; i < partition_num; i++) { out[i] = data[i]; } - results[0] = {out, 256}; + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_LDR_ZA: { // ldr za[wv, #imm], [{, #imm, mul + // vl}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 8; + const uint32_t wn = sourceValues_[rowCount].get(); + const uint32_t sliceNum = + wn + + static_cast(metadata_.operands[0].sme.slice_offset.imm); + + const uint8_t* data = memoryData_[0].getAsVector(); + uint8_t out[256] = {0}; + for (uint16_t i = 0; i < rowCount; i++) { + out[i] = data[i]; + } + + for (uint16_t row = 0; row < rowCount; row++) { + results_[row] = (row == sliceNum) + ? RegisterValue(out, 256) + : results_[row] = sourceValues_[row]; + } break; } case Opcode::AArch64_LDTRSBXi: { // ldtrsb xt, [xn, #imm] // LOAD // TODO: implement - results[0] = RegisterValue(0, 8); + results_[0] = RegisterValue(0, 8); break; } case Opcode::AArch64_LDURBBi: { // ldurb wt, [xn, #imm] // LOAD - results[0] = memoryData[0].zeroExtend(1, 8); + results_[0] = memoryData_[0].zeroExtend(1, 8); break; } case Opcode::AArch64_LDURDi: { // ldur dt, [xn, #imm] // LOAD - results[0] = memoryData[0].zeroExtend(8, 256); + results_[0] = memoryData_[0].zeroExtend(8, 256); break; } case Opcode::AArch64_LDURHHi: { // ldurh wt, [xn, #imm] // LOAD - results[0] = memoryData[0].zeroExtend(2, 8); + results_[0] = memoryData_[0].zeroExtend(2, 8); break; } case Opcode::AArch64_LDURQi: { // ldur qt, [xn, #imm] // LOAD - results[0] = memoryData[0].zeroExtend(16, 256); + results_[0] = memoryData_[0].zeroExtend(16, 256); break; } case Opcode::AArch64_LDURSWi: { // ldursw xt, [xn, #imm] // LOAD - results[0] = static_cast(memoryData[0].get()); + results_[0] = static_cast(memoryData_[0].get()); break; } case Opcode::AArch64_LDURSi: { // ldur sd, [{, #imm}] // LOAD - results[0] = {memoryData[0].get(), 256}; + results_[0] = {memoryData_[0].get(), 256}; break; } case Opcode::AArch64_LDURWi: { // ldur wt, [xn, #imm] // LOAD - results[0] = memoryData[0].zeroExtend(4, 8); + results_[0] = memoryData_[0].zeroExtend(4, 8); break; } case Opcode::AArch64_LDURXi: { // ldur xt, [xn, #imm] // LOAD - results[0] = memoryData[0]; + results_[0] = memoryData_[0]; break; } case Opcode::AArch64_LDXRW: { // ldxr wt, [xn] // LOAD - results[0] = memoryData[0].zeroExtend(4, 8); + results_[0] = memoryData_[0].zeroExtend(4, 8); break; } case Opcode::AArch64_LDXRX: { // ldxr xt, [xn] // LOAD - results[0] = memoryData[0]; + results_[0] = memoryData_[0]; break; } case Opcode::AArch64_LSLVWr: { // lslv wd, wn, wm - results[0] = { - logicalHelp::logicalShiftLR_3ops(operands, true), 8}; + results_[0] = {logicalShiftLR_3ops(sourceValues_, true), 8}; break; } case Opcode::AArch64_LSLVXr: { // lslv xd, xn, xm - results[0] = logicalHelp::logicalShiftLR_3ops(operands, true); + results_[0] = logicalShiftLR_3ops(sourceValues_, true); break; } case Opcode::AArch64_LSL_ZZI_S: { // lsl zd.s, zn.s, #imm - results[0] = sveHelp::sveLsl_imm(operands, metadata, VL_bits); + results_[0] = sveLsl_imm(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_LSRVWr: { // lsrv wd, wn, wm - 
results[0] = { - logicalHelp::logicalShiftLR_3ops(operands, false), 8}; + results_[0] = {logicalShiftLR_3ops(sourceValues_, false), 8}; break; } case Opcode::AArch64_LSRVXr: { // lsrv xd, xn, xm - results[0] = - logicalHelp::logicalShiftLR_3ops(operands, false); + results_[0] = logicalShiftLR_3ops(sourceValues_, false); break; } case Opcode::AArch64_MADDWrrr: { // madd wd, wn, wm, wa - results[0] = {multiplyHelp::madd_4ops(operands), 8}; + results_[0] = {madd_4ops(sourceValues_), 8}; break; } case Opcode::AArch64_MADDXrrr: { // madd xd, xn, xm, xa - results[0] = multiplyHelp::madd_4ops(operands); + results_[0] = madd_4ops(sourceValues_); break; } case Opcode::AArch64_MLA_ZPmZZ_B: { // mla zda.b, pg/m, zn.b, zm.b - results[0] = sveHelp::sveMlaPredicated_vecs(operands, VL_bits); + results_[0] = sveMlaPredicated_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_MLA_ZPmZZ_D: { // mla zda.d, pg/m, zn.d, zm.d - results[0] = - sveHelp::sveMlaPredicated_vecs(operands, VL_bits); + results_[0] = sveMlaPredicated_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_MLA_ZPmZZ_H: { // mla zda.h, pg/m, zn.h, zm.h - results[0] = - sveHelp::sveMlaPredicated_vecs(operands, VL_bits); + results_[0] = sveMlaPredicated_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_MLA_ZPmZZ_S: { // mla zda.s, pg/m, zn.s, zm.s - results[0] = - sveHelp::sveMlaPredicated_vecs(operands, VL_bits); + results_[0] = sveMlaPredicated_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_MOVID: { // movi dd, #imm - results[0] = {static_cast(metadata.operands[1].imm), 256}; + results_[0] = {static_cast(metadata_.operands[1].imm), 256}; break; } case Opcode::AArch64_MOVIv16b_ns: { // movi vd.16b, #imm - results[0] = neonHelp::vecMovi_imm(metadata); + results_[0] = vecMovi_imm(metadata_); break; } case Opcode::AArch64_MOVIv2d_ns: { // movi vd.2d, #imm - results[0] = neonHelp::vecMovi_imm(metadata); + results_[0] = vecMovi_imm(metadata_); break; } case Opcode::AArch64_MOVIv2i32: { // movi vd.2s, #imm{, lsl #shift} - results[0] = neonHelp::vecMoviShift_imm(metadata, false); + results_[0] = vecMoviShift_imm(metadata_, false); break; } case Opcode::AArch64_MOVIv4i32: { // movi vd.4s, #imm{, LSL #shift} - results[0] = neonHelp::vecMoviShift_imm(metadata, false); + results_[0] = vecMoviShift_imm(metadata_, false); break; } case Opcode::AArch64_MOVIv8b_ns: { // movi vd.8b, #imm - results[0] = neonHelp::vecMovi_imm(metadata); + results_[0] = vecMovi_imm(metadata_); break; } case Opcode::AArch64_MOVKWi: { // movk wd, #imm - results[0] = { - arithmeticHelp::movkShift_imm(operands, metadata), 8}; + results_[0] = {movkShift_imm(sourceValues_, metadata_), 8}; break; } case Opcode::AArch64_MOVKXi: { // movk xd, #imm - results[0] = - arithmeticHelp::movkShift_imm(operands, metadata); + results_[0] = movkShift_imm(sourceValues_, metadata_); break; } case Opcode::AArch64_MOVNWi: { // movn wd, #imm{, LSL #shift} - results[0] = {arithmeticHelp::movnShift_imm( - metadata, [](uint64_t x) -> uint32_t { return ~x; }), - 8}; + results_[0] = { + movnShift_imm(metadata_, + [](uint64_t x) -> uint32_t { return ~x; }), + 8}; break; } case Opcode::AArch64_MOVNXi: { // movn xd, #imm{, LSL #shift} - results[0] = arithmeticHelp::movnShift_imm( - metadata, [](uint64_t x) -> uint64_t { return ~x; }); + results_[0] = movnShift_imm( + metadata_, [](uint64_t x) -> uint64_t { return ~x; }); break; } case Opcode::AArch64_MOVPRFX_ZPmZ_D: { // movprfx zd.d, pg/m, zn.d - results[0] = sveHelp::sveMovprfxPredicated_destUnchanged( - operands, 
VL_bits); + results_[0] = sveMovprfxPredicated_destUnchanged( + sourceValues_, VL_bits); break; } case Opcode::AArch64_MOVPRFX_ZPzZ_D: { // movprfx zd.d, pg/z, zn.d - results[0] = sveHelp::sveMovprfxPredicated_destToZero( - operands, VL_bits); + results_[0] = + sveMovprfxPredicated_destToZero(sourceValues_, VL_bits); break; } case Opcode::AArch64_MOVPRFX_ZPzZ_S: { // movprfx zd.s, pg/z, zn.s - results[0] = sveHelp::sveMovprfxPredicated_destToZero( - operands, VL_bits); + results_[0] = + sveMovprfxPredicated_destToZero(sourceValues_, VL_bits); break; } case Opcode::AArch64_MOVPRFX_ZZ: { // movprfx zd, zn // TODO: Adopt hint logic of the MOVPRFX instruction - results[0] = operands[0]; + results_[0] = sourceValues_[0]; break; } case Opcode::AArch64_MOVZWi: { // movz wd, #imm - results[0] = {arithmeticHelp::movnShift_imm( - metadata, [](uint64_t x) -> uint32_t { return x; }), - 8}; + results_[0] = {movnShift_imm( + metadata_, [](uint64_t x) -> uint32_t { return x; }), + 8}; break; } case Opcode::AArch64_MOVZXi: { // movz xd, #imm - results[0] = arithmeticHelp::movnShift_imm( - metadata, [](uint64_t x) -> uint64_t { return x; }); + results_[0] = movnShift_imm( + metadata_, [](uint64_t x) -> uint64_t { return x; }); break; } case Opcode::AArch64_MRS: { // mrs xt, (systemreg|Sop0_op1_Cn_Cm_op2) - results[0] = operands[0]; + results_[0] = sourceValues_[0]; break; } case Opcode::AArch64_MSR: { // msr (systemreg|Sop0_op1_Cn_Cm_op2), xt - results[0] = operands[0]; + // Handle case where SVCR is being updated as this invokes additional + // functionality + if (metadata_.operands[0].sysop.reg.sysreg == AARCH64_SYSREG_SVCR) { + return SMZAupdated(); + } else { + results_[0] = sourceValues_[0]; + } break; } case Opcode::AArch64_MSUBWrrr: { // msub wd, wn, wm, wa - results[0] = {multiplyHelp::msub_4ops(operands), 8}; + results_[0] = {msub_4ops(sourceValues_), 8}; break; } case Opcode::AArch64_MSUBXrrr: { // msub xd, xn, xm, xa - results[0] = multiplyHelp::msub_4ops(operands); + results_[0] = msub_4ops(sourceValues_); break; } case Opcode::AArch64_MSRpstatesvcrImm1: { // msr svcr, #imm // This instruction is always used by SMSTART and SMSTOP aliases. const uint64_t svcrBits = - static_cast(metadata.operands[0].svcr); + static_cast(metadata_.operands[0].sysop.alias.svcr); // Changing value of SM or ZA bits in SVCR zeros out vector, predicate, // and ZA registers. Raise an exception to do this. 
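+          // For reference: the plain `smstart`/`smstop` aliases encode
+          // SVCRSMZA (both bits), while `smstart sm` / `smstart za` encode
+          // SVCRSM and SVCRZA respectively, so each case below defers to the
+          // matching exception routine.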
switch (svcrBits) { - case ARM64_SVCR_SVCRSM: + case AARCH64_SVCR_SVCRSM: return streamingModeUpdated(); - case ARM64_SVCR_SVCRZA: + case AARCH64_SVCR_SVCRZA: return zaRegisterStatusUpdated(); - case ARM64_SVCR_SVCRSMZA: + case AARCH64_SVCR_SVCRSMZA: return SMZAupdated(); default: // Invalid instruction @@ -3557,513 +4885,675 @@ void Instruction::execute() { } } case Opcode::AArch64_MUL_ZPmZ_B: { // mul zdn.b, pg/m, zdn.b, zm.b - results[0] = sveHelp::sveMulPredicated(operands, metadata, - VL_bits, false); + results_[0] = + sveMulPredicated(sourceValues_, metadata_, VL_bits, false); break; } case Opcode::AArch64_MUL_ZPmZ_D: { // mul zdn.d, pg/m, zdn.d, zm.d - results[0] = sveHelp::sveMulPredicated(operands, metadata, - VL_bits, false); + results_[0] = sveMulPredicated(sourceValues_, metadata_, + VL_bits, false); break; } case Opcode::AArch64_MUL_ZPmZ_H: { // mul zdn.h, pg/m, zdn.h, zm.h - results[0] = sveHelp::sveMulPredicated(operands, metadata, - VL_bits, false); + results_[0] = sveMulPredicated(sourceValues_, metadata_, + VL_bits, false); break; } case Opcode::AArch64_MUL_ZPmZ_S: { // mul zdn.s, pg/m, zdn.s, zm.s - results[0] = sveHelp::sveMulPredicated(operands, metadata, - VL_bits, false); + results_[0] = sveMulPredicated(sourceValues_, metadata_, + VL_bits, false); break; } case Opcode::AArch64_MVNIv2i32: { // mvni vd.2s, #imm{, lsl #shift} - results[0] = neonHelp::vecMoviShift_imm(metadata, true); + results_[0] = vecMoviShift_imm(metadata_, true); break; } case Opcode::AArch64_MVNIv2s_msl: { // mvni vd.2s, #imm, msl #amount - results[0] = neonHelp::vecMoviShift_imm(metadata, true); + results_[0] = vecMoviShift_imm(metadata_, true); break; } case Opcode::AArch64_MVNIv4i16: { // mvni vd.4h, #imm{, lsl #shift} - results[0] = neonHelp::vecMoviShift_imm(metadata, true); + results_[0] = vecMoviShift_imm(metadata_, true); break; } case Opcode::AArch64_MVNIv4i32: { // mvni vd.4s, #imm{, lsl #shift} - results[0] = neonHelp::vecMoviShift_imm(metadata, true); + results_[0] = vecMoviShift_imm(metadata_, true); break; } case Opcode::AArch64_MVNIv4s_msl: { // mvni vd.4s #imm, msl #amount - results[0] = neonHelp::vecMoviShift_imm(metadata, true); + results_[0] = vecMoviShift_imm(metadata_, true); break; } case Opcode::AArch64_MVNIv8i16: { // mvni vd.8h, #imm{, lsl #shift} - results[0] = neonHelp::vecMoviShift_imm(metadata, true); + results_[0] = vecMoviShift_imm(metadata_, true); break; } case Opcode::AArch64_NEGv2i64: { // neg vd.2d, vn.2d - results[0] = neonHelp::vecFneg_2ops(operands); + results_[0] = vecFneg_2ops(sourceValues_); break; } case Opcode::AArch64_NOTv16i8: { // not vd.16b, vn.16b - results[0] = neonHelp::vecLogicOp_2vecs( - operands, [](uint8_t x) -> uint8_t { return ~x; }); + results_[0] = vecLogicOp_2vecs( + sourceValues_, [](uint8_t x) -> uint8_t { return ~x; }); break; } case Opcode::AArch64_NOTv8i8: { // not vd.8b, vn.8b - results[0] = neonHelp::vecLogicOp_2vecs( - operands, [](uint8_t x) -> uint8_t { return ~x; }); + results_[0] = vecLogicOp_2vecs( + sourceValues_, [](uint8_t x) -> uint8_t { return ~x; }); break; } case Opcode::AArch64_ORNWrs: { // orn wd, wn, wm{, shift{ #amount}} - auto [result, nzcv] = logicalHelp::logicOpShift_3ops( - operands, metadata, false, + auto [result, nzcv] = logicOpShift_3ops( + sourceValues_, metadata_, false, [](uint32_t x, uint32_t y) -> uint32_t { return x | (~y); }); - results[0] = {result, 8}; + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = {result, 8}; break; } case Opcode::AArch64_ORNXrs: { // orn xd, xn, xm{, 
shift{ #amount}} - auto [result, nzcv] = logicalHelp::logicOpShift_3ops( - operands, metadata, false, + auto [result, nzcv] = logicOpShift_3ops( + sourceValues_, metadata_, false, [](uint64_t x, uint64_t y) -> uint64_t { return x | (~y); }); - results[0] = result; + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = result; break; } case Opcode::AArch64_ORRWri: { // orr wd, wn, #imm - auto [result, nzcv] = logicalHelp::logicOp_imm( - operands, metadata, false, + auto [result, nzcv] = logicOp_imm( + sourceValues_, metadata_, false, [](uint32_t x, uint32_t y) -> uint32_t { return x | y; }); - results[0] = {result, 8}; + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = {result, 8}; break; } case Opcode::AArch64_ORRWrs: { // orr wd, wn, wm{, shift{ #amount}} - results[0] = { - comparisonHelp::orrShift_3ops(operands, metadata), 8}; + results_[0] = {orrShift_3ops(sourceValues_, metadata_), 8}; break; } case Opcode::AArch64_ORRXri: { // orr xd, xn, #imm - auto [result, nzcv] = logicalHelp::logicOp_imm( - operands, metadata, false, + auto [result, nzcv] = logicOp_imm( + sourceValues_, metadata_, false, [](uint64_t x, uint64_t y) -> uint64_t { return x | y; }); - results[0] = {result, 8}; + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = {result, 8}; break; } case Opcode::AArch64_ORRXrs: { // orr xd, xn, xm{, shift{ #amount}} - results[0] = - comparisonHelp::orrShift_3ops(operands, metadata); + results_[0] = orrShift_3ops(sourceValues_, metadata_); break; } case Opcode::AArch64_ORR_PPzPP: { // orr pd.b, pg/z, pn.b, pm.b - results[0] = sveHelp::sveLogicOp_preds( - operands, VL_bits, + results_[0] = sveLogicOp_preds( + sourceValues_, VL_bits, [](uint64_t x, uint64_t y) -> uint64_t { return x | y; }); break; } case Opcode::AArch64_ORR_ZZZ: { // orr zd.d, zn.d, zm.d - results[0] = sveHelp::sveOrr_3vecs(operands, VL_bits); + results_[0] = sveOrr_3vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_ORRv16i8: { // orr vd.16b, Vn.16b, Vm.16b - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](uint8_t x, uint8_t y) -> uint8_t { return x | y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, + [](uint8_t x, uint8_t y) -> uint8_t { return x | y; }); break; } case Opcode::AArch64_ORRv8i8: { // orr vd.8b, vn.8b, vm.8b - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](uint8_t x, uint8_t y) -> uint8_t { return x | y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, + [](uint8_t x, uint8_t y) -> uint8_t { return x | y; }); break; } case Opcode::AArch64_PFALSE: { // pfalse pd.b uint64_t out[4] = {0, 0, 0, 0}; - results[0] = out; + results_[0] = out; break; } case Opcode::AArch64_PRFMui: { // prfm op, [xn, xm{, extend{, #amount}}] break; } case Opcode::AArch64_PSEL_PPPRI_B: { // psel pd, pn, pm.b[wa, #imm] - results[0] = sveHelp::svePsel(operands, metadata); + results_[0] = svePsel(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_PSEL_PPPRI_D: { // psel pd, pn, pm.d[wa, #imm] - results[0] = sveHelp::svePsel(operands, metadata); + results_[0] = svePsel(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_PSEL_PPPRI_H: { // psel pd, pn, pm.h[wa, #imm] - results[0] = sveHelp::svePsel(operands, metadata); + results_[0] = svePsel(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_PSEL_PPPRI_S: { // psel pd, pn, pm.s[wa, #imm] - results[0] = sveHelp::svePsel(operands, metadata); + results_[0] = svePsel(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_PTEST_PP: { 
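+        // The only architectural result is the NZCV flag set, derived from
+        // the bitwise AND of the governing predicate (pg) and the source
+        // predicate (pn).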
// ptest pg, pn.b - const uint64_t* g = operands[0].getAsVector(); - const uint64_t* s = operands[1].getAsVector(); + const uint64_t* g = sourceValues_[0].getAsVector(); + const uint64_t* s = sourceValues_[1].getAsVector(); std::array masked_n = {(g[0] & s[0]), (g[1] & s[1]), (g[2] & s[2]), (g[3] & s[3])}; // Byte count = 1 as destination predicate is regarding single bytes. - results[0] = AuxFunc::getNZCVfromPred(masked_n, VL_bits, 1); + results_[0] = getNZCVfromPred(masked_n, VL_bits, 1); break; } case Opcode::AArch64_PTRUE_B: { // ptrue pd.b{, pattern} - results[0] = sveHelp::svePtrue(metadata, VL_bits); + results_[0] = svePtrue(metadata_, VL_bits); break; } case Opcode::AArch64_PTRUE_D: { // ptrue pd.d{, pattern} - results[0] = sveHelp::svePtrue(metadata, VL_bits); + results_[0] = svePtrue(metadata_, VL_bits); break; } case Opcode::AArch64_PTRUE_H: { // ptrue pd.h{, pattern} - results[0] = sveHelp::svePtrue(metadata, VL_bits); + results_[0] = svePtrue(metadata_, VL_bits); break; } case Opcode::AArch64_PTRUE_S: { // ptrue pd.s{, pattern} - results[0] = sveHelp::svePtrue(metadata, VL_bits); + results_[0] = svePtrue(metadata_, VL_bits); break; } case Opcode::AArch64_PUNPKHI_PP: { // punpkhi pd.h, pn.b - results[0] = sveHelp::svePunpk(operands, VL_bits, true); + results_[0] = svePunpk(sourceValues_, VL_bits, true); break; } case Opcode::AArch64_PUNPKLO_PP: { // punpklo pd.h, pn.b - results[0] = sveHelp::svePunpk(operands, VL_bits, false); + results_[0] = svePunpk(sourceValues_, VL_bits, false); break; } case Opcode::AArch64_RBITWr: { // rbit wd, wn - results[0] = {bitmanipHelp::rbit(operands, metadata), 8}; + results_[0] = {rbit(sourceValues_, metadata_), 8}; break; } case Opcode::AArch64_RBITXr: { // rbit xd, xn - results[0] = bitmanipHelp::rbit(operands, metadata); + results_[0] = rbit(sourceValues_, metadata_); break; } case Opcode::AArch64_RDVLI_XI: { // rdvl xd, #imm - int8_t imm = static_cast(metadata.operands[1].imm); - results[0] = (uint64_t)(imm * (VL_bits / 8)); + int8_t imm = static_cast(metadata_.operands[1].imm); + results_[0] = (uint64_t)(imm * (VL_bits / 8)); break; } case Opcode::AArch64_RET: { // ret {xr} branchTaken_ = true; - branchAddress_ = operands[0].get(); + branchAddress_ = sourceValues_[0].get(); break; } case Opcode::AArch64_REV16v16i8: { // rev16 Vd.16b, Vn.16b - results[0] = neonHelp::vecRev(operands); + results_[0] = vecRev(sourceValues_); break; } case Opcode::AArch64_REV16v8i8: { // rev16 Vd.8b, Vn.8b - results[0] = neonHelp::vecRev(operands); + results_[0] = vecRev(sourceValues_); break; } case Opcode::AArch64_REV32v16i8: { // rev32 Vd.16b, Vn.16b - results[0] = neonHelp::vecRev(operands); + results_[0] = vecRev(sourceValues_); break; } case Opcode::AArch64_REV32v4i16: { // rev32 Vd.4h, Vn.4h - results[0] = neonHelp::vecRev(operands); + results_[0] = vecRev(sourceValues_); break; } case Opcode::AArch64_REV32v8i16: { // rev32 Vd.8h, Vn.8h - results[0] = neonHelp::vecRev(operands); + results_[0] = vecRev(sourceValues_); break; } case Opcode::AArch64_REV32v8i8: { // rev32 Vd.8b, Vn.8b - results[0] = neonHelp::vecRev(operands); + results_[0] = vecRev(sourceValues_); break; } case Opcode::AArch64_REV64v16i8: { // rev64 Vd.16b, Vn.16b - results[0] = neonHelp::vecRev(operands); + results_[0] = vecRev(sourceValues_); break; } case Opcode::AArch64_REV64v2i32: { // rev64 Vd.2s, Vn.2s - results[0] = neonHelp::vecRev(operands); + results_[0] = vecRev(sourceValues_); break; } case Opcode::AArch64_REV64v4i16: { // rev64 Vd.4h, Vn.4h - results[0] = 
neonHelp::vecRev(operands); + results_[0] = vecRev(sourceValues_); break; } case Opcode::AArch64_REV64v4i32: { // rev64 Vd.4s, Vn.4s - results[0] = neonHelp::vecRev(operands); + results_[0] = vecRev(sourceValues_); break; } case Opcode::AArch64_REV64v8i16: { // rev64 Vd.8h, Vn.8h - results[0] = neonHelp::vecRev(operands); + results_[0] = vecRev(sourceValues_); break; } case Opcode::AArch64_REV64v8i8: { // rev64 Vd.8b Vn.8b - results[0] = neonHelp::vecRev(operands); + results_[0] = vecRev(sourceValues_); break; } case Opcode::AArch64_REVXr: { // rev xd, xn - results[0] = bitmanipHelp::rev(operands); + results_[0] = rev(sourceValues_); break; } case Opcode::AArch64_REV_PP_B: { // rev pd.b, pn.b - results[0] = sveHelp::sveRev_predicates(operands, VL_bits); + results_[0] = sveRev_predicates(sourceValues_, VL_bits); break; } case Opcode::AArch64_REV_PP_D: { // rev pd.d, pn.d - results[0] = sveHelp::sveRev_predicates(operands, VL_bits); + results_[0] = sveRev_predicates(sourceValues_, VL_bits); break; } case Opcode::AArch64_REV_PP_H: { // rev pd.h, pn.h - results[0] = sveHelp::sveRev_predicates(operands, VL_bits); + results_[0] = sveRev_predicates(sourceValues_, VL_bits); break; } case Opcode::AArch64_REV_PP_S: { // rev pd.s, pn.s - results[0] = sveHelp::sveRev_predicates(operands, VL_bits); + results_[0] = sveRev_predicates(sourceValues_, VL_bits); break; } case Opcode::AArch64_REV_ZZ_B: { // rev zd.b, zn.b - results[0] = sveHelp::sveRev_vecs(operands, VL_bits); + results_[0] = sveRev_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_REV_ZZ_D: { // rev zd.d, zn.d - results[0] = sveHelp::sveRev_vecs(operands, VL_bits); + results_[0] = sveRev_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_REV_ZZ_H: { // rev zd.h, zn.h - results[0] = sveHelp::sveRev_vecs(operands, VL_bits); + results_[0] = sveRev_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_REV_ZZ_S: { // rev zd.s, zn.s - results[0] = sveHelp::sveRev_vecs(operands, VL_bits); + results_[0] = sveRev_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_RORVWr: { // rorv wd, wn, wm - results[0] = {logicalHelp::rorv_3ops(operands), 8}; + results_[0] = {rorv_3ops(sourceValues_), 8}; break; } case Opcode::AArch64_RORVXr: { // rorv xd, xn, xm - results[0] = logicalHelp::rorv_3ops(operands); + results_[0] = rorv_3ops(sourceValues_); break; } case Opcode::AArch64_SBCWr: { // sbc wd, wn, wm - results[0] = {arithmeticHelp::sbc(operands), 8}; + results_[0] = {sbc(sourceValues_), 8}; break; } case Opcode::AArch64_SBCXr: { // sbc xd, xn, xm - results[0] = arithmeticHelp::sbc(operands); + results_[0] = sbc(sourceValues_); break; } case Opcode::AArch64_SBFMWri: { // sbfm wd, wn, #immr, #imms - results[0] = { - bitmanipHelp::bfm_2imms(operands, metadata, true, true), - 8}; + results_[0] = { + bfm_2imms(sourceValues_, metadata_, true, true), 8}; break; } case Opcode::AArch64_SBFMXri: { // sbfm xd, xn, #immr, #imms - results[0] = - bitmanipHelp::bfm_2imms(operands, metadata, true, true); + results_[0] = bfm_2imms(sourceValues_, metadata_, true, true); break; } case Opcode::AArch64_SCVTFSWSri: { // scvtf sd, wn, #fbits - results[0] = - floatHelp::scvtf_FixedPoint(operands, metadata); + results_[0] = + scvtf_FixedPoint(sourceValues_, metadata_); break; } case Opcode::AArch64_SCVTFSXDri: { // scvtf dd, xn, #fbits - results[0] = - floatHelp::scvtf_FixedPoint(operands, metadata); + results_[0] = + scvtf_FixedPoint(sourceValues_, metadata_); break; } case Opcode::AArch64_SCVTFSXSri: { // scvtf sd, xn, #fbits - results[0] = - 
floatHelp::scvtf_FixedPoint(operands, metadata); + results_[0] = + scvtf_FixedPoint(sourceValues_, metadata_); break; } case Opcode::AArch64_SCVTFUWDri: { // scvtf dd, wn - results[0] = {static_cast(operands[0].get()), 256}; + results_[0] = {static_cast(sourceValues_[0].get()), + 256}; break; } case Opcode::AArch64_SCVTFUWSri: { // scvtf sd, wn - results[0] = {static_cast(operands[0].get()), 256}; + results_[0] = {static_cast(sourceValues_[0].get()), + 256}; break; } case Opcode::AArch64_SCVTFUXDri: { // scvtf dd, xn - results[0] = {static_cast(operands[0].get()), 256}; + results_[0] = {static_cast(sourceValues_[0].get()), + 256}; break; } case Opcode::AArch64_SCVTFUXSri: { // scvtf sd, xn - results[0] = {static_cast(operands[0].get()), 256}; + results_[0] = {static_cast(sourceValues_[0].get()), + 256}; break; } case Opcode::AArch64_SCVTF_ZPmZ_DtoD: { // scvtf zd.d, pg/m, zn.d - results[0] = - sveHelp::sveFcvtPredicated(operands, VL_bits); + results_[0] = + sveFcvtPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_SCVTF_ZPmZ_DtoS: { // scvtf zd.s, pg/m, zn.d - results[0] = - sveHelp::sveFcvtPredicated(operands, VL_bits); + results_[0] = sveFcvtPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_SCVTF_ZPmZ_StoD: { // scvtf zd.d, pg/m, zn.s - results[0] = - sveHelp::sveFcvtPredicated(operands, VL_bits); + results_[0] = + sveFcvtPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_SCVTF_ZPmZ_StoS: { // scvtf zd.s, pg/m, zn.s - results[0] = - sveHelp::sveFcvtPredicated(operands, VL_bits); + results_[0] = sveFcvtPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_SCVTFv1i32: { // scvtf sd, sn - results[0] = {static_cast(operands[0].get()), 256}; + results_[0] = {static_cast(sourceValues_[0].get()), + 256}; break; } case Opcode::AArch64_SCVTFv1i64: { // scvtf dd, dn - results[0] = {static_cast(operands[0].get()), 256}; + results_[0] = {static_cast(sourceValues_[0].get()), + 256}; break; } case Opcode::AArch64_SCVTFv2f32: { // scvtf vd.2s, vn.2s - results[0] = neonHelp::vecScvtf_2vecs( - operands, [](int32_t x) -> float { return static_cast(x); }); + results_[0] = vecScvtf_2vecs( + sourceValues_, + [](int32_t x) -> float { return static_cast(x); }); break; } case Opcode::AArch64_SCVTFv2f64: { // scvtf vd.2d, vn.2d - results[0] = neonHelp::vecScvtf_2vecs( - operands, + results_[0] = vecScvtf_2vecs( + sourceValues_, [](int64_t x) -> double { return static_cast(x); }); break; } case Opcode::AArch64_SCVTFv4f32: { // scvtf vd.4s, vn.4s - results[0] = neonHelp::vecScvtf_2vecs( - operands, [](int32_t x) -> float { return static_cast(x); }); + results_[0] = vecScvtf_2vecs( + sourceValues_, + [](int32_t x) -> float { return static_cast(x); }); break; } case Opcode::AArch64_SDIVWr: { // sdiv wd, wn, wm - results[0] = {divideHelp::div_3ops(operands), 8}; + results_[0] = {div_3ops(sourceValues_), 8}; break; } case Opcode::AArch64_SDIVXr: { // sdiv xd, xn, xm - results[0] = {divideHelp::div_3ops(operands), 8}; + results_[0] = {div_3ops(sourceValues_), 8}; break; } case Opcode::AArch64_SEL_ZPZZ_D: { // sel zd.d, pg, zn.d, zm.d - results[0] = sveHelp::sveSel_zpzz(operands, VL_bits); + results_[0] = sveSel_zpzz(sourceValues_, VL_bits); break; } case Opcode::AArch64_SEL_ZPZZ_S: { // sel zd.s, pg, zn.s, zm.s - results[0] = sveHelp::sveSel_zpzz(operands, VL_bits); + results_[0] = sveSel_zpzz(sourceValues_, VL_bits); break; } case Opcode::AArch64_SHLd: { // shl dd, dn #imm - results[0] = - neonHelp::vecShlShift_vecImm(operands, metadata); + results_[0] = 
vecShlShift_vecImm(sourceValues_, metadata_); break; } case Opcode::AArch64_SHLv4i32_shift: { // shl vd.4s, vn.4s, #imm - results[0] = - neonHelp::vecShlShift_vecImm(operands, metadata); + results_[0] = vecShlShift_vecImm(sourceValues_, metadata_); break; } case Opcode::AArch64_SMADDLrrr: { // smaddl xd, wn, wm, xa - results[0] = multiplyHelp::maddl_4ops(operands); + results_[0] = maddl_4ops(sourceValues_); break; } case Opcode::AArch64_SMAX_ZI_S: { // smax zdn.s, zdn.s, #imm - results[0] = - sveHelp::sveMax_vecImm(operands, metadata, VL_bits); + results_[0] = sveMax_vecImm(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_SMAX_ZPmZ_S: { // smax zd.s, pg/m, zn.s, zm.s - results[0] = sveHelp::sveMaxPredicated_vecs(operands, VL_bits); + results_[0] = sveMaxPredicated_vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_SMAXv4i32: { // smax vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecLogicOp_3vecs( - operands, + results_[0] = vecLogicOp_3vecs( + sourceValues_, [](int32_t x, int32_t y) -> int32_t { return std::max(x, y); }); break; } case Opcode::AArch64_SMINV_VPZ_S: { // sminv sd, pg, zn.s - results[0] = sveHelp::sveSminv(operands, VL_bits); + results_[0] = sveSminv(sourceValues_, VL_bits); break; } case Opcode::AArch64_SMINVv4i32v: { // sminv sd, vn.4s - results[0] = neonHelp::vecMinv_2ops(operands); + results_[0] = vecMinv_2ops(sourceValues_); break; } case Opcode::AArch64_SMIN_ZPmZ_S: { // smin zd.s, pg/m, zn.s, zm.s - results[0] = sveHelp::sveLogicOpPredicated_3vecs( - operands, VL_bits, + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, [](int32_t x, int32_t y) -> int32_t { return std::min(x, y); }); break; } case Opcode::AArch64_SMINv4i32: { // smin vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecLogicOp_3vecs( - operands, + results_[0] = vecLogicOp_3vecs( + sourceValues_, [](int32_t x, int32_t y) -> int32_t { return std::min(x, y); }); break; } + case Opcode::AArch64_SMOPA_MPPZZ_D: { // smopa zada.d, pn/m, pm/m, zn.h, + // zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + int64_t outRow[32] = {0}; + const int64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_SMOPA_MPPZZ_S: { // smopa zada.s, pn/m, pm/m, zn.b, + // zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + 
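+        // Worked example, assuming VL = 512 bits: tileDim = 16, so the ZA
+        // tile holds 16x16 int32 accumulators and each element gains a 4-way
+        // int8 dot product, e.g.
+        //   za[2][5] += zn[8]*zm[20] + zn[9]*zm[21] + zn[10]*zm[22] + zn[11]*zm[23]
+        // with a term included only when both bytes are active in pn and pm.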
const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + int32_t outRow[64] = {0}; + const int32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_SMOPS_MPPZZ_D: { // smops zada.d, pn/m, pm/m, zn.h, + // zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + int64_t outRow[32] = {0}; + const int64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_SMOPS_MPPZZ_S: { // smops zada.s, pn/m, pm/m, zn.b, + // zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + int32_t outRow[64] = {0}; + const int32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const 
uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } case Opcode::AArch64_SMSUBLrrr: { // smsubl xd, wn, wm, xa - results[0] = arithmeticHelp::msubl_4ops(operands); + results_[0] = msubl_4ops(sourceValues_); break; } case Opcode::AArch64_SMULH_ZPmZ_B: { // smulh zdn.b, pg/m, zdn.b, zm.b - results[0] = - sveHelp::sveMulhPredicated(operands, VL_bits); + results_[0] = + sveMulhPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_SMULH_ZPmZ_H: { // smulh zdn.h, pg/m, zdn.h, zm.h - results[0] = - sveHelp::sveMulhPredicated(operands, VL_bits); + results_[0] = + sveMulhPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_SMULH_ZPmZ_S: { // smulh zdn.s, pg/m, zdn.s, zm.s - results[0] = - sveHelp::sveMulhPredicated(operands, VL_bits); + results_[0] = + sveMulhPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_SMULHrr: { // smulh xd, xn, xm // TODO: signed - results[0] = AuxFunc::mulhi(operands[0].get(), - operands[1].get()); + results_[0] = mulhi(sourceValues_[0].get(), + sourceValues_[1].get()); break; } case Opcode::AArch64_SSHLLv2i32_shift: { // sshll vd.2d, vn.2s, #imm - results[0] = neonHelp::vecShllShift_vecImm( - operands, metadata, false); + results_[0] = vecShllShift_vecImm( + sourceValues_, metadata_, false); break; } case Opcode::AArch64_SSHLLv4i32_shift: { // sshll2 vd.2d, vn.4s, #imm - results[0] = neonHelp::vecShllShift_vecImm( - operands, metadata, true); + results_[0] = vecShllShift_vecImm(sourceValues_, + metadata_, true); + break; + } + case Opcode::AArch64_SHRNv8i8_shift: { // shrn vd.8b, vn.8h, #imm + results_[0] = + vecShrnShift_imm(sourceValues_, metadata_); break; } case Opcode::AArch64_SSHRv4i32_shift: { // sshr vd.4s, vn.4s, #imm - results[0] = neonHelp::vecSshrShift_imm(operands, metadata); + results_[0] = vecSshrShift_imm(sourceValues_, metadata_); break; } - case Opcode::AArch64_SST1B_D_REAL: { // st1b {zd.d}, pg, [xn, zm.d] + case Opcode::AArch64_SST1B_D: { // st1b {zd.d}, pg, [xn, zm.d] // STORE - const uint64_t* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); + const uint64_t* d = sourceValues_[0].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 64; uint16_t index = 0; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (p[i / 8] & shifted_active) { - memoryData[index] = static_cast(d[i]); + memoryData_[index] = static_cast(d[i]); index++; } } break; } - case Opcode::AArch64_SST1D_REAL: { // st1d {zt.d}, pg, [xn, zm.d] + case Opcode::AArch64_SST1D: { // st1d {zt.d}, pg, [xn, zm.d] // STORE - const uint64_t* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); + const uint64_t* d = sourceValues_[0].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 64; uint16_t index = 0; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (p[i / 8] & shifted_active) { - memoryData[index] = d[i]; + memoryData_[index] = d[i]; index++; } } @@ -4071,113 +5561,353 @@ void Instruction::execute() { } case Opcode::AArch64_SST1D_IMM: { // st1d {zd.d}, pg, [zn.d{, #imm}] // STORE - const uint64_t* t = 
operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); + const uint64_t* t = sourceValues_[0].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 64; uint16_t index = 0; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (p[i / 8] & shifted_active) { - memoryData[index] = t[i]; + memoryData_[index] = t[i]; index++; } } break; } - case Opcode::AArch64_SST1D_SCALED_SCALED_REAL: { // st1d {zt.d}, pg, [xn, - // zm.d, lsl # - // 3] + case Opcode::AArch64_SST1D_SCALED: { // st1d {zt.d}, pg, [xn, + // zm.d, lsl #3] // STORE - const uint64_t* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); + const uint64_t* d = sourceValues_[0].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 64; uint16_t index = 0; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (p[i / 8] & shifted_active) { - memoryData[index] = d[i]; + memoryData_[index] = d[i]; + index++; + } + } + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_B: { // st1b {zath.b[ws, #imm]}, pg, + // [{, xm}] + // SME, STORE + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 8; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + + const uint8_t* tileSlice = + sourceValues_[sliceNum].getAsVector(); + memoryData_ = sve_merge_store_data(tileSlice, pg, VL_bits); + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_D: { // st1d {zath.d[ws, #imm]}, pg, + // [{, xm, lsl #3}] + // SME, STORE + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 64; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + + const uint64_t* tileSlice = + sourceValues_[sliceNum].getAsVector(); + memoryData_ = sve_merge_store_data(tileSlice, pg, VL_bits); + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_H: { // st1h {zath.h[ws, #imm]}, pg, + // [{, xm, lsl #1}] + // SME, STORE + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 16; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + + const uint16_t* tileSlice = + sourceValues_[sliceNum].getAsVector(); + memoryData_ = sve_merge_store_data(tileSlice, pg, VL_bits); + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_Q: { // st1q {zath.q[ws]}, pg, + // [{, xm, lsl #4}] + // SME, STORE + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 128; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = ws % partition_num; + + // Using uint64_t as no 128-bit type + const uint64_t* tileSlice = + sourceValues_[sliceNum].getAsVector(); + + // Need 
to combine active adjacent elements into RegisterValues and + // place into each memoryData_ index. + int index = 0; + std::vector memData; + for (uint16_t i = 0; i < partition_num; i++) { + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((i % 4) * 16); + if (pg[i / 4] & shifted_active) { + // As using uint64_t need to push_back 2 elements + memData.push_back(tileSlice[2 * i]); + memData.push_back(tileSlice[2 * i + 1]); + } else if (memData.size() > 0) { + // Predicate false, save current data + memoryData_[index] = RegisterValue( + (char*)memData.data(), memData.size() * sizeof(uint64_t)); + index++; + memData.clear(); + } + } + // Check if final data needs putting into memoryData_ + if (memData.size() > 0) { + memoryData_[index] = RegisterValue((char*)memData.data(), + memData.size() * sizeof(uint64_t)); + } + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_S: { // st1w {zath.s[ws, #imm]}, pg, + // [{, xm, lsl #2}] + // SME, STORE + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 32; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + + const uint32_t* tileSlice = + sourceValues_[sliceNum].getAsVector(); + memoryData_ = sve_merge_store_data(tileSlice, pg, VL_bits); + break; + } + case Opcode::AArch64_ST1_MXIPXX_V_B: { // st1b {zatv.b[ws, #imm]}, pg, + // [{, xm}] + // SME, STORE + // Not in right context mode. Raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 8; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + + std::vector memData; + uint16_t index = 0; + + for (uint16_t x = 0; x < partition_num; x++) { + uint64_t shifted_active = 1ull << (x % 64); + if (pg[x / 64] & shifted_active) { + memData.push_back( + sourceValues_[x].getAsVector()[sliceNum]); + } else if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size()); + index++; + memData.clear(); + } + } + + if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size()); + } + break; + } + case Opcode::AArch64_ST1_MXIPXX_V_D: { // st1d {zatv.d[ws, #imm]}, pg, + // [{, xm, lsl #3}] + // SME, STORE + // Not in right context mode. 
Raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 64; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + + std::vector memData; + uint16_t index = 0; + + for (uint16_t x = 0; x < partition_num; x++) { + uint64_t shifted_active = 1ull << ((x % 8) * 8); + if (pg[x / 8] & shifted_active) { + memData.push_back( + sourceValues_[x].getAsVector()[sliceNum]); + } else if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size() * 8); + index++; + memData.clear(); + } + } + + if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size() * 8); + } + break; + } + case Opcode::AArch64_ST1_MXIPXX_V_H: { // st1h {zatv.h[ws, #imm]}, pg, + // [{, xm, LSL #1}] + // SME, STORE + // Not in right context mode. Raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 16; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + + std::vector memData; + uint16_t index = 0; + + for (uint16_t x = 0; x < partition_num; x++) { + uint64_t shifted_active = 1ull << ((x % 32) * 2); + if (pg[x / 32] & shifted_active) { + memData.push_back( + sourceValues_[x].getAsVector()[sliceNum]); + } else if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size() * 2); index++; + memData.clear(); } } + + if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size() * 2); + } break; } - case Opcode::AArch64_ST1_MXIPXX_H_S: { // st1w {zath.s[ws, #imm]}, pg/z, - // [{, xm, LSL #2}] + case Opcode::AArch64_ST1_MXIPXX_V_Q: { // st1h {zatv.q[ws]}, pg, + // [{, xm, LSL #4}] // SME, STORE - if (!ZAenabled) { - // Not in right context mode. Raise exception - return ZAdisabled(); - } - const uint16_t partition_num = VL_bits / 32; - const uint32_t ws = operands[partition_num].get(); + // Not in right context mode. Raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 128; + const uint32_t ws = sourceValues_[partition_num].get(); const uint64_t* pg = - operands[partition_num + 1].getAsVector(); + sourceValues_[partition_num + 1].getAsVector(); - const uint32_t sliceNum = - (ws + metadata.operands[0].sme_index.disp) % partition_num; + const uint32_t sliceNum = ws % partition_num; - const uint32_t* tileSlice = operands[sliceNum].getAsVector(); - memoryData = - sveHelp::sve_merge_store_data(tileSlice, pg, VL_bits); + // Need to combine active adjacent elements into RegisterValues and + // place into each memoryData_ index. 
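+        // Each contiguous run of predicate-active elements is packed into a
+        // single RegisterValue occupying one memoryData_ entry; an inactive
+        // element terminates the current run.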
+ std::vector memData; + uint16_t index = 0; + for (uint16_t x = 0; x < partition_num; x++) { + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((x % 4) * 16); + if (pg[x / 4] & shifted_active) { + // As using uint64_t need to push_back 2 elements + memData.push_back( + sourceValues_[x].getAsVector()[2 * sliceNum]); + memData.push_back( + sourceValues_[x].getAsVector()[2 * sliceNum + 1]); + } else if (memData.size() > 0) { + // Predicate false, save current data + memoryData_[index] = RegisterValue( + (char*)memData.data(), memData.size() * sizeof(uint64_t)); + index++; + memData.clear(); + } + } + // Check if final data needs putting into memoryData_ + if (memData.size() > 0) { + memoryData_[index] = RegisterValue((char*)memData.data(), + memData.size() * sizeof(uint64_t)); + } break; } - case Opcode::AArch64_ST1_MXIPXX_V_S: { // st1w {zatv.s[ws, #imm]}, pg/z, + case Opcode::AArch64_ST1_MXIPXX_V_S: { // st1w {zatv.s[ws, #imm]}, pg, // [{, xm, LSL #2}] // SME, STORE - if (!ZAenabled) { - // Not in right context mode. Raise exception - return ZAdisabled(); - } + // Not in right context mode. Raise exception + if (!ZAenabled) return ZAdisabled(); + const uint16_t partition_num = VL_bits / 32; - const uint32_t ws = operands[partition_num].get(); + const uint32_t ws = sourceValues_[partition_num].get(); const uint64_t* pg = - operands[partition_num + 1].getAsVector(); + sourceValues_[partition_num + 1].getAsVector(); const uint32_t sliceNum = - (ws + metadata.operands[0].sme_index.disp) % partition_num; - - uint16_t num_vec_elems = (VL_bits / 32); - uint16_t prdcns_per_preg = (64 / 4); + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; - std::array mdata; - uint16_t md_size = 0; + std::vector memData; uint16_t index = 0; - for (uint16_t x = 0; x < num_vec_elems; x++) { - uint64_t predicate = pg[x / prdcns_per_preg]; - uint64_t bit_mask = 1ull << ((x % prdcns_per_preg) * 4); - uint64_t is_elem_active = predicate & bit_mask; - if (is_elem_active) { - mdata[md_size] = operands[x].getAsVector()[sliceNum]; - md_size++; - } else if (md_size && !is_elem_active) { - const char* data = (char*)mdata.data(); - memoryData[index] = RegisterValue(data, md_size * 4); - md_size = 0; + for (uint16_t x = 0; x < partition_num; x++) { + uint64_t shifted_active = 1ull << ((x % 16) * 4); + if (pg[x / 16] & shifted_active) { + memData.push_back( + sourceValues_[x].getAsVector()[sliceNum]); + } else if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size() * 4); + index++; + memData.clear(); } } - if (md_size) { - const char* data = (char*)mdata.data(); - memoryData[index] = RegisterValue(data, md_size * 4); + if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size() * 4); } - break; } case Opcode::AArch64_SST1W_D_IMM: { // st1w {zt.d}, pg, [zn.d{, #imm}] // STORE - const uint64_t* t = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); + const uint64_t* t = sourceValues_[0].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 64; uint16_t index = 0; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (p[i / 8] & shifted_active) { - memoryData[index] = t[i]; + memoryData_[index] = t[i]; index++; } } @@ -4185,15 +5915,15 @@ void Instruction::execute() { } case Opcode::AArch64_SST1W_IMM: { // st1w {zt.s}, pg, [zn.s{, #imm}] // STORE - const 
uint32_t* t = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); + const uint32_t* t = sourceValues_[0].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 32; uint16_t index = 0; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 16) * 4); if (p[i / 16] & shifted_active) { - memoryData[index] = t[i]; + memoryData_[index] = t[i]; index++; } } @@ -4201,172 +5931,305 @@ void Instruction::execute() { } case Opcode::AArch64_ST1B: { // st1b {zt.b}, pg, [xn, xm] // STORE - const uint8_t* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); + const uint8_t* d = sourceValues_[0].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); + + memoryData_ = sve_merge_store_data(d, p, VL_bits); + break; + } + case Opcode::AArch64_ST1B_IMM: { // st1b {zt.b}, pg, [xn{, #imm, mul vl}] + // STORE + const uint8_t* d = sourceValues_[0].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); - memoryData = sveHelp::sve_merge_store_data(d, p, VL_bits); + memoryData_ = sve_merge_store_data(d, p, VL_bits); break; } case Opcode::AArch64_ST1D: { // st1d {zt.d}, pg, [xn, xm, lsl #3] // STORE - const uint64_t* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); + const uint64_t* d = sourceValues_[0].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); - memoryData = sveHelp::sve_merge_store_data(d, p, VL_bits); + memoryData_ = sve_merge_store_data(d, p, VL_bits); break; } case Opcode::AArch64_ST1D_IMM: { // st1d {zt.d}, pg, [xn{, #imm, mul vl}] // STORE - const uint64_t* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); + const uint64_t* d = sourceValues_[0].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); - memoryData = sveHelp::sve_merge_store_data(d, p, VL_bits); + memoryData_ = sve_merge_store_data(d, p, VL_bits); + break; + } + case Opcode::AArch64_ST1Fourv16b: { // st1 {vt.16b, vt2.16b, vt3.16b, + // vt4.16b}, [xn|sp] + // STORE + for (int i = 0; i < 4; i++) { + memoryData_[i] = + RegisterValue((char*)sourceValues_[i].getAsVector(), + 16 * sizeof(uint8_t)); + } + break; + } + case Opcode::AArch64_ST1Fourv16b_POST: { // st1 {vt.16b, vt2.16b, + // vt3.16b, vt4.16b}, [xn|sp], + // <#imm|xm> + // STORE + for (int i = 0; i < 4; i++) { + memoryData_[i] = + RegisterValue((char*)sourceValues_[i].getAsVector(), + 16 * sizeof(uint8_t)); + } + // if #imm post-index, value can only be 64 + const uint64_t postIndex = + (metadata_.operands[5].type == AARCH64_OP_REG) + ? sourceValues_[5].get() + : 64; + results_[0] = sourceValues_[4].get() + postIndex; + break; + } + case Opcode::AArch64_ST1Fourv2d: { // st1 {vt.2d, vt2.2d, vt3.2d, + // vt4.2d}, [xn|sp] + // STORE + for (int i = 0; i < 4; i++) { + memoryData_[i] = + RegisterValue((char*)sourceValues_[i].getAsVector(), + 2 * sizeof(uint64_t)); + } + break; + } + case Opcode::AArch64_ST1Fourv2d_POST: { // st1 {vt.2d, vt2.2d, vt3.2d, + // vt4.2d}, [xn|sp], <#imm|xm> + // STORE + for (int i = 0; i < 4; i++) { + memoryData_[i] = + RegisterValue((char*)sourceValues_[i].getAsVector(), + 2 * sizeof(uint64_t)); + } + // if #imm post-index, value can only be 64 + const uint64_t postIndex = + (metadata_.operands[5].type == AARCH64_OP_REG) + ? 
sourceValues_[5].get() + : 64; + results_[0] = sourceValues_[4].get() + postIndex; break; } case Opcode::AArch64_ST1Fourv2s_POST: { // st1 {vt.2s, vt2.2s, vt3.2s, // vt4.2s}, [xn|sp], <#imm|xm> // STORE for (int i = 0; i < 4; i++) { - memoryData[i] = RegisterValue( - (char*)operands[i].getAsVector(), 2 * sizeof(uint32_t)); + memoryData_[i] = + RegisterValue((char*)sourceValues_[i].getAsVector(), + 2 * sizeof(uint32_t)); } // if #imm post-index, value can only be 32 const uint64_t postIndex = - (metadata.operandCount == 6) ? operands[5].get() : 32; - results[0] = operands[4].get() + postIndex; + (metadata_.operands[5].type == AARCH64_OP_REG) + ? sourceValues_[5].get() + : 32; + results_[0] = sourceValues_[4].get() + postIndex; + break; + } + case Opcode::AArch64_ST1Fourv4s: { // st1 {vt.4s, vt2.4s, vt3.4s, + // vt4.4s}, [xn|sp] + // STORE + for (int i = 0; i < 4; i++) { + memoryData_[i] = + RegisterValue((char*)sourceValues_[i].getAsVector(), + 4 * sizeof(uint32_t)); + } break; } case Opcode::AArch64_ST1Fourv4s_POST: { // st1 {vt.4s, vt2.4s, vt3.4s, // vt4.4s}, [xn|sp], <#imm|xm> // STORE for (int i = 0; i < 4; i++) { - memoryData[i] = RegisterValue( - (char*)operands[i].getAsVector(), 4 * sizeof(uint32_t)); + memoryData_[i] = + RegisterValue((char*)sourceValues_[i].getAsVector(), + 4 * sizeof(uint32_t)); } // if #imm post-index, value can only be 64 const uint64_t postIndex = - (metadata.operandCount == 6) ? operands[5].get() : 64; - results[0] = operands[4].get() + postIndex; + (metadata_.operands[5].type == AARCH64_OP_REG) + ? sourceValues_[5].get() + : 64; + results_[0] = sourceValues_[4].get() + postIndex; break; } case Opcode::AArch64_ST1Twov16b: { // st1 {vt.16b, vt2.16b}, [xn|sp] // STORE - const uint8_t* t = operands[0].getAsVector(); - const uint8_t* t2 = operands[1].getAsVector(); - memoryData[0] = RegisterValue((char*)t, 16 * sizeof(uint8_t)); - memoryData[1] = RegisterValue((char*)t2, 16 * sizeof(uint8_t)); + const uint8_t* t = sourceValues_[0].getAsVector(); + const uint8_t* t2 = sourceValues_[1].getAsVector(); + memoryData_[0] = RegisterValue((char*)t, 16 * sizeof(uint8_t)); + memoryData_[1] = RegisterValue((char*)t2, 16 * sizeof(uint8_t)); + break; + } + case Opcode::AArch64_ST1Twov16b_POST: { // st1 {vt.16b, vt2.16b}, + // [xn|sp], <#imm|xm> + // STORE + const uint8_t* t = sourceValues_[0].getAsVector(); + const uint8_t* t2 = sourceValues_[1].getAsVector(); + memoryData_[0] = RegisterValue((char*)t, 16 * sizeof(uint8_t)); + memoryData_[1] = RegisterValue((char*)t2, 16 * sizeof(uint8_t)); + + // if #imm post-index, value can only be 32 + const uint64_t postIndex = + (metadata_.operands[3].type == AARCH64_OP_REG) + ? 
sourceValues_[3].get() + : 32; + results_[0] = sourceValues_[2].get() + postIndex; + break; + } + case Opcode::AArch64_ST1Twov2d: { // st1 {vt.2d, vt2.2d}, [xn|sp] + // STORE + const uint64_t* t = sourceValues_[0].getAsVector(); + const uint64_t* t2 = sourceValues_[1].getAsVector(); + memoryData_[0] = RegisterValue((char*)t, 2 * sizeof(uint64_t)); + memoryData_[1] = RegisterValue((char*)t2, 2 * sizeof(uint64_t)); + break; + } + case Opcode::AArch64_ST1Twov2d_POST: { // st1 {vt.2d, vt2.2d}, + // [xn|sp], <#imm|xm> + // STORE + const uint64_t* t = sourceValues_[0].getAsVector(); + const uint64_t* t2 = sourceValues_[1].getAsVector(); + memoryData_[0] = RegisterValue((char*)t, 2 * sizeof(uint64_t)); + memoryData_[1] = RegisterValue((char*)t2, 2 * sizeof(uint64_t)); + + // if #imm post-index, value can only be 32 + const uint64_t postIndex = + (metadata_.operands[3].type == AARCH64_OP_REG) + ? sourceValues_[3].get() + : 32; + results_[0] = sourceValues_[2].get() + postIndex; break; } case Opcode::AArch64_ST1Twov4s: { // st1 {vt.4s, vt2.4s}, [xn|sp] // STORE - const uint32_t* t = operands[0].getAsVector(); - const uint32_t* t2 = operands[1].getAsVector(); - memoryData[0] = RegisterValue((char*)t, 4 * sizeof(uint32_t)); - memoryData[1] = RegisterValue((char*)t2, 4 * sizeof(uint32_t)); + const uint32_t* t = sourceValues_[0].getAsVector(); + const uint32_t* t2 = sourceValues_[1].getAsVector(); + memoryData_[0] = RegisterValue((char*)t, 4 * sizeof(uint32_t)); + memoryData_[1] = RegisterValue((char*)t2, 4 * sizeof(uint32_t)); + break; + } + case Opcode::AArch64_ST1Twov4s_POST: { // st1 {vt.4s, vt2.4s}, + // [xn|sp], <#imm|xm> + // STORE + const uint32_t* t = sourceValues_[0].getAsVector(); + const uint32_t* t2 = sourceValues_[1].getAsVector(); + memoryData_[0] = RegisterValue((char*)t, 4 * sizeof(uint32_t)); + memoryData_[1] = RegisterValue((char*)t2, 4 * sizeof(uint32_t)); + + // if #imm post-index, value can only be 32 + const uint64_t postIndex = + (metadata_.operands[3].type == AARCH64_OP_REG) + ? 
sourceValues_[3].get() + : 32; + results_[0] = sourceValues_[2].get() + postIndex; break; } case Opcode::AArch64_ST1W: { // st1w {zt.s}, pg, [xn, xm, lsl #2] // STORE - const uint32_t* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); + const uint32_t* d = sourceValues_[0].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); - memoryData = sveHelp::sve_merge_store_data(d, p, VL_bits); + memoryData_ = sve_merge_store_data(d, p, VL_bits); break; } case Opcode::AArch64_ST1W_D: { // st1w {zt.d}, pg, [xn, xm, lsl #2] // STORE - const uint64_t* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); + const uint64_t* d = sourceValues_[0].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); - memoryData = sveHelp::sve_merge_store_data(d, p, VL_bits); + memoryData_ = sve_merge_store_data(d, p, VL_bits); break; } case Opcode::AArch64_ST1W_IMM: { // st1w {zt.s}, pg, [xn{, #imm, mul vl}] // STORE - const uint32_t* d = operands[0].getAsVector(); - const uint64_t* p = operands[1].getAsVector(); + const uint32_t* d = sourceValues_[0].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); - memoryData = sveHelp::sve_merge_store_data(d, p, VL_bits); + memoryData_ = sve_merge_store_data(d, p, VL_bits); break; } case Opcode::AArch64_ST1i16: { // st1 {vt.h}[index], [xn] // STORE - const uint16_t* t = operands[0].getAsVector(); - memoryData[0] = t[metadata.operands[0].vector_index]; + const uint16_t* t = sourceValues_[0].getAsVector(); + memoryData_[0] = t[metadata_.operands[0].vector_index]; break; } - case Opcode::AArch64_ST1i16_POST: { // st1 {vt.h}[index], [xn], xm - // st1 {vt.h}[index], [xn], #2 + case Opcode::AArch64_ST1i16_POST: { // st1 {vt.h}[index], [xn], // STORE - const uint16_t* t = operands[0].getAsVector(); - memoryData[0] = t[metadata.operands[0].vector_index]; - uint64_t offset = 2; - if (metadata.operandCount == 3) { - offset = operands[2].get(); - } - results[0] = operands[1].get() + offset; + const uint16_t* t = sourceValues_[0].getAsVector(); + memoryData_[0] = t[metadata_.operands[0].vector_index]; + // if #imm post-index, value can only be 2 + const uint64_t postIndex = + (metadata_.operands[2].type == AARCH64_OP_REG) + ? sourceValues_[2].get() + : 2; + results_[0] = sourceValues_[1].get() + postIndex; break; } case Opcode::AArch64_ST1i32: { // st1 {vt.s}[index], [xn] // STORE - const uint32_t* t = operands[0].getAsVector(); - memoryData[0] = t[metadata.operands[0].vector_index]; + const uint32_t* t = sourceValues_[0].getAsVector(); + memoryData_[0] = t[metadata_.operands[0].vector_index]; break; } - case Opcode::AArch64_ST1i32_POST: { // st1 {vt.s}[index], [xn], xm - // st1 {vt.s}[index], [xn], #4 + case Opcode::AArch64_ST1i32_POST: { // st1 {vt.s}[index], [xn], // STORE - const uint32_t* t = operands[0].getAsVector(); - memoryData[0] = t[metadata.operands[0].vector_index]; - uint64_t offset = 4; - if (metadata.operandCount == 3) { - offset = operands[2].get(); - } - results[0] = operands[1].get() + offset; + const uint32_t* t = sourceValues_[0].getAsVector(); + memoryData_[0] = t[metadata_.operands[0].vector_index]; + // if #imm post-index, value can only be 4 + const uint64_t postIndex = + (metadata_.operands[2].type == AARCH64_OP_REG) + ? 
sourceValues_[2].get() + : 4; + results_[0] = sourceValues_[1].get() + postIndex; break; } case Opcode::AArch64_ST1i64: { // st1 {vt.d}[index], [xn] // STORE - const uint64_t* t = operands[0].getAsVector(); - memoryData[0] = t[metadata.operands[0].vector_index]; + const uint64_t* t = sourceValues_[0].getAsVector(); + memoryData_[0] = t[metadata_.operands[0].vector_index]; break; } - case Opcode::AArch64_ST1i64_POST: { // st1 {vt.d}[index], [xn], xm - // st1 {vt.d}[index], [xn], #8 + case Opcode::AArch64_ST1i64_POST: { // st1 {vt.d}[index], [xn], // STORE - const uint64_t* t = operands[0].getAsVector(); - memoryData[0] = t[metadata.operands[0].vector_index]; - uint64_t offset = 8; - if (metadata.operandCount == 3) { - offset = operands[2].get(); - } - results[0] = operands[1].get() + offset; + const uint64_t* t = sourceValues_[0].getAsVector(); + memoryData_[0] = t[metadata_.operands[0].vector_index]; + // if #imm post-index, value can only be 8 + const uint64_t postIndex = + (metadata_.operands[2].type == AARCH64_OP_REG) + ? sourceValues_[2].get() + : 8; + results_[0] = sourceValues_[1].get() + postIndex; break; } case Opcode::AArch64_ST1i8: { // st1 {vt.b}[index], [xn] // STORE - const uint8_t* t = operands[0].getAsVector(); - memoryData[0] = t[metadata.operands[0].vector_index]; + const uint8_t* t = sourceValues_[0].getAsVector(); + memoryData_[0] = t[metadata_.operands[0].vector_index]; break; } - case Opcode::AArch64_ST1i8_POST: { // st1 {vt.b}[index], [xn], xm - // st1 {vt.b}[index], [xn], #1 + case Opcode::AArch64_ST1i8_POST: { // st1 {vt.b}[index], [xn], // STORE - const uint8_t* t = operands[0].getAsVector(); - memoryData[0] = t[metadata.operands[0].vector_index]; - uint64_t offset = 1; - if (metadata.operandCount == 3) { - offset = operands[2].get(); - } - results[0] = RegisterValue(operands[1].get() + offset, 8); + const uint8_t* t = sourceValues_[0].getAsVector(); + memoryData_[0] = t[metadata_.operands[0].vector_index]; + // if #imm post-index, value can only be 1 + const uint64_t postIndex = + (metadata_.operands[2].type == AARCH64_OP_REG) + ? 
sourceValues_[2].get() + : 1; + results_[0] = sourceValues_[1].get() + postIndex; break; } case Opcode::AArch64_ST2D_IMM: { // st2d {zt1.d, zt2.d}, pg, [{, // #imm, mul vl}] // STORE - const uint64_t* d1 = operands[0].getAsVector(); - const uint64_t* d2 = operands[1].getAsVector(); - const uint64_t* p = operands[2].getAsVector(); + const uint64_t* d1 = sourceValues_[0].getAsVector(); + const uint64_t* d2 = sourceValues_[1].getAsVector(); + const uint64_t* p = sourceValues_[2].getAsVector(); std::vector memData; bool inActiveBlock = false; @@ -4385,52 +6248,52 @@ void Instruction::execute() { memData.push_back(d2[i]); } else if (inActiveBlock) { inActiveBlock = false; - memoryData[index] = RegisterValue( + memoryData_[index] = RegisterValue( (char*)memData.data(), sizeof(uint64_t) * memData.size()); index++; } } // Add final block if needed if (inActiveBlock) - memoryData[index] = RegisterValue((char*)memData.data(), - sizeof(uint64_t) * memData.size()); + memoryData_[index] = RegisterValue((char*)memData.data(), + sizeof(uint64_t) * memData.size()); break; } case Opcode::AArch64_ST2Twov4s_POST: { // st2 {vt1.4s, vt2.4s}, [xn], - // #imm + // // STORE - const float* t1 = operands[0].getAsVector(); - const float* t2 = operands[1].getAsVector(); + const float* t1 = sourceValues_[0].getAsVector(); + const float* t2 = sourceValues_[1].getAsVector(); std::vector m1 = {t1[0], t2[0], t1[1], t2[1]}; std::vector m2 = {t1[2], t2[2], t1[3], t2[3]}; - memoryData[0] = RegisterValue((char*)m1.data(), 4 * sizeof(float)); - memoryData[1] = RegisterValue((char*)m2.data(), 4 * sizeof(float)); - - uint64_t offset = 32; - if (metadata.operandCount == 4) { - offset = operands[3].get(); - } - results[0] = operands[2].get() + offset; + memoryData_[0] = RegisterValue((char*)m1.data(), 4 * sizeof(float)); + memoryData_[1] = RegisterValue((char*)m2.data(), 4 * sizeof(float)); + // if #imm post-index, value can only be 32 + const uint64_t postIndex = + (metadata_.operands[3].type == AARCH64_OP_REG) + ? 
sourceValues_[3].get() + : 32; + results_[0] = sourceValues_[2].get() + postIndex; break; } case Opcode::AArch64_STLRB: { // stlrb wt, [xn] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STLRW: // stlr wt, [xn] case Opcode::AArch64_STLRX: { // stlr xt, [xn] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STLXRW: // stlxr ws, wt, [xn] case Opcode::AArch64_STLXRX: { // stlxr ws, xt, [xn] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; // TODO: Implement atomic memory access - results[0] = static_cast(0); + results_[0] = static_cast(0); break; } case Opcode::AArch64_STPDi: // stp dt1, dt2, [xn, #imm] @@ -4438,8 +6301,8 @@ void Instruction::execute() { case Opcode::AArch64_STPSi: // stp st1, st2, [xn, #imm] case Opcode::AArch64_STPWi: // stp wt1, wt2, [xn, #imm] case Opcode::AArch64_STPXi: { // stp xt1, xt2, [xn, #imm] - memoryData[0] = operands[0]; - memoryData[1] = operands[1]; + memoryData_[0] = sourceValues_[0]; + memoryData_[1] = sourceValues_[1]; break; } case Opcode::AArch64_STPDpost: // stp dt1, dt2, [xn], #imm @@ -4447,9 +6310,10 @@ void Instruction::execute() { case Opcode::AArch64_STPSpost: // stp st1, st2, [xn], #imm case Opcode::AArch64_STPWpost: // stp wt1, wt2, [xn], #imm case Opcode::AArch64_STPXpost: { // stp xt1, xt2, [xn], #imm - memoryData[0] = operands[0]; - memoryData[1] = operands[1]; - results[0] = operands[2].get() + metadata.operands[3].imm; + memoryData_[0] = sourceValues_[0]; + memoryData_[1] = sourceValues_[1]; + results_[0] = + sourceValues_[2].get() + metadata_.operands[3].imm; break; } case Opcode::AArch64_STPDpre: // stp dt1, dt2, [xn, #imm]! @@ -4457,40 +6321,41 @@ void Instruction::execute() { case Opcode::AArch64_STPSpre: // stp st1, st2, [xn, #imm]! case Opcode::AArch64_STPWpre: // stp wt1, wt2, [xn, #imm]! case Opcode::AArch64_STPXpre: { // stp xt1, xt2, [xn, #imm]! - memoryData[0] = operands[0]; - memoryData[1] = operands[1]; - results[0] = - operands[2].get() + metadata.operands[2].mem.disp; + memoryData_[0] = sourceValues_[0]; + memoryData_[1] = sourceValues_[1]; + results_[0] = + sourceValues_[2].get() + metadata_.operands[2].mem.disp; break; } case Opcode::AArch64_STRBBpost: { // strb wd, [xn], #imm // STORE - memoryData[0] = operands[0]; - results[0] = operands[1].get() + metadata.operands[2].imm; + memoryData_[0] = sourceValues_[0]; + results_[0] = + sourceValues_[1].get() + metadata_.operands[2].imm; break; } case Opcode::AArch64_STRBBpre: { // strb wd, [xn, #imm]! 
// STORE - memoryData[0] = operands[0]; - results[0] = - operands[1].get() + metadata.operands[1].mem.disp; + memoryData_[0] = sourceValues_[0]; + results_[0] = + sourceValues_[1].get() + metadata_.operands[1].mem.disp; break; } case Opcode::AArch64_STRBBroW: { // strb wd, // [xn, wm{, extend {#amount}}] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STRBBroX: { // strb wd, // [xn, xm{, extend {#amount}}] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STRBBui: { // strb wd, [xn, #imm] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STRBui: // str bt, [xn, #imm] @@ -4500,7 +6365,7 @@ void Instruction::execute() { case Opcode::AArch64_STRSui: // str st, [xn, #imm] case Opcode::AArch64_STRWui: // str wt, [xn, #imm] case Opcode::AArch64_STRXui: { // str xt, [xn, #imm] - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STRBpost: // str bt, [xn], #imm @@ -4510,8 +6375,9 @@ void Instruction::execute() { case Opcode::AArch64_STRSpost: // str st, [xn], #imm case Opcode::AArch64_STRWpost: // str wt, [xn], #imm case Opcode::AArch64_STRXpost: { // str xt, [xn], #imm - memoryData[0] = operands[0]; - results[0] = operands[1].get() + metadata.operands[2].imm; + memoryData_[0] = sourceValues_[0]; + results_[0] = + sourceValues_[1].get() + metadata_.operands[2].imm; break; } case Opcode::AArch64_STRBpre: // str bt, [xn, #imm]! @@ -4521,110 +6387,126 @@ void Instruction::execute() { case Opcode::AArch64_STRSpre: // str st, [xn, #imm]! case Opcode::AArch64_STRWpre: // str wt, [xn, #imm]! case Opcode::AArch64_STRXpre: { // str xt, [xn, #imm]! - memoryData[0] = operands[0]; - results[0] = - operands[1].get() + metadata.operands[1].mem.disp; + memoryData_[0] = sourceValues_[0]; + results_[0] = + sourceValues_[1].get() + metadata_.operands[1].mem.disp; break; } case Opcode::AArch64_STRDroW: { // str dt, [xn, wm{, #extend {#amount}}] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STRDroX: { // str dt, [xn, xm{, #extend {#amount}}] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STRHHpost: { // strh wt, [xn], #imm // STORE - memoryData[0] = operands[0]; - results[0] = operands[1].get() + metadata.operands[2].imm; + memoryData_[0] = sourceValues_[0]; + results_[0] = + sourceValues_[1].get() + metadata_.operands[2].imm; break; } case Opcode::AArch64_STRHHpre: { // strh wd, [xn, #imm]! 
// STORE - memoryData[0] = operands[0]; - results[0] = - operands[1].get() + metadata.operands[1].mem.disp; + memoryData_[0] = sourceValues_[0]; + results_[0] = + sourceValues_[1].get() + metadata_.operands[1].mem.disp; break; } case Opcode::AArch64_STRHHroW: { // strh wd, // [xn, wm{, extend {#amount}}] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STRHHroX: { // strh wd, // [xn, xm{, extend {#amount}}] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STRHHui: { // strh wt, [xn, #imm] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STRQroX: { // str qt, [xn, xm{, extend, {#amount}}] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STRSroW: { // str st, [xn, wm{, #extend {#amount}}] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STRSroX: { // str st, [xn, xm{, #extend {#amount}}] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STRWroW: { // str wd, [xn, wm{, extend {#amount}}] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STRWroX: { // str wt, [xn, xm{, extend, {#amount}}] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STRXroW: { // str xd, [xn, wm{, extend {#amount}}] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STRXroX: { // str xt, [xn, xm{, extend, {#amount}}] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STR_PXI: { // str pt, [xn{, #imm, mul vl}] // STORE const uint64_t PL_bits = VL_bits / 8; const uint16_t partition_num = PL_bits / 8; - const uint8_t* p = operands[0].getAsVector(); - memoryData[0] = RegisterValue((char*)p, partition_num); + const uint8_t* p = sourceValues_[0].getAsVector(); + memoryData_[0] = RegisterValue((char*)p, partition_num); + break; + } + case Opcode::AArch64_STR_ZA: { // str za[wv, #imm], [xn|sp{, #imm, mul + // vl}] + // SME, STORE + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint32_t wv = sourceValues_[zaRowCount].get(); + const uint32_t imm = metadata_.operands[0].sme.slice_offset.imm; + + const uint8_t* zaRow = + sourceValues_[(wv + imm) % zaRowCount].getAsVector(); + memoryData_[0] = RegisterValue((char*)zaRow, zaRowCount); break; } case Opcode::AArch64_STR_ZXI: { // str zt, [xn{, #imm, mul vl}] // STORE const uint16_t partition_num = VL_bits / 8; - const uint8_t* z = operands[0].getAsVector(); - memoryData[0] = RegisterValue((char*)z, partition_num); + const uint8_t* z = sourceValues_[0].getAsVector(); + memoryData_[0] = RegisterValue((char*)z, partition_num); break; } case Opcode::AArch64_STURBBi: { // sturb wd, [xn, #imm] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STURDi: // stur dt, [xn, #imm] case Opcode::AArch64_STURHHi: { // sturh wt, [xn, #imm] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STURQi: // stur qt, [xn, #imm] @@ -4632,151 +6514,316 @@ void Instruction::execute() { case Opcode::AArch64_STURWi: // stur wt, [xn, #imm] case 
Opcode::AArch64_STURXi: { // stur xt, [xn, #imm] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STXRW: { // stxr ws, wt, [xn] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; // TODO: Implement atomic memory access - results[0] = static_cast(0); + results_[0] = static_cast(0); break; } case Opcode::AArch64_STXRX: { // stxr ws, xt, [xn] // STORE - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; // TODO: Implement atomic memory access - results[0] = static_cast(0); + results_[0] = static_cast(0); break; } case Opcode::AArch64_SUBSWri: { // subs wd, wn, #imm auto [result, nzcv] = - arithmeticHelp::subShift_imm(operands, metadata, true); - results[0] = nzcv; - results[1] = {result, 8}; + subShift_imm(sourceValues_, metadata_, true); + results_[0] = nzcv; + results_[1] = {result, 8}; break; } case Opcode::AArch64_SUBSWrs: { // subs wd, wn, wm{, shift #amount} auto [result, nzcv] = - arithmeticHelp::subShift_3ops(operands, metadata, true); - results[0] = nzcv; - results[1] = {result, 8}; + subShift_3ops(sourceValues_, metadata_, true); + results_[0] = nzcv; + results_[1] = {result, 8}; break; } case Opcode::AArch64_SUBSWrx: { // subs wd, wn, wm{, extend #amount} auto [result, nzcv] = - arithmeticHelp::subExtend_3ops(operands, metadata, true); - results[0] = nzcv; - results[1] = {result, 8}; + subExtend_3ops(sourceValues_, metadata_, true); + results_[0] = nzcv; + results_[1] = {result, 8}; break; } case Opcode::AArch64_SUBSXri: { // subs xd, xn, #imm auto [result, nzcv] = - arithmeticHelp::subShift_imm(operands, metadata, true); - results[0] = nzcv; - results[1] = result; + subShift_imm(sourceValues_, metadata_, true); + results_[0] = nzcv; + results_[1] = result; break; } case Opcode::AArch64_SUBSXrs: { // subs xd, xn, xm{, shift #amount} auto [result, nzcv] = - arithmeticHelp::subShift_3ops(operands, metadata, true); - results[0] = nzcv; - results[1] = result; + subShift_3ops(sourceValues_, metadata_, true); + results_[0] = nzcv; + results_[1] = result; break; } case Opcode::AArch64_SUBSXrx: // subs xd, xn, wm{, extend #amount} case Opcode::AArch64_SUBSXrx64: { // subs xd, xn, xm{, extend #amount} auto [result, nzcv] = - arithmeticHelp::subExtend_3ops(operands, metadata, true); - results[0] = nzcv; - results[1] = result; + subExtend_3ops(sourceValues_, metadata_, true); + results_[0] = nzcv; + results_[1] = result; break; } case Opcode::AArch64_SUBWri: { // sub wd, wn, #imm{, } auto [result, nzcv] = - arithmeticHelp::subShift_imm(operands, metadata, false); - results[0] = {result, 8}; + subShift_imm(sourceValues_, metadata_, false); + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = {result, 8}; break; } case Opcode::AArch64_SUBWrs: { // sub wd, wn, wm{, shift #amount} auto [result, nzcv] = - arithmeticHelp::subShift_3ops(operands, metadata, false); - results[0] = {result, 8}; + subShift_3ops(sourceValues_, metadata_, false); + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = {result, 8}; break; } case Opcode::AArch64_SUBXri: { // sub xd, xn, #imm{, } auto [result, nzcv] = - arithmeticHelp::subShift_imm(operands, metadata, false); - results[0] = result; + subShift_imm(sourceValues_, metadata_, false); + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = result; break; } case Opcode::AArch64_SUBXrs: { // sub xd, xn, xm{, shift #amount} auto [result, nzcv] = - arithmeticHelp::subShift_3ops(operands, metadata, false); - 
results[0] = result; + subShift_3ops(sourceValues_, metadata_, false); + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = result; break; } case Opcode::AArch64_SUBXrx: // sub xd, xn, wm{, extend #amount} case Opcode::AArch64_SUBXrx64: { // sub xd, xn, xm{, extend #amount} auto [result, nzcv] = - arithmeticHelp::subExtend_3ops(operands, metadata, false); - results[0] = result; + subExtend_3ops(sourceValues_, metadata_, false); + (void)nzcv; // Prevent unused variable warnings in GCC7 + results_[0] = result; break; } case Opcode::AArch64_SUB_ZZZ_B: { // sub zd.b, zn.b, zm.b - results[0] = sveHelp::sveSub_3vecs(operands, VL_bits); + results_[0] = sveSub_3vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_SUB_ZZZ_D: { // sub zd.d, zn.d, zm.d - results[0] = sveHelp::sveSub_3vecs(operands, VL_bits); + results_[0] = sveSub_3vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_SUB_ZZZ_H: { // sub zd.h, zn.h, zm.h - results[0] = sveHelp::sveSub_3vecs(operands, VL_bits); + results_[0] = sveSub_3vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_SUB_ZZZ_S: { // sub zd.s, zn.s, zm.s - results[0] = sveHelp::sveSub_3vecs(operands, VL_bits); + results_[0] = sveSub_3vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_SUBv16i8: { // sub vd.16b, vn.16b, vm.16b - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](uint8_t x, uint8_t y) -> uint8_t { return x - y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, + [](uint8_t x, uint8_t y) -> uint8_t { return x - y; }); break; } case Opcode::AArch64_SUBv1i64: { // sub dd, dn, dm - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](uint64_t x, uint64_t y) -> uint64_t { return x - y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, + [](uint64_t x, uint64_t y) -> uint64_t { return x - y; }); break; } case Opcode::AArch64_SUBv2i32: { // sub vd.2s, vn.2s, vm.2s - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](uint32_t x, uint32_t y) -> uint32_t { return x - y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, + [](uint32_t x, uint32_t y) -> uint32_t { return x - y; }); break; } case Opcode::AArch64_SUBv2i64: { // sub vd.2d, vn.2d, vm.2d - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](uint64_t x, uint64_t y) -> uint64_t { return x - y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, + [](uint64_t x, uint64_t y) -> uint64_t { return x - y; }); break; } case Opcode::AArch64_SUBv4i16: { // sub vd.4h, vn.4h, vm.4h - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](uint64_t x, uint16_t y) -> uint16_t { return x - y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, + [](uint64_t x, uint16_t y) -> uint16_t { return x - y; }); break; } case Opcode::AArch64_SUBv4i32: { // sub vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](uint32_t x, uint32_t y) -> uint32_t { return x - y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, + [](uint32_t x, uint32_t y) -> uint32_t { return x - y; }); break; } case Opcode::AArch64_SUBv8i16: { // sub vd.8h, vn.8h, vm.8h - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](uint16_t x, uint16_t y) -> uint16_t { return x - y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, + [](uint16_t x, uint16_t y) -> uint16_t { return x - y; }); break; } case Opcode::AArch64_SUBv8i8: { // sub vd.8b, vn.8b, vm.8b - results[0] = neonHelp::vecLogicOp_3vecs( - operands, [](uint8_t x, uint8_t y) -> uint8_t { return x - y; }); + results_[0] = vecLogicOp_3vecs( + sourceValues_, + [](uint8_t x, uint8_t 
y) -> uint8_t { return x - y; }); + break; + } + case Opcode::AArch64_SUMOPA_MPPZZ_D: { // sumopa zada.d, pn/m, pm/m, + // zn.h, zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + int64_t outRow[32] = {0}; + const int64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_SUMOPA_MPPZZ_S: { // sumopa zada.s, pn/m, pm/m, + // zn.b, zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + int32_t outRow[64] = {0}; + const int32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_SUMOPS_MPPZZ_D: { // sumops zada.d, pn/m, pm/m, + // zn.h, zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + int64_t outRow[32] = {0}; + 
const int64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_SUMOPS_MPPZZ_S: { // sumops zada.s, pn/m, pm/m, + // zn.b, zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + int32_t outRow[64] = {0}; + const int32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } break; } case Opcode::AArch64_SVC: { // svc #imm @@ -4785,8 +6832,8 @@ void Instruction::execute() { break; } case Opcode::AArch64_SXTW_ZPmZ_D: { // sxtw zd.d, pg/m, zn.d - results[0] = - sveHelp::sveSxtPredicated(operands, VL_bits); + results_[0] = + sveSxtPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_SYSxt: { // sys #, cn, cm, #{, xt} @@ -4804,467 +6851,776 @@ void Instruction::execute() { } case Opcode::AArch64_TBLv16i8Four: { // tbl Vd.16b {Vn.16b, Vn+1.16b, // Vn+2.16b,Vn+3.16b } Vm.16b - results[0] = neonHelp::vecTbl<16>(operands, metadata); + results_[0] = vecTbl<16>(sourceValues_, metadata_); break; } case Opcode::AArch64_TBLv16i8One: { // tbl Vd.16b {Vn.16b} Vm.16b - results[0] = neonHelp::vecTbl<16>(operands, metadata); + results_[0] = vecTbl<16>(sourceValues_, metadata_); break; } case Opcode::AArch64_TBLv16i8Three: { // tbl Vd.16b {Vn.16b, Vn+1.16b, // Vn+2.16b } Vm.16b - results[0] = neonHelp::vecTbl<16>(operands, metadata); + results_[0] = vecTbl<16>(sourceValues_, metadata_); break; } case Opcode::AArch64_TBLv16i8Two: { // tbl Vd.16b {Vn.16b, Vn+1.16b } // Vm.16b - results[0] = neonHelp::vecTbl<16>(operands, metadata); + results_[0] = vecTbl<16>(sourceValues_, metadata_); break; } case Opcode::AArch64_TBLv8i8Four: { // tbl Vd.8b {Vn.16b, Vn+1.16b, // Vn+2.16b,Vn+3.16b } Vm.8b - results[0] = neonHelp::vecTbl<8>(operands, metadata); + results_[0] = vecTbl<8>(sourceValues_, metadata_); break; } case Opcode::AArch64_TBLv8i8One: { // tbl Vd.8b {Vn.16b} Vm.8b - results[0] = 
neonHelp::vecTbl<8>(operands, metadata); + results_[0] = vecTbl<8>(sourceValues_, metadata_); break; } case Opcode::AArch64_TBLv8i8Three: { // tbl Vd.8b {Vn.16b, Vn+1.16b, // Vn+2.16b } Vm.8b - results[0] = neonHelp::vecTbl<8>(operands, metadata); + results_[0] = vecTbl<8>(sourceValues_, metadata_); break; } case Opcode::AArch64_TBLv8i8Two: { // tbl Vd.8b {Vn.16b, Vn+1.16b } Vm.8b - results[0] = neonHelp::vecTbl<8>(operands, metadata); + results_[0] = vecTbl<8>(sourceValues_, metadata_); break; } case Opcode::AArch64_TBNZW: { // tbnz wn, #imm, label - auto [taken, addr] = conditionalHelp::tbnz_tbz( - operands, metadata, instructionAddress_, true); + auto [taken, addr] = tbnz_tbz(sourceValues_, metadata_, + instructionAddress_, true); branchTaken_ = taken; branchAddress_ = addr; break; } case Opcode::AArch64_TBNZX: { // tbnz xn, #imm, label - auto [taken, addr] = conditionalHelp::tbnz_tbz( - operands, metadata, instructionAddress_, true); + auto [taken, addr] = tbnz_tbz(sourceValues_, metadata_, + instructionAddress_, true); branchTaken_ = taken; branchAddress_ = addr; break; } case Opcode::AArch64_TBZW: { // tbz wn, #imm, label - auto [taken, addr] = conditionalHelp::tbnz_tbz( - operands, metadata, instructionAddress_, false); + auto [taken, addr] = tbnz_tbz(sourceValues_, metadata_, + instructionAddress_, false); branchTaken_ = taken; branchAddress_ = addr; break; } case Opcode::AArch64_TBZX: { // tbz xn, #imm, label - auto [taken, addr] = conditionalHelp::tbnz_tbz( - operands, metadata, instructionAddress_, false); + auto [taken, addr] = tbnz_tbz(sourceValues_, metadata_, + instructionAddress_, false); branchTaken_ = taken; branchAddress_ = addr; break; } case Opcode::AArch64_TRN1_ZZZ_B: { // trn1 zd.b, zn.b, zm.b - results[0] = sveHelp::sveTrn1_3vecs(operands, VL_bits); + results_[0] = sveTrn1_3vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_TRN1_ZZZ_D: { // trn1 zd.d, zn.d, zm.d - results[0] = sveHelp::sveTrn1_3vecs(operands, VL_bits); + results_[0] = sveTrn1_3vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_TRN1_ZZZ_H: { // trn1 zd.h, zn.h, zm.h - results[0] = sveHelp::sveTrn1_3vecs(operands, VL_bits); + results_[0] = sveTrn1_3vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_TRN1_ZZZ_S: { // trn1 zd.s, zn.s, zm.s - results[0] = sveHelp::sveTrn1_3vecs(operands, VL_bits); + results_[0] = sveTrn1_3vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_TRN1v16i8: { // trn1 vd.16b, vn.16b, vm.16b - results[0] = neonHelp::vecTrn1(operands); + results_[0] = vecTrn1(sourceValues_); break; } case Opcode::AArch64_TRN1v2i32: { // trn1 vd.2s, vn.2s, vm.2s - results[0] = neonHelp::vecTrn1(operands); + results_[0] = vecTrn1(sourceValues_); break; } case Opcode::AArch64_TRN1v2i64: { // trn1 vd.2d, vn.2d, vm.2d - results[0] = neonHelp::vecTrn1(operands); + results_[0] = vecTrn1(sourceValues_); break; } case Opcode::AArch64_TRN1v4i16: { // trn1 vd.4h, vn.4h, vm.4h - results[0] = neonHelp::vecTrn1(operands); + results_[0] = vecTrn1(sourceValues_); break; } case Opcode::AArch64_TRN1v4i32: { // trn1 vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecTrn1(operands); + results_[0] = vecTrn1(sourceValues_); break; } case Opcode::AArch64_TRN1v8i16: { // trn1 vd.8h, vn.8h, vm.8h - results[0] = neonHelp::vecTrn1(operands); + results_[0] = vecTrn1(sourceValues_); break; } case Opcode::AArch64_TRN1v8i8: { // trn1 vd.8b, vn.8b, vm.8b - results[0] = neonHelp::vecTrn1(operands); + results_[0] = vecTrn1(sourceValues_); break; } case Opcode::AArch64_TRN2_ZZZ_B: { // trn2 zd.b, 
zn.b, zm.b - results[0] = sveHelp::sveTrn2_3vecs(operands, VL_bits); + results_[0] = sveTrn2_3vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_TRN2_ZZZ_D: { // trn2 zd.d, zn.d, zm.d - results[0] = sveHelp::sveTrn2_3vecs(operands, VL_bits); + results_[0] = sveTrn2_3vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_TRN2_ZZZ_H: { // trn2 zd.h, zn.h, zm.h - results[0] = sveHelp::sveTrn2_3vecs(operands, VL_bits); + results_[0] = sveTrn2_3vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_TRN2_ZZZ_S: { // trn2 zd.s, zn.s, zm.s - results[0] = sveHelp::sveTrn2_3vecs(operands, VL_bits); + results_[0] = sveTrn2_3vecs(sourceValues_, VL_bits); break; } case Opcode::AArch64_TRN2v16i8: { // trn2 vd.16b, vn.16b, vm.16b - results[0] = neonHelp::vecTrn2(operands); + results_[0] = vecTrn2(sourceValues_); break; } case Opcode::AArch64_TRN2v2i32: { // trn2 vd.2s, vn.2s, vm.2s - results[0] = neonHelp::vecTrn2(operands); + results_[0] = vecTrn2(sourceValues_); break; } case Opcode::AArch64_TRN2v2i64: { // trn2 vd.2d, vn.2d, vm.2d - results[0] = neonHelp::vecTrn2(operands); + results_[0] = vecTrn2(sourceValues_); break; } case Opcode::AArch64_TRN2v4i16: { // trn2 vd.4h, vn.4h, vm.4h - results[0] = neonHelp::vecTrn2(operands); + results_[0] = vecTrn2(sourceValues_); break; } case Opcode::AArch64_TRN2v4i32: { // trn2 vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecTrn2(operands); + results_[0] = vecTrn2(sourceValues_); break; } case Opcode::AArch64_TRN2v8i16: { // trn2 vd.8h, vn.8h, vm.8h - results[0] = neonHelp::vecTrn2(operands); + results_[0] = vecTrn2(sourceValues_); break; } case Opcode::AArch64_TRN2v8i8: { // trn2 vd.8b, vn.8b, vm.8b - results[0] = neonHelp::vecTrn2(operands); + results_[0] = vecTrn2(sourceValues_); break; } case Opcode::AArch64_UADDV_VPZ_B: { // uaddv dd, pg, zn.b - results[0] = sveHelp::sveAddvPredicated(operands, VL_bits); + results_[0] = sveAddvPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_UADDV_VPZ_D: { // uaddv dd, pg, zn.d - results[0] = sveHelp::sveAddvPredicated(operands, VL_bits); + results_[0] = sveAddvPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_UADDV_VPZ_H: { // uaddv dd, pg, zn.h - results[0] = sveHelp::sveAddvPredicated(operands, VL_bits); + results_[0] = sveAddvPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_UADDV_VPZ_S: { // uaddv dd, pg, zn.s - results[0] = sveHelp::sveAddvPredicated(operands, VL_bits); + results_[0] = sveAddvPredicated(sourceValues_, VL_bits); break; } case Opcode::AArch64_UBFMWri: { // ubfm wd, wn, #immr, #imms - results[0] = { - bitmanipHelp::bfm_2imms(operands, metadata, false, true), - 8}; + results_[0] = { + bfm_2imms(sourceValues_, metadata_, false, true), 8}; break; } case Opcode::AArch64_UBFMXri: { // ubfm xd, xn, #immr, #imms - results[0] = - bitmanipHelp::bfm_2imms(operands, metadata, false, true); + results_[0] = + bfm_2imms(sourceValues_, metadata_, false, true); break; } case Opcode::AArch64_UCVTFUWDri: { // ucvtf dd, wn - results[0] = {static_cast(operands[0].get()), 256}; + results_[0] = {static_cast(sourceValues_[0].get()), + 256}; break; } case Opcode::AArch64_UCVTFUWSri: { // ucvtf sd, wn - results[0] = {static_cast(operands[0].get()), 256}; + results_[0] = {static_cast(sourceValues_[0].get()), + 256}; break; } case Opcode::AArch64_UCVTFUXDri: { // ucvtf dd, xn - results[0] = {static_cast(operands[0].get()), 256}; + results_[0] = {static_cast(sourceValues_[0].get()), + 256}; break; } case Opcode::AArch64_UCVTFUXSri: { // ucvtf sd, xn - results[0] = 
{static_cast(operands[0].get()), 256}; + results_[0] = {static_cast(sourceValues_[0].get()), + 256}; break; } case Opcode::AArch64_UCVTFv1i32: { // ucvtf sd, sn - results[0] = {static_cast(operands[0].get()), 256}; + results_[0] = {static_cast(sourceValues_[0].get()), + 256}; break; } case Opcode::AArch64_UCVTFv1i64: { // ucvtf dd, dn - results[0] = {static_cast(operands[0].get()), 256}; + results_[0] = {static_cast(sourceValues_[0].get()), + 256}; break; } case Opcode::AArch64_UDIVWr: { // udiv wd, wn, wm - results[0] = {divideHelp::div_3ops(operands), 8}; + results_[0] = {div_3ops(sourceValues_), 8}; break; } case Opcode::AArch64_UDIVXr: { // udiv xd, xn, xm - results[0] = {divideHelp::div_3ops(operands), 8}; + results_[0] = {div_3ops(sourceValues_), 8}; break; } case Opcode::AArch64_UMADDLrrr: { // umaddl xd, wn, wm, xa - results[0] = multiplyHelp::maddl_4ops(operands); + results_[0] = maddl_4ops(sourceValues_); break; } case Opcode::AArch64_UMAXPv16i8: { // umaxp vd.16b, vn.16b, vm.16b - results[0] = neonHelp::vecUMaxP(operands); + results_[0] = vecUMaxP(sourceValues_); break; } case Opcode::AArch64_UMINPv16i8: { // uminp vd.16b, vn.16b, vm.16b - results[0] = neonHelp::vecUMinP(operands); + results_[0] = vecUMinP(sourceValues_); + break; + } + case Opcode::AArch64_UMOPA_MPPZZ_D: { // umopa zada.d, pn/m, pm/m, zn.h, + // zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + uint64_t outRow[32] = {0}; + const uint64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + uint64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_UMOPA_MPPZZ_S: { // umopa zada.s, pn/m, pm/m, zn.b, + // zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + uint32_t outRow[64] = {0}; + const uint32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + uint32_t sum = 
zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_UMOPS_MPPZZ_D: { // umops zada.d, pn/m, pm/m, zn.h, + // zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + uint64_t outRow[32] = {0}; + const uint64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + uint64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_UMOPS_MPPZZ_S: { // umops zada.s, pn/m, pm/m, zn.b, + // zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + uint32_t outRow[64] = {0}; + const uint32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + uint32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } break; } case Opcode::AArch64_UMOVvi32_idx0: // umov wd, vn.s[0] case Opcode::AArch64_UMOVvi32: { // umov wd, vn.s[index] - const uint32_t* vec = operands[0].getAsVector(); - results[0] = {vec[metadata.operands[1].vector_index], 8}; + const uint32_t* vec = sourceValues_[0].getAsVector(); + results_[0] = 
{vec[metadata_.operands[1].vector_index], 8}; break; } case Opcode::AArch64_UMOVvi64_idx0: // umov xd, vn.d[0] case Opcode::AArch64_UMOVvi64: { // umov xd, vn.d[index] - const uint64_t* vec = operands[0].getAsVector(); - results[0] = vec[metadata.operands[1].vector_index]; + const uint64_t* vec = sourceValues_[0].getAsVector(); + results_[0] = vec[metadata_.operands[1].vector_index]; break; } case Opcode::AArch64_UMOVvi8_idx0: // umov wd, vn.b[0] case Opcode::AArch64_UMOVvi8: { // umov wd, vn.b[index] - const uint8_t* vec = operands[0].getAsVector(); - results[0] = {vec[metadata.operands[1].vector_index], 8}; + const uint8_t* vec = sourceValues_[0].getAsVector(); + results_[0] = {vec[metadata_.operands[1].vector_index], 8}; break; } case Opcode::AArch64_UMSUBLrrr: { // umsubl xd, wn, wm, xa - results[0] = arithmeticHelp::msubl_4ops(operands); + results_[0] = msubl_4ops(sourceValues_); break; } case Opcode::AArch64_UMULHrr: { // umulh xd, xn, xm - results[0] = AuxFunc::mulhi(operands[0].get(), - operands[1].get()); + results_[0] = mulhi(sourceValues_[0].get(), + sourceValues_[1].get()); break; } case Opcode::AArch64_UQDECD_WPiI: { // uqdecd wd{, pattern{, MUL #imm}} - results[0] = - sveHelp::sveUqdec(operands, metadata, VL_bits); + results_[0] = + sveUqdec(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_UQDECD_XPiI: { // uqdecd xd{, pattern{, MUL #imm}} - results[0] = - sveHelp::sveUqdec(operands, metadata, VL_bits); + results_[0] = + sveUqdec(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_UQDECH_XPiI: { // uqdech xd{, pattern{, MUL #imm}} - results[0] = - sveHelp::sveUqdec(operands, metadata, VL_bits); + results_[0] = + sveUqdec(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_UQDECW_XPiI: { // uqdecw xd{, pattern{, MUL #imm}} - results[0] = - sveHelp::sveUqdec(operands, metadata, VL_bits); + results_[0] = + sveUqdec(sourceValues_, metadata_, VL_bits); break; } case Opcode::AArch64_USHLLv16i8_shift: { // ushll2 vd.8h, vn.16b, #imm - results[0] = neonHelp::vecShllShift_vecImm( - operands, metadata, true); + results_[0] = vecShllShift_vecImm( + sourceValues_, metadata_, true); break; } case Opcode::AArch64_USHLLv4i16_shift: { // ushll vd.4s, vn.4h, #imm - results[0] = neonHelp::vecShllShift_vecImm( - operands, metadata, false); + results_[0] = vecShllShift_vecImm( + sourceValues_, metadata_, false); break; } case Opcode::AArch64_USHLLv8i16_shift: { // ushll2 vd.4s, vn.8h, #imm - results[0] = neonHelp::vecShllShift_vecImm( - operands, metadata, true); + results_[0] = vecShllShift_vecImm( + sourceValues_, metadata_, true); break; } case Opcode::AArch64_USHLLv8i8_shift: { // ushll vd.8h, vn.8b, #imm - results[0] = neonHelp::vecShllShift_vecImm( - operands, metadata, false); + results_[0] = vecShllShift_vecImm( + sourceValues_, metadata_, false); + break; + } + case Opcode::AArch64_USMOPA_MPPZZ_D: { // usmopa zada.d, pn/m, pm/m, + // zn.h, zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < 
tileDim; row++) { + int64_t outRow[32] = {0}; + const int64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_USMOPA_MPPZZ_S: { // usmopa zada.s, pn/m, pm/m, + // zn.b, zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + int32_t outRow[64] = {0}; + const int32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_USMOPS_MPPZZ_D: { // usmops zada.d, pn/m, pm/m, + // zn.h, zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + int64_t outRow[32] = {0}; + const int64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_USMOPS_MPPZZ_S: { // usmops zada.s, pn/m, pm/m, + // zn.b, zm.b + 
// SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + int32_t outRow[64] = {0}; + const int32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } break; } case Opcode::AArch64_UUNPKHI_ZZ_D: { // uunpkhi zd.d, zn.s - results[0] = - sveHelp::sveUnpk_vecs(operands, VL_bits, true); + results_[0] = + sveUnpk_vecs(sourceValues_, VL_bits, true); break; } case Opcode::AArch64_UUNPKHI_ZZ_H: { // uunpkhi zd.h, zn.b - results[0] = - sveHelp::sveUnpk_vecs(operands, VL_bits, true); + results_[0] = + sveUnpk_vecs(sourceValues_, VL_bits, true); break; } case Opcode::AArch64_UUNPKHI_ZZ_S: { // uunpkhi zd.s, zn.h - results[0] = - sveHelp::sveUnpk_vecs(operands, VL_bits, true); + results_[0] = + sveUnpk_vecs(sourceValues_, VL_bits, true); break; } case Opcode::AArch64_UUNPKLO_ZZ_D: { // uunpklo zd.d, zn.s - results[0] = - sveHelp::sveUnpk_vecs(operands, VL_bits, false); + results_[0] = + sveUnpk_vecs(sourceValues_, VL_bits, false); break; } case Opcode::AArch64_UUNPKLO_ZZ_H: { // uunpklo zd.h, zn.b - results[0] = - sveHelp::sveUnpk_vecs(operands, VL_bits, false); + results_[0] = + sveUnpk_vecs(sourceValues_, VL_bits, false); break; } case Opcode::AArch64_UUNPKLO_ZZ_S: { // uunpklo zd.s, zn.h - results[0] = - sveHelp::sveUnpk_vecs(operands, VL_bits, false); + results_[0] = + sveUnpk_vecs(sourceValues_, VL_bits, false); break; } case Opcode::AArch64_UZP1_ZZZ_S: { // uzp1 zd.s, zn.s, zm.s - results[0] = sveHelp::sveUzp_vecs(operands, VL_bits, true); + results_[0] = sveUzp_vecs(sourceValues_, VL_bits, true); break; } case Opcode::AArch64_UZP1v16i8: { // uzp1 vd.16b, vn.16b, vm.16b - results[0] = neonHelp::vecUzp(operands, true); + results_[0] = vecUzp(sourceValues_, true); break; } case Opcode::AArch64_UZP1v2i32: { // uzp1 vd.2s, vn.2s, vm.2s - results[0] = neonHelp::vecUzp(operands, true); + results_[0] = vecUzp(sourceValues_, true); break; } case Opcode::AArch64_UZP1v2i64: { // uzp1 vd.2d, vn.2d, vm.2d - results[0] = neonHelp::vecUzp(operands, true); + results_[0] = vecUzp(sourceValues_, true); break; } case Opcode::AArch64_UZP1v4i16: { // uzp1 vd.4h, vn.4h, vm.4h - results[0] = neonHelp::vecUzp(operands, true); + results_[0] = vecUzp(sourceValues_, true); break; } case Opcode::AArch64_UZP1v4i32: { // uzp1 vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecUzp(operands, true); + results_[0] = vecUzp(sourceValues_, true); break; } case Opcode::AArch64_UZP1v8i16: { // uzp1 vd.8h, vn.8h, vm.8h - results[0] = 
neonHelp::vecUzp(operands, true); + results_[0] = vecUzp(sourceValues_, true); break; } case Opcode::AArch64_UZP1v8i8: { // uzp1 vd.8b, vn.8b, vm.8b - results[0] = neonHelp::vecUzp(operands, true); + results_[0] = vecUzp(sourceValues_, true); break; } case Opcode::AArch64_UZP2v16i8: { // uzp2 vd.16b, vn.16b, vm.16b - results[0] = neonHelp::vecUzp(operands, false); + results_[0] = vecUzp(sourceValues_, false); break; } case Opcode::AArch64_UZP2v2i32: { // uzp2 vd.2s, vn.2s, vm.2s - results[0] = neonHelp::vecUzp(operands, false); + results_[0] = vecUzp(sourceValues_, false); break; } case Opcode::AArch64_UZP2v2i64: { // uzp2 vd.2d, vn.2d, vm.2d - results[0] = neonHelp::vecUzp(operands, false); + results_[0] = vecUzp(sourceValues_, false); break; } case Opcode::AArch64_UZP2v4i16: { // uzp2 vd.4h, vn.4h, vm.4h - results[0] = neonHelp::vecUzp(operands, false); + results_[0] = vecUzp(sourceValues_, false); break; } case Opcode::AArch64_UZP2v4i32: { // uzp2 vd.4s, vn.4s, vm.4s - results[0] = neonHelp::vecUzp(operands, false); + results_[0] = vecUzp(sourceValues_, false); break; } case Opcode::AArch64_UZP2v8i16: { // uzp2 vd.8h, vn.8h, vm.8h - results[0] = neonHelp::vecUzp(operands, false); + results_[0] = vecUzp(sourceValues_, false); break; } case Opcode::AArch64_UZP2v8i8: { // uzp2 vd.8b, vn.8b, vm.8b - results[0] = neonHelp::vecUzp(operands, false); + results_[0] = vecUzp(sourceValues_, false); break; } case Opcode::AArch64_WHILELO_PWW_B: { // whilelo pd.b, wn, wm auto [output, nzcv] = - sveHelp::sveWhilelo(operands, VL_bits, true); - results[0] = nzcv; - results[1] = output; + sveWhilelo(sourceValues_, VL_bits, true); + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_WHILELO_PWW_D: { // whilelo pd.d, wn, wm auto [output, nzcv] = - sveHelp::sveWhilelo(operands, VL_bits, true); - results[0] = nzcv; - results[1] = output; + sveWhilelo(sourceValues_, VL_bits, true); + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_WHILELO_PWW_H: { // whilelo pd.h, wn, wm auto [output, nzcv] = - sveHelp::sveWhilelo(operands, VL_bits, true); - results[0] = nzcv; - results[1] = output; + sveWhilelo(sourceValues_, VL_bits, true); + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_WHILELO_PWW_S: { // whilelo pd.s, wn, wm auto [output, nzcv] = - sveHelp::sveWhilelo(operands, VL_bits, true); - results[0] = nzcv; - results[1] = output; + sveWhilelo(sourceValues_, VL_bits, true); + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_WHILELO_PXX_B: { // whilelo pd.b, xn, xm auto [output, nzcv] = - sveHelp::sveWhilelo(operands, VL_bits, true); - results[0] = nzcv; - results[1] = output; + sveWhilelo(sourceValues_, VL_bits, true); + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_WHILELO_PXX_D: { // whilelo pd.d, xn, xm auto [output, nzcv] = - sveHelp::sveWhilelo(operands, VL_bits, true); - results[0] = nzcv; - results[1] = output; + sveWhilelo(sourceValues_, VL_bits, true); + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_WHILELO_PXX_H: { // whilelo pd.h, xn, xm auto [output, nzcv] = - sveHelp::sveWhilelo(operands, VL_bits, true); - results[0] = nzcv; - results[1] = output; + sveWhilelo(sourceValues_, VL_bits, true); + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_WHILELO_PXX_S: { // whilelo pd.s, xn, xm auto [output, nzcv] = - sveHelp::sveWhilelo(operands, VL_bits, true); - results[0] = nzcv; - results[1] = output; + sveWhilelo(sourceValues_, 
VL_bits, true); + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_WHILELT_PXX_B: { // whilelt pd.b, xn, xm auto [output, nzcv] = - sveHelp::sveWhilelo(operands, VL_bits, true); - results[0] = nzcv; - results[1] = output; + sveWhilelo(sourceValues_, VL_bits, true); + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_WHILELT_PXX_D: { // whilelt pd.d, xn, xm auto [output, nzcv] = - sveHelp::sveWhilelo(operands, VL_bits, true); - results[0] = nzcv; - results[1] = output; + sveWhilelo(sourceValues_, VL_bits, true); + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_WHILELT_PXX_H: { // whilelt pd.h, xn, xm auto [output, nzcv] = - sveHelp::sveWhilelo(operands, VL_bits, true); - results[0] = nzcv; - results[1] = output; + sveWhilelo(sourceValues_, VL_bits, true); + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_WHILELT_PXX_S: { // whilelt pd.s, xn, xm auto [output, nzcv] = - sveHelp::sveWhilelo(operands, VL_bits, true); - results[0] = nzcv; - results[1] = output; + sveWhilelo(sourceValues_, VL_bits, true); + results_[0] = nzcv; + results_[1] = output; break; } case Opcode::AArch64_XPACLRI: { // xpaclri @@ -5272,73 +7628,128 @@ void Instruction::execute() { break; } case Opcode::AArch64_XTNv2i32: { // xtn vd.2s, vn.2d - results[0] = neonHelp::vecXtn(operands, false); + results_[0] = vecXtn(sourceValues_, false); break; } case Opcode::AArch64_XTNv4i16: { // xtn vd.4h, vn.4s - results[0] = neonHelp::vecXtn(operands, false); + results_[0] = vecXtn(sourceValues_, false); break; } case Opcode::AArch64_XTNv4i32: { // xtn2 vd.4s, vn.2d - results[0] = neonHelp::vecXtn(operands, true); + results_[0] = vecXtn(sourceValues_, true); break; } case Opcode::AArch64_ZIP1_PPP_B: { // zip1 pd.b, pn.b, pm.b - results[0] = sveHelp::sveZip_preds(operands, VL_bits, false); + results_[0] = sveZip_preds(sourceValues_, VL_bits, false); break; } case Opcode::AArch64_ZIP1_PPP_D: { // zip1 pd.d, pn.d, pm.d - results[0] = sveHelp::sveZip_preds(operands, VL_bits, false); + results_[0] = sveZip_preds(sourceValues_, VL_bits, false); break; } case Opcode::AArch64_ZIP1_PPP_H: { // zip1 pd.h, pn.h, pm.h - results[0] = sveHelp::sveZip_preds(operands, VL_bits, false); + results_[0] = sveZip_preds(sourceValues_, VL_bits, false); break; } case Opcode::AArch64_ZIP1_PPP_S: { // zip1 pd.s, pn.s, pm.s - results[0] = sveHelp::sveZip_preds(operands, VL_bits, false); + results_[0] = sveZip_preds(sourceValues_, VL_bits, false); break; } case Opcode::AArch64_ZIP1_ZZZ_D: { // zip1 zd.d, zn.d, zm.d - results[0] = sveHelp::sveZip_vecs(operands, VL_bits, false); + results_[0] = sveZip_vecs(sourceValues_, VL_bits, false); break; } case Opcode::AArch64_ZIP1_ZZZ_S: { // zip1 zd.s, zn.s, zm.s - results[0] = sveHelp::sveZip_vecs(operands, VL_bits, false); + results_[0] = sveZip_vecs(sourceValues_, VL_bits, false); + break; + } + case Opcode::AArch64_ZIP1v16i8: { // zip1 vd.16b, vn.16b, vm.16b + results_[0] = vecZip(sourceValues_, false); + break; + } + case Opcode::AArch64_ZIP1v2i32: { // zip1 vd.2s, vn.2s, vm.2s + results_[0] = vecZip(sourceValues_, false); + break; + } + case Opcode::AArch64_ZIP1v2i64: { // zip1 vd.2d, vn.2d, vm.2d + results_[0] = vecZip(sourceValues_, false); + break; + } + case Opcode::AArch64_ZIP1v4i16: { // zip1 vd.4h, vn.4h, vm.4h + results_[0] = vecZip(sourceValues_, false); + break; + } + case Opcode::AArch64_ZIP1v4i32: { // zip1 vd.4s, vn.4s, vm.4s + results_[0] = vecZip(sourceValues_, false); + break; + } + case 
Opcode::AArch64_ZIP1v8i16: { // zip1 vd.8h, vn.8h, vm.8h + results_[0] = vecZip(sourceValues_, false); + break; + } + case Opcode::AArch64_ZIP1v8i8: { // zip1 vd.8b, vn.8b, vm.8b + results_[0] = vecZip(sourceValues_, false); break; } case Opcode::AArch64_ZIP2_PPP_B: { // zip2 pd.b, pn.b, pm.b - results[0] = sveHelp::sveZip_preds(operands, VL_bits, true); + results_[0] = sveZip_preds(sourceValues_, VL_bits, true); break; } case Opcode::AArch64_ZIP2_PPP_D: { // zip2 pd.d, pn.d, pm.d - results[0] = sveHelp::sveZip_preds(operands, VL_bits, true); + results_[0] = sveZip_preds(sourceValues_, VL_bits, true); break; } case Opcode::AArch64_ZIP2_PPP_H: { // zip2 pd.h, pn.h, pm.h - results[0] = sveHelp::sveZip_preds(operands, VL_bits, true); + results_[0] = sveZip_preds(sourceValues_, VL_bits, true); break; } case Opcode::AArch64_ZIP2_PPP_S: { // zip2 pd.s, pn.s, pm.s - results[0] = sveHelp::sveZip_preds(operands, VL_bits, true); + results_[0] = sveZip_preds(sourceValues_, VL_bits, true); break; } case Opcode::AArch64_ZIP2_ZZZ_D: { // zip2 zd.d, zn.d, zm.d - results[0] = sveHelp::sveZip_vecs(operands, VL_bits, true); + results_[0] = sveZip_vecs(sourceValues_, VL_bits, true); break; } case Opcode::AArch64_ZIP2_ZZZ_S: { // zip2 zd.s, zn.s, zm.s - results[0] = sveHelp::sveZip_vecs(operands, VL_bits, true); + results_[0] = sveZip_vecs(sourceValues_, VL_bits, true); + break; + } + case Opcode::AArch64_ZIP2v16i8: { // zip2 vd.16b, vn.16b, vm.16b + results_[0] = vecZip(sourceValues_, true); + break; + } + case Opcode::AArch64_ZIP2v2i32: { // zip2 vd.2s, vn.2s, vm.2s + results_[0] = vecZip(sourceValues_, true); + break; + } + case Opcode::AArch64_ZIP2v2i64: { // zip2 vd.2d, vn.2d, vm.2d + results_[0] = vecZip(sourceValues_, true); + break; + } + case Opcode::AArch64_ZIP2v4i16: { // zip2 vd.4h, vn.4h, vm.4h + results_[0] = vecZip(sourceValues_, true); + break; + } + case Opcode::AArch64_ZIP2v4i32: { // zip2 vd.4s, vn.4s, vm.4s + results_[0] = vecZip(sourceValues_, true); + break; + } + case Opcode::AArch64_ZIP2v8i16: { // zip2 vd.8h, vn.8h, vm.8h + results_[0] = vecZip(sourceValues_, true); + break; + } + case Opcode::AArch64_ZIP2v8i8: { // zip2 vd.8b, vn.8b, vm.8b + results_[0] = vecZip(sourceValues_, true); break; } case Opcode::AArch64_ZERO_M: { // zero {mask} // SME - if (!ZAenabled) { - // Not in right context mode. Raise exception - return ZAdisabled(); - } - for (int i = 0; i < destinationRegisterCount; i++) { - results[i] = RegisterValue(0, 256); + // Not in right context mode. Raise exception + if (!ZAenabled) return ZAdisabled(); + + for (int i = 0; i < destinationRegisterCount_; i++) { + results_[i] = RegisterValue(0, 256); } break; } @@ -5351,11 +7762,12 @@ void Instruction::execute() { // Check if upper bits of vector registers are zeroed because Z // configuration extend to 256 bytes whilst V configurations only extend // to 16 bytes. Thus upper 240 bytes must be ignored by being set to 0. 
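The loop immediately below performs that check. As a standalone illustration of the invariant being asserted, the sketch below uses plain `std::array` buffers in place of SimEng's `RegisterValue`, so the names and sizes are illustrative only: a 16-byte NEON result has to land in a 256-byte register file entry with the upper 240 bytes zeroed.

```cpp
#include <algorithm>
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

// Illustrative model: vector register file entries are 256 bytes wide so that
// SVE Z registers fit; a NEON V result only populates the low 16 bytes.
constexpr std::size_t kZRegBytes = 256;
constexpr std::size_t kVRegBytes = 16;

std::array<uint8_t, kZRegBytes> writeNeonResult(
    const std::array<uint8_t, kVRegBytes>& neonResult) {
  std::array<uint8_t, kZRegBytes> entry{};  // zero-initialised: upper 240 bytes stay 0
  std::copy(neonResult.begin(), neonResult.end(), entry.begin());
  return entry;
}

int main() {
  std::array<uint8_t, kVRegBytes> v{};
  v.fill(0xFF);
  auto entry = writeNeonResult(v);
  // Mirrors the size / zero-extension check performed after execute()
  assert(entry.size() == kZRegBytes);
  for (std::size_t i = kVRegBytes; i < kZRegBytes; i++) assert(entry[i] == 0);
}
```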
- for (int i = 0; i < destinationRegisterCount; i++) { - if ((destinationRegisters[i].type == RegisterType::VECTOR) && !isSVEData_) { - if (results[i].size() != 256) - std::cerr << "[SimEng:Instruction_execute] " << metadata.mnemonic - << " opcode: " << metadata.opcode + for (int i = 0; i < destinationRegisterCount_; i++) { + if ((destinationRegisters_[i].type == RegisterType::VECTOR) && + !isInstruction(InsnType::isSVEData)) { + if (results_[i].size() != 256) + std::cerr << "[SimEng:Instruction_execute] " << metadata_.mnemonic + << " opcode: " << metadata_.opcode << " has not been zero extended correctly\n"; } } diff --git a/src/lib/arch/aarch64/MicroDecoder.cc b/src/lib/arch/aarch64/MicroDecoder.cc index b2976c5edf..ea181c8d66 100644 --- a/src/lib/arch/aarch64/MicroDecoder.cc +++ b/src/lib/arch/aarch64/MicroDecoder.cc @@ -7,63 +7,62 @@ namespace arch { namespace aarch64 { std::unordered_map> - MicroDecoder::microDecodeCache; -std::forward_list MicroDecoder::microMetadataCache; + MicroDecoder::microDecodeCache_; +std::forward_list MicroDecoder::microMetadataCache_; -MicroDecoder::MicroDecoder(YAML::Node config) +MicroDecoder::MicroDecoder(ryml::ConstNodeRef config) : instructionSplit_(config["Core"]["Micro-Operations"].as()) {} MicroDecoder::~MicroDecoder() { - microDecodeCache.clear(); - microMetadataCache.clear(); + microDecodeCache_.clear(); + microMetadataCache_.clear(); } -bool MicroDecoder::detectOverlap(arm64_reg registerA, arm64_reg registerB) { +bool MicroDecoder::detectOverlap(aarch64_reg registerA, aarch64_reg registerB) { // Early checks on equivalent register ISA names if (registerA == registerB) return true; - if ((registerA == ARM64_REG_WZR || registerA == ARM64_REG_XZR) && - (registerB == ARM64_REG_WZR || registerB == ARM64_REG_XZR)) + if ((registerA == AARCH64_REG_WZR || registerA == AARCH64_REG_XZR) && + (registerB == AARCH64_REG_WZR || registerB == AARCH64_REG_XZR)) return true; - if ((registerA == ARM64_REG_WSP || registerA == ARM64_REG_SP) && - (registerB == ARM64_REG_WSP || registerB == ARM64_REG_SP)) + if ((registerA == AARCH64_REG_WSP || registerA == AARCH64_REG_SP) && + (registerB == AARCH64_REG_WSP || registerB == AARCH64_REG_SP)) return true; // Arrays to hold register identifiers - std::array registers = {registerA, registerB}; + std::array registers = {registerA, registerB}; std::array isGP = {false, false}; std::array indexes = {0, 0}; // Get index of each register and whether they are general purpose for (int i = 0; i < 2; i++) { - if (registers[i] == ARM64_REG_FP) { + if (registers[i] == AARCH64_REG_FP) { isGP[i] = true; indexes[i] = 29; - } else if (registers[i] == ARM64_REG_LR) { + } else if (registers[i] == AARCH64_REG_LR) { isGP[i] = true; indexes[i] = 30; } else { - arm64_reg base = (arm64_reg)0; - if (registers[i] >= ARM64_REG_V0) { - base = ARM64_REG_V0; - } else if (registers[i] >= ARM64_REG_Z0) { - base = ARM64_REG_Z0; - } else if (registers[i] >= ARM64_REG_X0) { - base = ARM64_REG_X0; + aarch64_reg base = (aarch64_reg)0; + // No need to check V registers as they are encoded as Q or D registers + if (registers[i] >= AARCH64_REG_Z0) { + base = AARCH64_REG_Z0; + } else if (registers[i] >= AARCH64_REG_X0) { + base = AARCH64_REG_X0; isGP[i] = true; - } else if (registers[i] >= ARM64_REG_W0) { - base = ARM64_REG_W0; + } else if (registers[i] >= AARCH64_REG_W0) { + base = AARCH64_REG_W0; isGP[i] = true; - } else if (registers[i] >= ARM64_REG_S0) { - base = ARM64_REG_S0; - } else if (registers[i] >= ARM64_REG_Q0) { - base = ARM64_REG_Q0; - } else if 
(registers[i] >= ARM64_REG_P0) { - base = ARM64_REG_P0; - } else if (registers[i] >= ARM64_REG_H0) { - base = ARM64_REG_H0; - } else if (registers[i] >= ARM64_REG_D0) { - base = ARM64_REG_D0; - } else if (registers[i] >= ARM64_REG_B0) { - base = ARM64_REG_B0; + } else if (registers[i] >= AARCH64_REG_S0) { + base = AARCH64_REG_S0; + } else if (registers[i] >= AARCH64_REG_Q0) { + base = AARCH64_REG_Q0; + } else if (registers[i] >= AARCH64_REG_P0) { + base = AARCH64_REG_P0; + } else if (registers[i] >= AARCH64_REG_H0) { + base = AARCH64_REG_H0; + } else if (registers[i] >= AARCH64_REG_D0) { + base = AARCH64_REG_D0; + } else if (registers[i] >= AARCH64_REG_B0) { + base = AARCH64_REG_B0; } indexes[i] = registers[i] - base; } @@ -87,12 +86,210 @@ uint8_t MicroDecoder::decode(const Architecture& architecture, uint32_t word, output[0] = std::make_shared(macroOp); } else { // Try and find instruction splitting entry in cache - auto iter = microDecodeCache.find(word); - if (iter == microDecodeCache.end()) { + auto iter = microDecodeCache_.find(word); + if (iter == microDecodeCache_.end()) { // Get macro-operation metadata to create micro-operation metadata from InstructionMetadata metadata = macroOp.getMetadata(); std::vector cacheVector; switch (metadata.opcode) { + case Opcode::AArch64_LD1Fourv16b: + case Opcode::AArch64_LD1Fourv1d: + case Opcode::AArch64_LD1Fourv2d: + case Opcode::AArch64_LD1Fourv2s: + case Opcode::AArch64_LD1Fourv4h: + case Opcode::AArch64_LD1Fourv4s: + case Opcode::AArch64_LD1Fourv8b: + case Opcode::AArch64_LD1Fourv8h: { + uint8_t dataSize = getDataSize(metadata.operands[0]); + // ldr uop 0 + cacheVector.push_back(createLdrUop( + architecture, metadata.operands[0].reg, + {metadata.operands[4].mem.base, AARCH64_REG_INVALID, 0}, + capstoneHandle, false, 1, dataSize)); + // ldr uop 1 + cacheVector.push_back(createLdrUop( + architecture, metadata.operands[1].reg, + {metadata.operands[4].mem.base, AARCH64_REG_INVALID, dataSize}, + capstoneHandle, true, 2, dataSize)); + // ldr uop 2 + cacheVector.push_back( + createLdrUop(architecture, metadata.operands[2].reg, + {metadata.operands[4].mem.base, AARCH64_REG_INVALID, + 2 * dataSize}, + capstoneHandle, true, 2, dataSize)); + // ldr uop 3 + cacheVector.push_back( + createLdrUop(architecture, metadata.operands[3].reg, + {metadata.operands[4].mem.base, AARCH64_REG_INVALID, + 3 * dataSize}, + capstoneHandle, true, 2, dataSize)); + + iter = microDecodeCache_.try_emplace(word, cacheVector).first; + break; + } + case Opcode::AArch64_LD1Fourv16b_POST: + case Opcode::AArch64_LD1Fourv2d_POST: + case Opcode::AArch64_LD1Fourv4s_POST: + case Opcode::AArch64_LD1Fourv8h_POST: { + uint8_t dataSize = getDataSize(metadata.operands[0]); + // ldr uop 0 + cacheVector.push_back(createLdrUop( + architecture, metadata.operands[0].reg, + {metadata.operands[4].mem.base, AARCH64_REG_INVALID, 0}, + capstoneHandle, false, 1, dataSize)); + // ldr uop 1 + cacheVector.push_back(createLdrUop( + architecture, metadata.operands[1].reg, + {metadata.operands[4].mem.base, AARCH64_REG_INVALID, dataSize}, + capstoneHandle, true, 2, dataSize)); + // ldr uop 2 + cacheVector.push_back( + createLdrUop(architecture, metadata.operands[2].reg, + {metadata.operands[4].mem.base, AARCH64_REG_INVALID, + 2 * dataSize}, + capstoneHandle, true, 2, dataSize)); + // ldr uop 3 + cacheVector.push_back( + createLdrUop(architecture, metadata.operands[3].reg, + {metadata.operands[4].mem.base, AARCH64_REG_INVALID, + 3 * dataSize}, + capstoneHandle, true, 2, dataSize)); + // offset generation uop + 
if (metadata.operands[5].type == AARCH64_OP_REG) { + cacheVector.push_back(createRegOffsetUop( + architecture, metadata.operands[4].mem.base, + metadata.operands[5].reg, capstoneHandle, true)); + } else { + cacheVector.push_back( + createImmOffsetUop(architecture, metadata.operands[4].mem.base, + 64, capstoneHandle, true)); + } + + iter = microDecodeCache_.try_emplace(word, cacheVector).first; + break; + } + case Opcode::AArch64_LD1Fourv1d_POST: + case Opcode::AArch64_LD1Fourv2s_POST: + case Opcode::AArch64_LD1Fourv8b_POST: + case Opcode::AArch64_LD1Fourv4h_POST: { + uint8_t dataSize = getDataSize(metadata.operands[0]); + // ldr uop 0 + cacheVector.push_back(createLdrUop( + architecture, metadata.operands[0].reg, + {metadata.operands[4].mem.base, AARCH64_REG_INVALID, 0}, + capstoneHandle, false, 1, dataSize)); + // ldr uop 1 + cacheVector.push_back(createLdrUop( + architecture, metadata.operands[1].reg, + {metadata.operands[4].mem.base, AARCH64_REG_INVALID, dataSize}, + capstoneHandle, true, 2, dataSize)); + // ldr uop 2 + cacheVector.push_back( + createLdrUop(architecture, metadata.operands[2].reg, + {metadata.operands[4].mem.base, AARCH64_REG_INVALID, + 2 * dataSize}, + capstoneHandle, true, 2, dataSize)); + // ldr uop 3 + cacheVector.push_back( + createLdrUop(architecture, metadata.operands[3].reg, + {metadata.operands[4].mem.base, AARCH64_REG_INVALID, + 3 * dataSize}, + capstoneHandle, true, 2, dataSize)); + // offset generation uop + if (metadata.operands[5].type == AARCH64_OP_REG) { + cacheVector.push_back(createRegOffsetUop( + architecture, metadata.operands[4].mem.base, + metadata.operands[5].reg, capstoneHandle, true)); + } else { + cacheVector.push_back( + createImmOffsetUop(architecture, metadata.operands[4].mem.base, + 32, capstoneHandle, true)); + } + + iter = microDecodeCache_.try_emplace(word, cacheVector).first; + break; + } + case Opcode::AArch64_LD1Twov16b: + case Opcode::AArch64_LD1Twov1d: + case Opcode::AArch64_LD1Twov2d: + case Opcode::AArch64_LD1Twov2s: + case Opcode::AArch64_LD1Twov4h: + case Opcode::AArch64_LD1Twov4s: + case Opcode::AArch64_LD1Twov8b: + case Opcode::AArch64_LD1Twov8h: { + uint8_t dataSize = getDataSize(metadata.operands[0]); + // ldr uop 0 + cacheVector.push_back(createLdrUop( + architecture, metadata.operands[0].reg, + {metadata.operands[2].mem.base, AARCH64_REG_INVALID, 0}, + capstoneHandle, false, 1, dataSize)); + // ldr uop 1 + cacheVector.push_back(createLdrUop( + architecture, metadata.operands[1].reg, + {metadata.operands[2].mem.base, AARCH64_REG_INVALID, dataSize}, + capstoneHandle, true, 2, dataSize)); + + iter = microDecodeCache_.try_emplace(word, cacheVector).first; + break; + } + case Opcode::AArch64_LD1Twov16b_POST: + case Opcode::AArch64_LD1Twov2d_POST: + case Opcode::AArch64_LD1Twov4s_POST: + case Opcode::AArch64_LD1Twov8h_POST: { + uint8_t dataSize = getDataSize(metadata.operands[0]); + // ldr uop 0 + cacheVector.push_back(createLdrUop( + architecture, metadata.operands[0].reg, + {metadata.operands[2].mem.base, AARCH64_REG_INVALID, 0}, + capstoneHandle, false, 1, dataSize)); + // ldr uop 1 + cacheVector.push_back(createLdrUop( + architecture, metadata.operands[1].reg, + {metadata.operands[2].mem.base, AARCH64_REG_INVALID, dataSize}, + capstoneHandle, true, 2, dataSize)); + // offset generation uop + if (metadata.operands[3].type == AARCH64_OP_REG) { + cacheVector.push_back(createRegOffsetUop( + architecture, metadata.operands[2].mem.base, + metadata.operands[3].reg, capstoneHandle, true)); + } else { + cacheVector.push_back( + 
createImmOffsetUop(architecture, metadata.operands[2].mem.base, + 32, capstoneHandle, true)); + } + + iter = microDecodeCache_.try_emplace(word, cacheVector).first; + break; + } + case Opcode::AArch64_LD1Twov1d_POST: + case Opcode::AArch64_LD1Twov2s_POST: + case Opcode::AArch64_LD1Twov4h_POST: + case Opcode::AArch64_LD1Twov8b_POST: { + uint8_t dataSize = getDataSize(metadata.operands[0]); + // ldr uop 0 + cacheVector.push_back(createLdrUop( + architecture, metadata.operands[0].reg, + {metadata.operands[2].mem.base, AARCH64_REG_INVALID, 0}, + capstoneHandle, false, 1, dataSize)); + // ldr uop 1 + cacheVector.push_back(createLdrUop( + architecture, metadata.operands[1].reg, + {metadata.operands[2].mem.base, AARCH64_REG_INVALID, dataSize}, + capstoneHandle, true, 2, dataSize)); + // offset generation uop + if (metadata.operands[3].type == AARCH64_OP_REG) { + cacheVector.push_back(createRegOffsetUop( + architecture, metadata.operands[2].mem.base, + metadata.operands[3].reg, capstoneHandle, true)); + } else { + cacheVector.push_back( + createImmOffsetUop(architecture, metadata.operands[2].mem.base, + 16, capstoneHandle, true)); + } + + iter = microDecodeCache_.try_emplace(word, cacheVector).first; + break; + } case Opcode::AArch64_LDPDi: case Opcode::AArch64_LDPQi: case Opcode::AArch64_LDPSi: @@ -113,17 +310,17 @@ uint8_t MicroDecoder::decode(const Architecture& architecture, uint32_t word, // ldr uop 0 cacheVector.push_back(createLdrUop( architecture, metadata.operands[orderA].reg, - {metadata.operands[2].mem.base, ARM64_REG_INVALID, + {metadata.operands[2].mem.base, AARCH64_REG_INVALID, metadata.operands[2].mem.disp + (orderA * dataSize)}, capstoneHandle, false, 1, dataSize)); // ldr uop 1 cacheVector.push_back(createLdrUop( architecture, metadata.operands[orderB].reg, - {metadata.operands[2].mem.base, ARM64_REG_INVALID, + {metadata.operands[2].mem.base, AARCH64_REG_INVALID, metadata.operands[2].mem.disp + (orderB * dataSize)}, capstoneHandle, true, 2, dataSize)); - iter = microDecodeCache.try_emplace(word, cacheVector).first; + iter = microDecodeCache_.try_emplace(word, cacheVector).first; break; } case Opcode::AArch64_LDPDpost: @@ -137,19 +334,19 @@ uint8_t MicroDecoder::decode(const Architecture& architecture, uint32_t word, // ldr uop 0 cacheVector.push_back(createLdrUop( architecture, metadata.operands[0].reg, - {metadata.operands[2].mem.base, ARM64_REG_INVALID, 0}, + {metadata.operands[2].mem.base, AARCH64_REG_INVALID, 0}, capstoneHandle, false, 1, dataSize)); // ldr uop 1 cacheVector.push_back(createLdrUop( architecture, metadata.operands[1].reg, - {metadata.operands[2].mem.base, ARM64_REG_INVALID, dataSize}, + {metadata.operands[2].mem.base, AARCH64_REG_INVALID, dataSize}, capstoneHandle, false, 2, dataSize)); // offset generation uop cacheVector.push_back(createImmOffsetUop( architecture, metadata.operands[2].mem.base, metadata.operands[3].imm, capstoneHandle, true)); - iter = microDecodeCache.try_emplace(word, cacheVector).first; + iter = microDecodeCache_.try_emplace(word, cacheVector).first; break; } case Opcode::AArch64_LDPDpre: @@ -166,15 +363,15 @@ uint8_t MicroDecoder::decode(const Architecture& architecture, uint32_t word, // ldr uop 0 cacheVector.push_back(createLdrUop( architecture, metadata.operands[0].reg, - {metadata.operands[2].mem.base, ARM64_REG_INVALID, 0}, + {metadata.operands[2].mem.base, AARCH64_REG_INVALID, 0}, capstoneHandle, false, 1, dataSize)); // ldr uop 1 cacheVector.push_back(createLdrUop( architecture, metadata.operands[1].reg, - 
{metadata.operands[2].mem.base, ARM64_REG_INVALID, dataSize}, + {metadata.operands[2].mem.base, AARCH64_REG_INVALID, dataSize}, capstoneHandle, true, 2, dataSize)); - iter = microDecodeCache.try_emplace(word, cacheVector).first; + iter = microDecodeCache_.try_emplace(word, cacheVector).first; break; } case Opcode::AArch64_LDRBpost: @@ -190,14 +387,14 @@ uint8_t MicroDecoder::decode(const Architecture& architecture, uint32_t word, // ldr uop cacheVector.push_back(createLdrUop( architecture, metadata.operands[0].reg, - {metadata.operands[1].mem.base, ARM64_REG_INVALID, 0}, + {metadata.operands[1].mem.base, AARCH64_REG_INVALID, 0}, capstoneHandle, false, 1, dataSize)); // offset generation uop cacheVector.push_back(createImmOffsetUop( architecture, metadata.operands[1].mem.base, metadata.operands[2].imm, capstoneHandle, true)); - iter = microDecodeCache.try_emplace(word, cacheVector).first; + iter = microDecodeCache_.try_emplace(word, cacheVector).first; break; } case Opcode::AArch64_LDRBpre: @@ -217,10 +414,10 @@ uint8_t MicroDecoder::decode(const Architecture& architecture, uint32_t word, // ldr uop cacheVector.push_back(createLdrUop( architecture, metadata.operands[0].reg, - {metadata.operands[1].mem.base, ARM64_REG_INVALID, 0}, + {metadata.operands[1].mem.base, AARCH64_REG_INVALID, 0}, capstoneHandle, true, 1, dataSize)); - iter = microDecodeCache.try_emplace(word, cacheVector).first; + iter = microDecodeCache_.try_emplace(word, cacheVector).first; break; } case Opcode::AArch64_STPDi: @@ -236,7 +433,7 @@ uint8_t MicroDecoder::decode(const Architecture& architecture, uint32_t word, // store0 address uop cacheVector.push_back( createStrUop(architecture, - {metadata.operands[2].mem.base, ARM64_REG_INVALID, + {metadata.operands[2].mem.base, AARCH64_REG_INVALID, metadata.operands[2].mem.disp}, capstoneHandle, false, 1, dataSize)); // store0 data uop @@ -246,14 +443,14 @@ uint8_t MicroDecoder::decode(const Architecture& architecture, uint32_t word, // store1 address uop cacheVector.push_back( createStrUop(architecture, - {metadata.operands[2].mem.base, ARM64_REG_INVALID, + {metadata.operands[2].mem.base, AARCH64_REG_INVALID, metadata.operands[2].mem.disp + dataSize}, capstoneHandle, false, 2, dataSize)); // store1 data uop cacheVector.push_back(createSDUop( architecture, metadata.operands[1].reg, capstoneHandle, true, 2)); - iter = microDecodeCache.try_emplace(word, cacheVector).first; + iter = microDecodeCache_.try_emplace(word, cacheVector).first; break; } case Opcode::AArch64_STPDpost: @@ -269,7 +466,7 @@ uint8_t MicroDecoder::decode(const Architecture& architecture, uint32_t word, // store0 address uop cacheVector.push_back(createStrUop( architecture, - {metadata.operands[2].mem.base, ARM64_REG_INVALID, 0}, + {metadata.operands[2].mem.base, AARCH64_REG_INVALID, 0}, capstoneHandle, false, 1, dataSize)); // store0 data uop cacheVector.push_back(createSDUop(architecture, @@ -278,7 +475,7 @@ uint8_t MicroDecoder::decode(const Architecture& architecture, uint32_t word, // store1 address uop cacheVector.push_back(createStrUop( architecture, - {metadata.operands[2].mem.base, ARM64_REG_INVALID, dataSize}, + {metadata.operands[2].mem.base, AARCH64_REG_INVALID, dataSize}, capstoneHandle, false, 2, dataSize)); // store1 data uop cacheVector.push_back(createSDUop(architecture, @@ -289,7 +486,7 @@ uint8_t MicroDecoder::decode(const Architecture& architecture, uint32_t word, architecture, metadata.operands[2].mem.base, metadata.operands[3].imm, capstoneHandle, true)); - iter = 
microDecodeCache.try_emplace(word, cacheVector).first; + iter = microDecodeCache_.try_emplace(word, cacheVector).first; break; } case Opcode::AArch64_STPDpre: @@ -309,7 +506,7 @@ uint8_t MicroDecoder::decode(const Architecture& architecture, uint32_t word, // store0 address uop cacheVector.push_back(createStrUop( architecture, - {metadata.operands[2].mem.base, ARM64_REG_INVALID, 0}, + {metadata.operands[2].mem.base, AARCH64_REG_INVALID, 0}, capstoneHandle, false, 1, dataSize)); // store0 data uop cacheVector.push_back(createSDUop(architecture, @@ -318,13 +515,13 @@ uint8_t MicroDecoder::decode(const Architecture& architecture, uint32_t word, // store1 address uop cacheVector.push_back(createStrUop( architecture, - {metadata.operands[2].mem.base, ARM64_REG_INVALID, dataSize}, + {metadata.operands[2].mem.base, AARCH64_REG_INVALID, dataSize}, capstoneHandle, false, 2, dataSize)); // store1 data uop cacheVector.push_back(createSDUop( architecture, metadata.operands[1].reg, capstoneHandle, true, 2)); - iter = microDecodeCache.try_emplace(word, cacheVector).first; + iter = microDecodeCache_.try_emplace(word, cacheVector).first; break; } case Opcode::AArch64_STRBpost: @@ -342,7 +539,7 @@ uint8_t MicroDecoder::decode(const Architecture& architecture, uint32_t word, // store address uop cacheVector.push_back(createStrUop( architecture, - {metadata.operands[1].mem.base, ARM64_REG_INVALID, 0}, + {metadata.operands[1].mem.base, AARCH64_REG_INVALID, 0}, capstoneHandle, false, 1, dataSize)); // store data uop cacheVector.push_back(createSDUop(architecture, @@ -353,7 +550,7 @@ uint8_t MicroDecoder::decode(const Architecture& architecture, uint32_t word, architecture, metadata.operands[1].mem.base, metadata.operands[2].imm, capstoneHandle, true)); - iter = microDecodeCache.try_emplace(word, cacheVector).first; + iter = microDecodeCache_.try_emplace(word, cacheVector).first; break; } case Opcode::AArch64_STRBpre: @@ -375,13 +572,13 @@ uint8_t MicroDecoder::decode(const Architecture& architecture, uint32_t word, // store address uop cacheVector.push_back(createStrUop( architecture, - {metadata.operands[1].mem.base, ARM64_REG_INVALID, 0}, + {metadata.operands[1].mem.base, AARCH64_REG_INVALID, 0}, capstoneHandle, false, 1, dataSize)); // store data uop cacheVector.push_back(createSDUop( architecture, metadata.operands[0].reg, capstoneHandle, true, 1)); - iter = microDecodeCache.try_emplace(word, cacheVector).first; + iter = microDecodeCache_.try_emplace(word, cacheVector).first; break; } case Opcode::AArch64_STRBui: @@ -399,14 +596,14 @@ uint8_t MicroDecoder::decode(const Architecture& architecture, uint32_t word, // store address uop cacheVector.push_back( createStrUop(architecture, - {metadata.operands[1].mem.base, ARM64_REG_INVALID, + {metadata.operands[1].mem.base, AARCH64_REG_INVALID, metadata.operands[1].mem.disp}, capstoneHandle, false, 1, dataSize)); // store data uop cacheVector.push_back(createSDUop( architecture, metadata.operands[0].reg, capstoneHandle, true, 1)); - iter = microDecodeCache.try_emplace(word, cacheVector).first; + iter = microDecodeCache_.try_emplace(word, cacheVector).first; break; } default: { @@ -430,92 +627,187 @@ uint8_t MicroDecoder::decode(const Architecture& architecture, uint32_t word, } cs_detail MicroDecoder::createDefaultDetail(std::vector opTypes) { - cs_arm64 info = default_info; + cs_aarch64 info = default_info; cs_detail detail = default_detail; info.op_count = opTypes.size(); - for (int op = 0; op < opTypes.size(); op++) { + for (size_t op = 0; op < opTypes.size(); 
op++) { info.operands[op] = default_op; switch (opTypes[op].type) { - case arm64_op_type::ARM64_OP_REG: { - info.operands[op].type = ARM64_OP_REG; - info.operands[op].reg = ARM64_REG_INVALID; + case aarch64_op_type::AARCH64_OP_REG: { + info.operands[op].type = AARCH64_OP_REG; + info.operands[op].reg = AARCH64_REG_INVALID; if (opTypes[op].isDestination) { info.operands[op].access = CS_AC_WRITE; } break; } - case arm64_op_type::ARM64_OP_IMM: { - info.operands[op].type = ARM64_OP_IMM; + case aarch64_op_type::AARCH64_OP_IMM: { + info.operands[op].type = AARCH64_OP_IMM; info.operands[op].imm = 0; break; } - case arm64_op_type::ARM64_OP_MEM: { - info.operands[op].type = ARM64_OP_MEM; - info.operands[op].mem = {ARM64_REG_INVALID, ARM64_REG_INVALID, 0}; + case aarch64_op_type::AARCH64_OP_MEM: { + info.operands[op].type = AARCH64_OP_MEM; + info.operands[op].mem = {AARCH64_REG_INVALID, AARCH64_REG_INVALID, 0}; break; } - case arm64_op_type::ARM64_OP_INVALID: - case arm64_op_type::ARM64_OP_FP: - case arm64_op_type::ARM64_OP_CIMM: - case arm64_op_type::ARM64_OP_REG_MRS: - case arm64_op_type::ARM64_OP_REG_MSR: - case arm64_op_type::ARM64_OP_PSTATE: - case arm64_op_type::ARM64_OP_SYS: - case arm64_op_type::ARM64_OP_SVCR: - case arm64_op_type::ARM64_OP_PREFETCH: - case arm64_op_type::ARM64_OP_BARRIER: - case arm64_op_type::ARM64_OP_SME_INDEX: + case aarch64_op_type::AARCH64_OP_INVALID: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_MEM_REG: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_MEM_IMM: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_FP: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_CIMM: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_REG_MRS: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_REG_MSR: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_IMPLICIT_IMM_0: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_SVCR: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_AT: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_DB: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_DC: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_ISB: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_TSB: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_PRFM: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_SVEPRFM: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_RPRFM: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_PSTATEIMM0_15: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_PSTATEIMM0_1: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_PSB: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_BTI: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_SVEPREDPAT: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_SVEVECLENSPECIFIER: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_SME: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_IMM_RANGE: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_TLBI: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_IC: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_DBNXS: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_EXACTFPIMM: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_SYSREG: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_SYSIMM: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_SYSALIAS: + [[fallthrough]]; + case aarch64_op_type::AARCH64_OP_PRED: break; } } - detail.arm64 = info; + detail.aarch64 = info; return detail; } Instruction MicroDecoder::createImmOffsetUop(const Architecture& architecture, - arm64_reg base, 
int64_t offset, + aarch64_reg base, int64_t offset, csh capstoneHandle, bool lastMicroOp, int microOpIndex) { - cs_detail off_imm_detail = - createDefaultDetail({{ARM64_OP_REG, 1}, {ARM64_OP_REG}, {ARM64_OP_IMM}}); - off_imm_detail.arm64.operands[0].reg = base; - off_imm_detail.arm64.operands[1].reg = base; - off_imm_detail.arm64.operands[2].imm = offset; + cs_detail off_imm_detail = createDefaultDetail( + {{AARCH64_OP_REG, 1}, {AARCH64_OP_REG}, {AARCH64_OP_IMM}}); + off_imm_detail.aarch64.operands[0].reg = base; + off_imm_detail.aarch64.operands[1].reg = base; + off_imm_detail.aarch64.operands[2].imm = offset; - cs_insn off_imm_cs = {arm64_insn::ARM64_INS_ADD, + cs_insn off_imm_cs = {aarch64_insn::AARCH64_INS_ADD, + aarch64_insn::AARCH64_INS_INVALID, 0x0, 4, "", "micro_offset_imm", "", + false, + false, &off_imm_detail, MicroOpcode::OFFSET_IMM}; InstructionMetadata off_imm_metadata(off_imm_cs); - microMetadataCache.emplace_front(off_imm_metadata); - Instruction off_imm(architecture, microMetadataCache.front(), + microMetadataCache_.emplace_front(off_imm_metadata); + Instruction off_imm(architecture, microMetadataCache_.front(), MicroOpInfo({true, MicroOpcode::OFFSET_IMM, 0, lastMicroOp, microOpIndex})); off_imm.setExecutionInfo(architecture.getExecutionInfo(off_imm)); return off_imm; } +Instruction MicroDecoder::createRegOffsetUop( + const Architecture& architecture, aarch64_reg base, aarch64_reg offset, + csh capstoneHandle, bool lastMicroOp, int microOpIndex) { + cs_detail off_reg_detail = createDefaultDetail( + {{AARCH64_OP_REG, 1}, {AARCH64_OP_REG}, {AARCH64_OP_REG}}); + off_reg_detail.aarch64.operands[0].reg = base; + off_reg_detail.aarch64.operands[1].reg = base; + off_reg_detail.aarch64.operands[2].reg = offset; + + cs_insn off_reg_cs = {aarch64_insn::AARCH64_INS_ADD, + aarch64_insn::AARCH64_INS_INVALID, + 0x0, + 4, + "", + "micro_offset_reg", + "", + false, + false, + &off_reg_detail, + MicroOpcode::OFFSET_REG}; + + InstructionMetadata off_reg_metadata(off_reg_cs); + microMetadataCache_.emplace_front(off_reg_metadata); + Instruction off_reg(architecture, microMetadataCache_.front(), + MicroOpInfo({true, MicroOpcode::OFFSET_REG, 0, + lastMicroOp, microOpIndex})); + off_reg.setExecutionInfo(architecture.getExecutionInfo(off_reg)); + return off_reg; +} + Instruction MicroDecoder::createLdrUop(const Architecture& architecture, - arm64_reg dest, arm64_op_mem mem, + aarch64_reg dest, aarch64_op_mem mem, csh capstoneHandle, bool lastMicroOp, int microOpIndex, uint8_t dataSize) { cs_detail ldr_detail = - createDefaultDetail({{ARM64_OP_REG, 1}, {ARM64_OP_MEM}}); - ldr_detail.arm64.operands[0].reg = dest; - ldr_detail.arm64.operands[1].mem = mem; - cs_insn ldr_cs = { - arm64_insn::ARM64_INS_LDR, 0x0, 4, "", "micro_ldr", "", &ldr_detail, - MicroOpcode::LDR_ADDR}; + createDefaultDetail({{AARCH64_OP_REG, 1}, {AARCH64_OP_MEM}}); + ldr_detail.aarch64.operands[0].reg = dest; + ldr_detail.aarch64.operands[1].mem = mem; + cs_insn ldr_cs = {aarch64_insn::AARCH64_INS_LDR, + aarch64_insn::AARCH64_INS_INVALID, + 0x0, + 4, + "", + "micro_ldr", + "", + false, + false, + &ldr_detail, + MicroOpcode::LDR_ADDR}; InstructionMetadata ldr_metadata(ldr_cs); - microMetadataCache.emplace_front(ldr_metadata); - Instruction ldr(architecture, microMetadataCache.front(), + microMetadataCache_.emplace_front(ldr_metadata); + Instruction ldr(architecture, microMetadataCache_.front(), MicroOpInfo({true, MicroOpcode::LDR_ADDR, dataSize, lastMicroOp, microOpIndex})); ldr.setExecutionInfo(architecture.getExecutionInfo(ldr)); 
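Viewed apart from the Capstone plumbing, the splitting scheme added above is straightforward: a post-index `ldp`, for instance, becomes two address-generating load uops followed by a base-register update uop, in that order. The following is a minimal, self-contained model of that decomposition; the `MicroOp` struct and `splitLdpPost` helper are illustrative stand-ins rather than SimEng APIs.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative stand-in for the uops emitted for a post-index LDP:
// two LDR_ADDR uops plus one OFFSET_IMM uop for the base update.
struct MicroOp {
  const char* kind;  // "micro_ldr" or "micro_offset_imm"
  int destReg;       // destination register index (hypothetical numbering)
  int baseReg;       // address base register
  int64_t offset;    // byte offset from the base, or immediate added to it
  bool lastMicroOp;  // marks the final uop of the macro-op
};

// Decompose "ldp d<a>, d<b>, [x<base>], #imm" in the same order the decoder
// builds its cacheVector: load at base+0, load at base+dataSize, base update.
std::vector<MicroOp> splitLdpPost(int regA, int regB, int base, int64_t imm,
                                  uint8_t dataSize) {
  return {
      {"micro_ldr", regA, base, 0, false},
      {"micro_ldr", regB, base, dataSize, false},
      {"micro_offset_imm", base, base, imm, true},
  };
}

int main() {
  // ldp d0, d1, [x2], #16 with 8-byte registers
  for (const auto& uop : splitLdpPost(0, 1, 2, 16, 8))
    std::cout << uop.kind << " offset=" << uop.offset << '\n';
}
```

The `LD1Twov*`/`LD1Fourv*` cases added earlier follow the same pattern with two or four loads, and a 16-, 32-, or 64-byte immediate writeback depending on the register arrangement.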
@@ -523,34 +815,50 @@ Instruction MicroDecoder::createLdrUop(const Architecture& architecture, } Instruction MicroDecoder::createSDUop(const Architecture& architecture, - arm64_reg src, csh capstoneHandle, + aarch64_reg src, csh capstoneHandle, bool lastMicroOp, int microOpIndex) { - cs_detail sd_detail = createDefaultDetail({{ARM64_OP_REG}}); - sd_detail.arm64.operands[0].reg = src; - cs_insn sd_cs = { - arm64_insn::ARM64_INS_STR, 0x0, 4, "", "micro_sd", "", &sd_detail, - MicroOpcode::STR_DATA}; + cs_detail sd_detail = createDefaultDetail({{AARCH64_OP_REG}}); + sd_detail.aarch64.operands[0].reg = src; + cs_insn sd_cs = {aarch64_insn::AARCH64_INS_STR, + aarch64_insn::AARCH64_INS_INVALID, + 0x0, + 4, + "", + "micro_sd", + "", + false, + false, + &sd_detail, + MicroOpcode::STR_DATA}; InstructionMetadata sd_metadata(sd_cs); - microMetadataCache.emplace_front(sd_metadata); + microMetadataCache_.emplace_front(sd_metadata); Instruction sd( - architecture, microMetadataCache.front(), + architecture, microMetadataCache_.front(), MicroOpInfo({true, MicroOpcode::STR_DATA, 0, lastMicroOp, microOpIndex})); sd.setExecutionInfo(architecture.getExecutionInfo(sd)); return sd; } Instruction MicroDecoder::createStrUop(const Architecture& architecture, - arm64_op_mem mem, csh capstoneHandle, + aarch64_op_mem mem, csh capstoneHandle, bool lastMicroOp, int microOpIndex, uint8_t dataSize) { - cs_detail str_detail = createDefaultDetail({{ARM64_OP_MEM}}); - str_detail.arm64.operands[0].mem = mem; - cs_insn str_cs = { - arm64_insn::ARM64_INS_STR, 0x0, 4, "", "micro_str", "", &str_detail, - MicroOpcode::STR_DATA}; + cs_detail str_detail = createDefaultDetail({{AARCH64_OP_MEM}}); + str_detail.aarch64.operands[0].mem = mem; + cs_insn str_cs = {aarch64_insn::AARCH64_INS_STR, + aarch64_insn::AARCH64_INS_INVALID, + 0x0, + 4, + "", + "micro_str", + "", + false, + false, + &str_detail, + MicroOpcode::STR_ADDR}; InstructionMetadata str_metadata(str_cs); - microMetadataCache.emplace_front(str_metadata); - Instruction str(architecture, microMetadataCache.front(), + microMetadataCache_.emplace_front(str_metadata); + Instruction str(architecture, microMetadataCache_.front(), MicroOpInfo({true, MicroOpcode::STR_ADDR, dataSize, lastMicroOp, microOpIndex})); str.setExecutionInfo(architecture.getExecutionInfo(str)); diff --git a/src/lib/arch/riscv/Architecture.cc b/src/lib/arch/riscv/Architecture.cc index 670663fc49..14edc45b93 100644 --- a/src/lib/arch/riscv/Architecture.cc +++ b/src/lib/arch/riscv/Architecture.cc @@ -11,12 +11,31 @@ namespace simeng { namespace arch { namespace riscv { -std::unordered_map Architecture::decodeCache; -std::forward_list Architecture::metadataCache; +Architecture::Architecture(kernel::Linux& kernel, ryml::ConstNodeRef config) + : arch::Architecture(kernel) { + // Set initial rounding mode for F/D extensions + // TODO set fcsr accordingly when Zicsr extension supported + fesetround(FE_TONEAREST); + + cs_err n; + + // Check whether compressed instructions in use. 
Initialise variables and + // Capstone accordingly + if (config["Core"]["Compressed"].as()) { + addressAlignmentMask_ = constantsPool::addressAlignMaskCompressed; + minInsnLength_ = constantsPool::minInstWidthBytesCompressed; + + n = cs_open(CS_ARCH_RISCV, + static_cast(CS_MODE_RISCV64 | CS_MODE_RISCVC), + &capstoneHandle_); + } else { + addressAlignmentMask_ = constantsPool::addressAlignMask; + minInsnLength_ = constantsPool::minInstWidthBytes; + + n = cs_open(CS_ARCH_RISCV, static_cast(CS_MODE_RISCV64), + &capstoneHandle_); + } -Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) - : linux_(kernel) { - cs_err n = cs_open(CS_ARCH_RISCV, CS_MODE_RISCV64, &capstoneHandle); if (n != CS_ERR_OK) { std::cerr << "[SimEng:Architecture] Could not create capstone handle due " "to error " @@ -24,21 +43,32 @@ Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) exit(1); } - cs_option(capstoneHandle, CS_OPT_DETAIL, CS_OPT_ON); + cs_option(capstoneHandle_, CS_OPT_DETAIL, CS_OPT_ON); + + // Generate zero-indexed system register map + for (size_t i = 0; i < config::SimInfo::getSysRegVec().size(); i++) { + systemRegisterMap_[config::SimInfo::getSysRegVec()[i]] = + systemRegisterMap_.size(); + } + + cycleSystemReg_ = { + RegisterType::SYSTEM, + static_cast(getSystemRegisterTag(RISCV_SYSREG_CYCLE))}; - // Instantiate an executionInfo entry for each group in the InstructionGroup + // Instantiate an ExecutionInfo entry for each group in the InstructionGroup // namespace. for (int i = 0; i < NUM_GROUPS; i++) { groupExecutionInfo_[i] = {1, 1, {}}; } // Extract execution latency/throughput for each group std::vector inheritanceDistance(NUM_GROUPS, UINT8_MAX); - for (size_t i = 0; i < config["Latencies"].size(); i++) { - YAML::Node port_node = config["Latencies"][i]; + for (size_t i = 0; i < config["Latencies"].num_children(); i++) { + ryml::ConstNodeRef port_node = config["Latencies"][i]; uint16_t latency = port_node["Execution-Latency"].as(); uint16_t throughput = port_node["Execution-Throughput"].as(); - for (size_t j = 0; j < port_node["Instruction-Group"].size(); j++) { - uint16_t group = port_node["Instruction-Group"][j].as(); + for (size_t j = 0; j < port_node["Instruction-Group-Nums"].num_children(); + j++) { + uint16_t group = port_node["Instruction-Group-Nums"][j].as(); groupExecutionInfo_[group].latency = latency; groupExecutionInfo_[group].stallCycles = throughput; // Set zero inheritance distance for latency assignment as it's explicitly @@ -51,10 +81,10 @@ Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) uint8_t distance = 1; while (groups.size()) { // Determine if there's any inheritance - if (groupInheritance.find(groups.front()) != groupInheritance.end()) { + if (groupInheritance_.find(groups.front()) != groupInheritance_.end()) { std::vector inheritedGroups = - groupInheritance.at(groups.front()); - for (int k = 0; k < inheritedGroups.size(); k++) { + groupInheritance_.at(groups.front()); + for (size_t k = 0; k < inheritedGroups.size(); k++) { // Determine if this group has inherited latency values from a // smaller distance if (inheritanceDistance[inheritedGroups[k]] > distance) { @@ -70,8 +100,9 @@ Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) } } // Store any opcode-based latency override - for (size_t j = 0; j < port_node["Instruction-Opcode"].size(); j++) { - uint16_t opcode = port_node["Instruction-Opcode"][j].as(); + for (size_t j = 0; j < port_node["Instruction-Opcodes"].num_children(); + j++) { + uint16_t opcode 
= port_node["Instruction-Opcodes"][j].as(); opcodeExecutionInfo_[opcode].latency = latency; opcodeExecutionInfo_[opcode].stallCycles = throughput; } @@ -79,25 +110,28 @@ Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) // ports entries in the groupExecutionInfo_ entries only apply for models // using the outoforder core archetype - if (config["Core"]["Simulation-Mode"].as() == "outoforder") { + if (config::SimInfo::getSimMode() == config::SimulationMode::Outoforder) { // Create mapping between instructions groups and the ports that support // them - for (size_t i = 0; i < config["Ports"].size(); i++) { + for (size_t i = 0; i < config["Ports"].num_children(); i++) { // Store which ports support which groups - YAML::Node group_node = config["Ports"][i]["Instruction-Group-Support"]; - for (size_t j = 0; j < group_node.size(); j++) { + ryml::ConstNodeRef group_node = + config["Ports"][i]["Instruction-Group-Support-Nums"]; + for (size_t j = 0; j < group_node.num_children(); j++) { uint16_t group = group_node[j].as(); - uint8_t newPort = static_cast(i); + uint16_t newPort = static_cast(i); + groupExecutionInfo_[group].ports.push_back(newPort); // Add inherited support for those appropriate groups std::queue groups; groups.push(group); while (groups.size()) { // Determine if there's any inheritance - if (groupInheritance.find(groups.front()) != groupInheritance.end()) { + if (groupInheritance_.find(groups.front()) != + groupInheritance_.end()) { std::vector inheritedGroups = - groupInheritance.at(groups.front()); - for (int k = 0; k < inheritedGroups.size(); k++) { + groupInheritance_.at(groups.front()); + for (size_t k = 0; k < inheritedGroups.size(); k++) { groupExecutionInfo_[inheritedGroups[k]].ports.push_back(newPort); groups.push(inheritedGroups[k]); } @@ -106,79 +140,116 @@ Architecture::Architecture(kernel::Linux& kernel, YAML::Node config) } } // Store any opcode-based port support override - YAML::Node opcode_node = config["Ports"][i]["Instruction-Opcode-Support"]; - for (size_t j = 0; j < opcode_node.size(); j++) { + ryml::ConstNodeRef opcode_node = + config["Ports"][i]["Instruction-Opcode-Support"]; + for (size_t j = 0; j < opcode_node.num_children(); j++) { // If latency information hasn't been defined, set to zero as to inform // later access to use group defined latencies instead uint16_t opcode = opcode_node[j].as(); - opcodeExecutionInfo_.try_emplace( - opcode, simeng::arch::riscv::executionInfo{0, 0, {}}); + opcodeExecutionInfo_.try_emplace(opcode, ExecutionInfo{0, 0, {}}); opcodeExecutionInfo_[opcode].ports.push_back(static_cast(i)); } } } } -Architecture::~Architecture() { - cs_close(&capstoneHandle); - decodeCache.clear(); - metadataCache.clear(); - groupExecutionInfo_.clear(); -} -uint8_t Architecture::predecode(const void* ptr, uint8_t bytesAvailable, +Architecture::~Architecture() { cs_close(&capstoneHandle_); } + +uint8_t Architecture::predecode(const uint8_t* ptr, uint16_t bytesAvailable, uint64_t instructionAddress, MacroOp& output, std::string& disasm) const { // Check that instruction address is 4-byte aligned as required by RISC-V - if (instructionAddress & 0x3) { + // 2-byte when Compressed extension is supported + if (instructionAddress & addressAlignmentMask_) { // Consume 1-byte and raise a misaligned PC exception auto metadata = InstructionMetadata((uint8_t*)ptr, 1); - metadataCache.emplace_front(metadata); + metadataCache_.emplace_front(metadata); output.resize(1); auto& uop = output[0]; - uop = std::make_shared(*this, metadataCache.front(), 
+ uop = std::make_shared(*this, metadataCache_.front(), InstructionException::MisalignedPC); uop->setInstructionAddress(instructionAddress); // Return non-zero value to avoid fatal error return 1; } - assert(bytesAvailable >= 4 && - "Fewer than 4 bytes supplied to RISC-V decoder"); - - // Dereference the instruction pointer to obtain the instruction word - uint32_t insn; - memcpy(&insn, ptr, 4); + assert(bytesAvailable >= minInsnLength_ && + "Fewer than bytes limit supplied to RISC-V decoder"); + + // Get the first byte + uint8_t firstByte = ((uint8_t*)ptr)[0]; + + uint32_t insnEncoding = 0; + size_t insnSize = 4; + + // Predecode bytes to determine whether we have a compressed instruction. + // This will allow continuation if a compressed instruction is in the last 2 + // bytes of a fetch block, but will request more data if only half of a + // non-compressed instruction is present + + // Check the 2 least significant bits as these determine instruction length + if ((firstByte & 0b11) != 0b11) { + // 2 byte - compressed + // Only use relevant bytes + // Dereference the instruction pointer to obtain the instruction word + memcpy(&insnEncoding, ptr, 2); + insnSize = 2; + } else { + // 4 byte + if (bytesAvailable < 4) { + // Not enough bytes available, bail + return 0; + } + // Dereference the instruction pointer to obtain the instruction word + memcpy(&insnEncoding, ptr, 4); + } // Try to find the decoding in the decode cache - auto iter = decodeCache.find(insn); - if (iter == decodeCache.end()) { + auto iter = decodeCache_.find(insnEncoding); + if (iter == decodeCache_.end()) { // No decoding present. Generate a fresh decoding, and add to cache - cs_insn rawInsn; + // Calloc memory to ensure rawInsn is initialised with zeros. Errors can + // occur otherwise as Capstone doesn't update variables for invalid + // instructions + cs_insn* rawInsnPointer = (cs_insn*)calloc(1, sizeof(cs_insn)); + cs_insn rawInsn = *rawInsnPointer; + assert(rawInsn.size == 0 && "rawInsn not initialised correctly"); + cs_detail rawDetail; rawInsn.detail = &rawDetail; + // Size requires initialisation in case of capstone failure which won't + // update this value + rawInsn.size = insnSize; - size_t size = 4; uint64_t address = 0; const uint8_t* encoding = reinterpret_cast(ptr); - bool success = - cs_disasm_iter(capstoneHandle, &encoding, &size, &address, &rawInsn); + bool success = cs_disasm_iter(capstoneHandle_, &encoding, &insnSize, + &address, &rawInsn); - auto metadata = - success ? InstructionMetadata(rawInsn) : InstructionMetadata(encoding); + auto metadata = success ? InstructionMetadata(rawInsn) + : InstructionMetadata(encoding, rawInsn.size); + + free(rawInsnPointer); // Cache the metadata - metadataCache.push_front(metadata); + metadataCache_.push_front(metadata); // Create an instruction using the metadata - Instruction newInsn(*this, metadataCache.front()); + Instruction newInsn(*this, metadataCache_.front()); // Set execution information for this instruction newInsn.setExecutionInfo(getExecutionInfo(newInsn)); + // Cache the instruction - iter = decodeCache.insert({insn, newInsn}).first; + iter = decodeCache_.insert({insnEncoding, newInsn}).first; } + assert(((insnEncoding & 0b11) != 0b11 + ? 
iter->second.getMetadata().getInsnLength() == 2 + : iter->second.getMetadata().getInsnLength() == 4) && + "Predicted number of bytes don't match disassembled number of bytes"); + output.resize(1); auto& uop = output[0]; @@ -187,49 +258,23 @@ uint8_t Architecture::predecode(const void* ptr, uint8_t bytesAvailable, uop->setInstructionAddress(instructionAddress); - return 4; -} - -executionInfo Architecture::getExecutionInfo(Instruction& insn) const { - // Assume no opcode-based override - executionInfo exeInfo = groupExecutionInfo_.at(insn.getGroup()); - if (opcodeExecutionInfo_.find(insn.getMetadata().opcode) != - opcodeExecutionInfo_.end()) { - // Replace with overrided values - executionInfo overrideInfo = - opcodeExecutionInfo_.at(insn.getMetadata().opcode); - if (overrideInfo.latency != 0) exeInfo.latency = overrideInfo.latency; - if (overrideInfo.stallCycles != 0) - exeInfo.stallCycles = overrideInfo.stallCycles; - if (overrideInfo.ports.size()) exeInfo.ports = overrideInfo.ports; - } - return exeInfo; -} - -std::shared_ptr Architecture::handleException( - const std::shared_ptr& instruction, const Core& core, - MemoryInterface& memory) const { - return std::make_shared(instruction, core, memory, linux_); -} - -std::vector Architecture::getRegisterFileStructures() - const { - uint16_t numSysRegs = static_cast(systemRegisterMap_.size()); - return { - {8, 32}, // General purpose - {8, 32}, // Floating Point - {8, numSysRegs}, // System - }; + return iter->second.getMetadata().getInsnLength(); } int32_t Architecture::getSystemRegisterTag(uint16_t reg) const { // Check below is done for speculative instructions that may be passed into // the function but will not be executed. If such invalid speculative // instructions get through they can cause an out-of-range error. 
- if (!systemRegisterMap_.count(reg)) return 0; + if (!systemRegisterMap_.count(reg)) return -1; return systemRegisterMap_.at(reg); } +std::shared_ptr Architecture::handleException( + const std::shared_ptr& instruction, const Core& core, + memory::MemoryInterface& memory) const { + return std::make_shared(instruction, core, memory, linux_); +} + ProcessStateChange Architecture::getInitialState() const { ProcessStateChange changes; // Set ProcessStateChange type @@ -245,26 +290,27 @@ ProcessStateChange Architecture::getInitialState() const { uint8_t Architecture::getMaxInstructionSize() const { return 4; } -std::vector -Architecture::getConfigPhysicalRegisterStructure(YAML::Node config) const { - return {{8, config["Register-Set"]["GeneralPurpose-Count"].as()}, - {8, config["Register-Set"]["FloatingPoint-Count"].as()}, - {8, getNumSystemRegisters()}}; -} - -std::vector Architecture::getConfigPhysicalRegisterQuantities( - YAML::Node config) const { - return {config["Register-Set"]["GeneralPurpose-Count"].as(), - config["Register-Set"]["FloatingPoint-Count"].as(), - getNumSystemRegisters()}; -} -uint16_t Architecture::getNumSystemRegisters() const { - return static_cast(systemRegisterMap_.size()); -} +uint8_t Architecture::getMinInstructionSize() const { return minInsnLength_; } -// Left blank as no implementation necessary void Architecture::updateSystemTimerRegisters(RegisterFileSet* regFile, const uint64_t iterations) const { + regFile->set(cycleSystemReg_, iterations); +} + +ExecutionInfo Architecture::getExecutionInfo(const Instruction& insn) const { + // Assume no opcode-based override + ExecutionInfo exeInfo = groupExecutionInfo_.at(insn.getGroup()); + if (opcodeExecutionInfo_.find(insn.getMetadata().opcode) != + opcodeExecutionInfo_.end()) { + // Replace with overrided values + ExecutionInfo overrideInfo = + opcodeExecutionInfo_.at(insn.getMetadata().opcode); + if (overrideInfo.latency != 0) exeInfo.latency = overrideInfo.latency; + if (overrideInfo.stallCycles != 0) + exeInfo.stallCycles = overrideInfo.stallCycles; + if (overrideInfo.ports.size()) exeInfo.ports = overrideInfo.ports; + } + return exeInfo; } } // namespace riscv diff --git a/src/lib/arch/riscv/ExceptionHandler.cc b/src/lib/arch/riscv/ExceptionHandler.cc index 8f76c4cc3a..15a5518c64 100644 --- a/src/lib/arch/riscv/ExceptionHandler.cc +++ b/src/lib/arch/riscv/ExceptionHandler.cc @@ -5,6 +5,7 @@ #include "InstructionMetadata.hh" #include "simeng/ArchitecturalRegisterFileSet.hh" +#include "simeng/arch/riscv/Architecture.hh" namespace simeng { namespace arch { @@ -12,9 +13,9 @@ namespace riscv { ExceptionHandler::ExceptionHandler( const std::shared_ptr& instruction, const Core& core, - MemoryInterface& memory, kernel::Linux& linux_) + memory::MemoryInterface& memory, kernel::Linux& linux_) : instruction_(*static_cast(instruction.get())), - core(core), + core_(core), memory_(memory), linux_(linux_) { resumeHandling_ = [this]() { return init(); }; @@ -24,7 +25,7 @@ bool ExceptionHandler::tick() { return resumeHandling_(); } bool ExceptionHandler::init() { InstructionException exception = instruction_.getException(); - const auto& registerFileSet = core.getArchitecturalRegisterFileSet(); + const auto& registerFileSet = core_.getArchitecturalRegisterFileSet(); if (exception == InstructionException::SupervisorCall) { // Retrieve syscall ID held in register a7 @@ -106,7 +107,7 @@ bool ExceptionHandler::init() { uint64_t count = registerFileSet.get(R2).get(); return readBufferThen(bufPtr, count, [=]() { - int64_t totalRead = 
linux_.getdents64(fd, dataBuffer.data(), count); + int64_t totalRead = linux_.getdents64(fd, dataBuffer_.data(), count); ProcessStateChange stateChange = { ChangeType::REPLACEMENT, {R0}, {totalRead}}; // Check for failure @@ -114,23 +115,18 @@ bool ExceptionHandler::init() { return concludeSyscall(stateChange); } - int64_t bytesRemaining = totalRead; // Get pointer and size of the buffer uint64_t iDst = bufPtr; - uint64_t iLength = bytesRemaining; - if (iLength > bytesRemaining) { - iLength = bytesRemaining; - } - bytesRemaining -= iLength; // Write data for this buffer in 128-byte chunks - auto iSrc = reinterpret_cast(dataBuffer.data()); - while (iLength > 0) { - uint8_t len = iLength > 128 ? 128 : static_cast(iLength); + auto iSrc = reinterpret_cast(dataBuffer_.data()); + while (totalRead > 0) { + uint8_t len = + totalRead > 128 ? 128 : static_cast(totalRead); stateChange.memoryAddresses.push_back({iDst, len}); stateChange.memoryAddressValues.push_back({iSrc, len}); iDst += len; iSrc += len; - iLength -= len; + totalRead -= len; } return concludeSyscall(stateChange); }); @@ -148,7 +144,7 @@ bool ExceptionHandler::init() { uint64_t bufPtr = registerFileSet.get(R1).get(); uint64_t count = registerFileSet.get(R2).get(); return readBufferThen(bufPtr, count, [=]() { - int64_t totalRead = linux_.read(fd, dataBuffer.data(), count); + int64_t totalRead = linux_.read(fd, dataBuffer_.data(), count); ProcessStateChange stateChange = { ChangeType::REPLACEMENT, {R0}, {totalRead}}; // Check for failure @@ -156,17 +152,13 @@ bool ExceptionHandler::init() { return concludeSyscall(stateChange); } - int64_t bytesRemaining = totalRead; // Get pointer and size of the buffer uint64_t iDst = bufPtr; - uint64_t iLength = bytesRemaining; - if (iLength > bytesRemaining) { - iLength = bytesRemaining; - } - bytesRemaining -= iLength; + // totalRead not negative due to above check so cast is safe + uint64_t iLength = static_cast(totalRead); // Write data for this buffer in 128-byte chunks - auto iSrc = reinterpret_cast(dataBuffer.data()); + auto iSrc = reinterpret_cast(dataBuffer_.data()); while (iLength > 0) { uint8_t len = iLength > 128 ? 128 : static_cast(iLength); stateChange.memoryAddresses.push_back({iDst, len}); @@ -183,7 +175,7 @@ bool ExceptionHandler::init() { uint64_t bufPtr = registerFileSet.get(R1).get(); uint64_t count = registerFileSet.get(R2).get(); return readBufferThen(bufPtr, count, [=]() { - int64_t retval = linux_.write(fd, dataBuffer.data(), count); + int64_t retval = linux_.write(fd, dataBuffer_.data(), count); ProcessStateChange stateChange = { ChangeType::REPLACEMENT, {R0}, {retval}}; return concludeSyscall(stateChange); @@ -207,7 +199,7 @@ bool ExceptionHandler::init() { // generates the memory write requests. 
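Before the writev handling continues below, note that the 128-byte chunking used by the getdents64 and read handlers above reduces to a simple loop over the returned byte count. The sketch below models it with plain vectors; the `MemWrite` struct is an illustrative stand-in for the paired `memoryAddresses`/`memoryAddressValues` entries.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative stand-in for one queued memory write (address, length).
struct MemWrite {
  uint64_t address;
  uint8_t length;
};

// Split `totalRead` bytes starting at guest address `dst` into chunks of at
// most 128 bytes, as the syscall handlers above do when copying a kernel
// buffer back into simulated memory.
std::vector<MemWrite> chunkWrites(uint64_t dst, int64_t totalRead) {
  std::vector<MemWrite> writes;
  while (totalRead > 0) {
    uint8_t len = totalRead > 128 ? 128 : static_cast<uint8_t>(totalRead);
    writes.push_back({dst, len});
    dst += len;
    totalRead -= len;
  }
  return writes;
}

int main() {
  // e.g. a 300-byte read lands as 128 + 128 + 44 byte writes
  for (const auto& w : chunkWrites(0x1000, 300))
    std::cout << std::hex << w.address << std::dec << " +" << int(w.length)
              << " bytes\n";
}
```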
auto invokeKernel = [=]() { // The iov structure has been read into `dataBuffer` - uint64_t* iovdata = reinterpret_cast(dataBuffer.data()); + uint64_t* iovdata = reinterpret_cast(dataBuffer_.data()); // Allocate buffers to hold the data read by the kernel std::vector> buffers(iovcnt); @@ -233,7 +225,8 @@ bool ExceptionHandler::init() { } // Build list of memory write operations - int64_t bytesRemaining = totalRead; + // totalRead not negative due to above check so cast is safe + uint64_t bytesRemaining = static_cast(totalRead); for (int64_t i = 0; i < iovcnt; i++) { // Get pointer and size of the buffer uint64_t iDst = iovdata[i * 2 + 0]; @@ -278,8 +271,8 @@ bool ExceptionHandler::init() { // Create the final handler in the chain, which invokes the kernel std::function last = [=]() { // Rebuild the iovec structures using pointers to `dataBuffer` data - uint64_t* iovdata = reinterpret_cast(dataBuffer.data()); - uint8_t* bufferPtr = dataBuffer.data() + iovcnt * 16; + uint64_t* iovdata = reinterpret_cast(dataBuffer_.data()); + uint8_t* bufferPtr = dataBuffer_.data() + iovcnt * 16; for (int64_t i = 0; i < iovcnt; i++) { iovdata[i * 2 + 0] = reinterpret_cast(bufferPtr); @@ -289,7 +282,7 @@ bool ExceptionHandler::init() { } // Invoke the kernel - int64_t retval = linux_.writev(fd, dataBuffer.data(), iovcnt); + int64_t retval = linux_.writev(fd, dataBuffer_.data(), iovcnt); ProcessStateChange stateChange = { ChangeType::REPLACEMENT, {R0}, {retval}}; return concludeSyscall(stateChange); @@ -298,7 +291,7 @@ bool ExceptionHandler::init() { // Build the chain of buffer loads backwards through the iov buffers for (int64_t i = iovcnt - 1; i >= 0; i--) { last = [=]() { - uint64_t* iovdata = reinterpret_cast(dataBuffer.data()); + uint64_t* iovdata = reinterpret_cast(dataBuffer_.data()); uint64_t ptr = iovdata[i * 2 + 0]; uint64_t len = iovdata[i * 2 + 1]; return readBufferThen(ptr, len, last); @@ -331,20 +324,21 @@ bool ExceptionHandler::init() { int64_t flag = registerFileSet.get(R3).get(); char* filename = new char[kernel::Linux::LINUX_PATH_MAX]; - return readStringThen( - filename, filenamePtr, kernel::Linux::LINUX_PATH_MAX, - [=](auto length) { - // Invoke the kernel - kernel::stat statOut; - uint64_t retval = linux_.newfstatat(dfd, filename, statOut, flag); - ProcessStateChange stateChange = { - ChangeType::REPLACEMENT, {R0}, {retval}}; - delete[] filename; - stateChange.memoryAddresses.push_back( - {statbufPtr, sizeof(statOut)}); - stateChange.memoryAddressValues.push_back(statOut); - return concludeSyscall(stateChange); - }); + return readStringThen(filename, filenamePtr, + kernel::Linux::LINUX_PATH_MAX, [=](auto length) { + // Invoke the kernel + kernel::stat statOut; + uint64_t retval = linux_.newfstatat( + dfd, filename, statOut, flag); + ProcessStateChange stateChange = { + ChangeType::REPLACEMENT, {R0}, {retval}}; + delete[] filename; + stateChange.memoryAddresses.push_back( + {statbufPtr, sizeof(statOut)}); + stateChange.memoryAddressValues.push_back( + {statOut, sizeof(statOut)}); + return concludeSyscall(stateChange); + }); break; } @@ -361,7 +355,7 @@ bool ExceptionHandler::init() { } case 93: { // exit auto exitCode = registerFileSet.get(R0).get(); - std::cout << "[SimEng:ExceptionHandler] Received exit syscall: " + std::cout << "\n[SimEng:ExceptionHandler] Received exit syscall: " "terminating with exit code " << exitCode << std::endl; return fatal(); @@ -401,7 +395,7 @@ bool ExceptionHandler::init() { } case 113: { // clock_gettime uint64_t clkId = registerFileSet.get(R0).get(); - 
uint64_t systemTimer = core.getSystemTimer(); + uint64_t systemTimer = core_.getSystemTimer(); uint64_t seconds; uint64_t nanoseconds; uint64_t retval = @@ -434,9 +428,9 @@ bool ExceptionHandler::init() { // Currently, only a single CPU bitmask is supported if (bitmask != 1) { printException(instruction_); - std::cout - << "Unexpected CPU affinity mask returned in exception handler" - << std::endl; + std::cout << "\n[SimEng:ExceptionHandler] Unexpected CPU affinity " + "mask returned in exception handler" + << std::endl; return fatal(); } uint64_t retval = (pid == 0) ? 1 : 0; @@ -451,17 +445,18 @@ bool ExceptionHandler::init() { case 131: { // tgkill // TODO currently returns success without action stateChange = {ChangeType::REPLACEMENT, {R0}, {0}}; + break; } case 134: { // rt_sigaction // TODO: Implement syscall logic. Ignored for now as it's assumed the - // current use of this syscall is to setup error handlers. Simualted + // current use of this syscall is to setup error handlers. Simulated // code is expected to work so no need for these handlers. stateChange = {ChangeType::REPLACEMENT, {R0}, {0ull}}; break; } case 135: { // rt_sigprocmask // TODO: Implement syscall logic. Ignored for now as it's assumed the - // current use of this syscall is to setup error handlers. Simualted + // current use of this syscall is to setup error handlers. Simulated // code is expected to work so no need for these handlers. stateChange = {ChangeType::REPLACEMENT, {R0}, {0ull}}; break; @@ -505,7 +500,7 @@ bool ExceptionHandler::init() { case 169: { // gettimeofday uint64_t tvPtr = registerFileSet.get(R0).get(); uint64_t tzPtr = registerFileSet.get(R1).get(); - uint64_t systemTimer = core.getSystemTimer(); + uint64_t systemTimer = core_.getSystemTimer(); kernel::timeval tv; kernel::timeval tz; @@ -616,15 +611,16 @@ bool ExceptionHandler::init() { uint64_t bufPtr = registerFileSet.get(R0).get(); size_t buflen = registerFileSet.get(R1).get(); - char buf[buflen]; + std::vector buf; for (size_t i = 0; i < buflen; i++) { - buf[i] = (uint8_t)rand(); + buf.push_back((uint8_t)rand()); } stateChange = {ChangeType::REPLACEMENT, {R0}, {(uint64_t)buflen}}; stateChange.memoryAddresses.push_back({bufPtr, (uint8_t)buflen}); - stateChange.memoryAddressValues.push_back(RegisterValue(buf, buflen)); + stateChange.memoryAddressValues.push_back( + RegisterValue(buf.data(), buflen)); break; } @@ -633,7 +629,6 @@ bool ExceptionHandler::init() { stateChange = {ChangeType::REPLACEMENT, {R0}, {0ull}}; break; } - default: printException(instruction_); std::cout << "\n[SimEng:ExceptionHandler] Unrecognised syscall: " @@ -641,6 +636,74 @@ bool ExceptionHandler::init() { return fatal(); } + return concludeSyscall(stateChange); + } else if (exception == InstructionException::PipelineFlush) { + // Retrieve metadata, operand values and destination registers from + // instruction + auto metadata = instruction_.getMetadata(); + auto operands = instruction_.getSourceOperands(); + auto destinationRegs = instruction_.getDestinationRegisters(); + + uint8_t rm = 0b110; // Set to invalid rounding mode + uint64_t result = 0; + + ProcessStateChange stateChange; + switch (instruction_.getMetadata().opcode) { + case Opcode::RISCV_CSRRW: // CSRRW rd,csr,rs1 + if (metadata.operands[1].reg == RISCV_SYSREG_FRM) { + // Update CPP rounding mode but not floating point CSR as currently no + // implementation + + rm = operands[0].get() & 0b111; // Take the lower 3 bits + + switch (operands[0].get()) { + case 0: // RNE, Round to nearest, ties to even + 
fesetround(FE_TONEAREST); + break; + case 1: // RTZ Round towards zero + fesetround(FE_TOWARDZERO); + break; + case 2: // RDN Round down (-infinity) + fesetround(FE_DOWNWARD); + break; + case 3: // RUP Round up (+infinity) + fesetround(FE_UPWARD); + break; + case 4: // RMM Round to nearest, ties to max magnitude + // FE_TONEAREST ties towards even but no other options available + // in fenv + fesetround(FE_TONEAREST); + break; + default: + // Invalid Case + // TODO "If frm is set to an invalid + // value (101–111), any subsequent attempt to execute a + // floating-point operation with a dynamic rounding mode will + // raise an illegal instruction exception." - Volume I: RISC-V + // Unprivileged ISA V20191213 pg65 + // + // Should be allowed to be set incorrectly and only caught when + // used. Set CSR to requested value, checking logic should be done + // by Instruction::setStaticRoundingModeThen. Requires full + // implementation of Zicsr + break; + } + // Shift rounding mode to correct position, frm[5:7] + result = rm << 5; + } + + // Only update if registers should be written to + if (destinationRegs.size() > 0) { + // Dummy logic to allow progression. Set Rd to 0 + stateChange = { + ChangeType::REPLACEMENT, {destinationRegs[0]}, {result}}; + } + break; + default: + printException(instruction_); + return fatal(); + } + return concludeSyscall(stateChange); } @@ -704,13 +767,13 @@ bool ExceptionHandler::readStringThen(char* buffer, uint64_t address, void ExceptionHandler::readLinkAt(span path) { if (path.size() == kernel::Linux::LINUX_PATH_MAX) { // TODO: Handle LINUX_PATH_MAX case - std::cout << "[SimEng:ExceptionHandler] Path exceeds LINUX_PATH_MAX" + std::cout << "\n[SimEng:ExceptionHandler] Path exceeds LINUX_PATH_MAX" << std::endl; fatal(); return; } - const auto& registerFileSet = core.getArchitecturalRegisterFileSet(); + const auto& registerFileSet = core_.getArchitecturalRegisterFileSet(); const auto dirfd = registerFileSet.get(R0).get(); const auto bufAddress = registerFileSet.get(R2).get(); const auto bufSize = registerFileSet.get(R3).get(); @@ -720,7 +783,7 @@ void ExceptionHandler::readLinkAt(span path) { if (result < 0) { // TODO: Handle error case - std::cout << "[SimEng:ExceptionHandler] Error generated by readlinkat" + std::cout << "\n[SimEng:ExceptionHandler] Error generated by readlinkat" << std::endl; fatal(); return; @@ -763,7 +826,7 @@ bool ExceptionHandler::readBufferThen(uint64_t ptr, uint64_t length, auto completedReads = memory_.getCompletedReads(); auto response = std::find_if(completedReads.begin(), completedReads.end(), - [&](const MemoryReadResult& response) { + [&](const memory::MemoryReadResult& response) { return response.requestId == instruction_.getSequenceId(); }); if (response == completedReads.end()) { @@ -774,7 +837,7 @@ bool ExceptionHandler::readBufferThen(uint64_t ptr, uint64_t length, assert(response->data && "unhandled failed read in exception handler"); uint8_t bytesRead = response->target.size; const uint8_t* data = response->data.getAsVector(); - dataBuffer.insert(dataBuffer.end(), data, data + bytesRead); + dataBuffer_.insert(dataBuffer_.end(), data, data + bytesRead); memory_.clearCompletedReads(); // If there is more data, rerun this function for next chunk @@ -800,11 +863,14 @@ void ExceptionHandler::printException(const Instruction& insn) const { std::cout << "[SimEng:ExceptionHandler] Encountered "; switch (exception) { case InstructionException::EncodingUnallocated: - std::cout << "illegal instruction"; + std::cout << "unallocated 
instruction encoding"; break; case InstructionException::ExecutionNotYetImplemented: std::cout << "execution not-yet-implemented"; break; + case InstructionException::AliasNotYetImplemented: + std::cout << "alias not-yet-implemented"; + break; case InstructionException::MisalignedPC: std::cout << "misaligned program counter"; break; @@ -823,6 +889,14 @@ void ExceptionHandler::printException(const Instruction& insn) const { case InstructionException::NoAvailablePort: std::cout << "unsupported execution port"; break; + case InstructionException::IllegalInstruction: + std::cout << "illegal instruction"; + break; + case InstructionException::PipelineFlush: + // TODO update/parameterize this output when more sources of this + // exception are implemented + std::cout << "unknown atomic operation"; + break; default: std::cout << "unknown (id: " << static_cast(exception) << ")"; @@ -835,9 +909,10 @@ void ExceptionHandler::printException(const Instruction& insn) const { << insn.getInstructionAddress() << ": "; auto& metadata = insn.getMetadata(); - for (uint8_t byte : metadata.encoding) { + for (uint8_t byteIndex = 0; byteIndex < metadata.getInsnLength(); + byteIndex++) { std::cout << std::setfill('0') << std::setw(2) - << static_cast(byte) << " "; + << static_cast(metadata.encoding[byteIndex]) << " "; } std::cout << std::dec << " "; if (exception == InstructionException::EncodingUnallocated) { @@ -848,6 +923,12 @@ void ExceptionHandler::printException(const Instruction& insn) const { std::cout << std::endl; std::cout << "[SimEng:ExceptionHandler] opcode ID: " << metadata.opcode; std::cout << std::endl; + + std::string extraInformation = metadata.getExceptionString(); + if (!extraInformation.empty()) { + std::cout << "[SimEng:ExceptionHandler] Extra information: " + << extraInformation << std::endl; + } } bool ExceptionHandler::fatal() { diff --git a/src/lib/arch/riscv/Instruction.cc b/src/lib/arch/riscv/Instruction.cc index 6f75ecb356..a0937aa9e7 100644 --- a/src/lib/arch/riscv/Instruction.cc +++ b/src/lib/arch/riscv/Instruction.cc @@ -10,74 +10,80 @@ namespace simeng { namespace arch { namespace riscv { -const Register Instruction::ZERO_REGISTER = {RegisterType::GENERAL, 0}; - Instruction::Instruction(const Architecture& architecture, const InstructionMetadata& metadata) - : architecture_(architecture), metadata(metadata) { - decode(); -} - -Instruction::Instruction(const Architecture& architecture, - const InstructionMetadata& metadata, uint8_t latency, - uint8_t stallCycles) - : architecture_(architecture), metadata(metadata) { - latency_ = latency; - stallCycles_ = stallCycles; - + : architecture_(architecture), + metadata_(metadata), + exception_(metadata.getMetadataException()) { + exceptionEncountered_ = metadata.getMetadataExceptionEncountered(); decode(); } Instruction::Instruction(const Architecture& architecture, const InstructionMetadata& metadata, InstructionException exception) - : architecture_(architecture), metadata(metadata) { + : architecture_(architecture), metadata_(metadata) { exception_ = exception; exceptionEncountered_ = true; } -InstructionException Instruction::getException() const { return exception_; } +const span Instruction::getSourceRegisters() const { + return {const_cast(sourceRegisters_.data()), sourceRegisterCount_}; +} -const span Instruction::getOperandRegisters() const { - return {const_cast(sourceRegisters.data()), sourceRegisterCount}; +const span Instruction::getSourceOperands() const { + return {const_cast(sourceValues_.data()), + sourceRegisterCount_}; } 
+ const span Instruction::getDestinationRegisters() const { - return {const_cast(destinationRegisters.data()), - destinationRegisterCount}; -} -bool Instruction::isOperandReady(int index) const { - return static_cast(operands[index]); + return {const_cast(destinationRegisters_.data()), + destinationRegisterCount_}; } -void Instruction::renameSource(uint8_t i, Register renamed) { - sourceRegisters[i] = renamed; +void Instruction::renameSource(uint16_t i, Register renamed) { + sourceRegisters_[i] = renamed; } -void Instruction::renameDestination(uint8_t i, Register renamed) { - destinationRegisters[i] = renamed; +void Instruction::renameDestination(uint16_t i, Register renamed) { + destinationRegisters_[i] = renamed; } -void Instruction::supplyOperand(uint8_t i, const RegisterValue& value) { +void Instruction::supplyOperand(uint16_t i, const RegisterValue& value) { assert(!canExecute() && "Attempted to provide an operand to a ready-to-execute instruction"); assert(value.size() > 0 && "Attempted to provide an uninitialised RegisterValue"); - operands[i] = value; - operandsPending--; + sourceValues_[i] = value; + sourceOperandsPending_--; +} + +bool Instruction::isOperandReady(int index) const { + return static_cast(sourceValues_[index]); +} + +const span Instruction::getResults() const { + return {const_cast(results_.data()), + destinationRegisterCount_}; +} + +span Instruction::getGeneratedAddresses() + const { + return {memoryAddresses_.data(), memoryAddresses_.size()}; } void Instruction::supplyData(uint64_t address, const RegisterValue& data) { - for (size_t i = 0; i < memoryAddresses.size(); i++) { - if (memoryAddresses[i].address == address && !memoryData[i]) { + for (size_t i = 0; i < memoryAddresses_.size(); i++) { + if (memoryAddresses_[i].address == address && !memoryData_[i]) { if (!data) { // Raise exception for failed read // TODO: Move this logic to caller and distinguish between different // memory faults (e.g. 
bus error, page fault, seg fault) exception_ = InstructionException::DataAbort; exceptionEncountered_ = true; - memoryData[i] = RegisterValue(0, memoryAddresses[i].size); + memoryData_[i] = RegisterValue(0, memoryAddresses_[i].size); } else { - memoryData[i] = data; + memoryData_[i] = data; } dataPending_--; return; @@ -86,67 +92,56 @@ void Instruction::supplyData(uint64_t address, const RegisterValue& data) { } span Instruction::getData() const { - return {memoryData.data(), memoryData.size()}; + return {memoryData_.data(), memoryData_.size()}; } -bool Instruction::canExecute() const { return (operandsPending == 0); } - -const span Instruction::getResults() const { - return {const_cast(results.data()), destinationRegisterCount}; -} +BranchType Instruction::getBranchType() const { return branchType_; } -bool Instruction::isStoreAddress() const { return isStore_; } -bool Instruction::isStoreData() const { return isStore_; } -bool Instruction::isLoad() const { return isLoad_; } -bool Instruction::isBranch() const { return isBranch_; } -bool Instruction::isAtomic() const { return isAtomic_; } +int64_t Instruction::getKnownOffset() const { return knownOffset_; } -void Instruction::setMemoryAddresses( - const std::vector& addresses) { - memoryData = std::vector(addresses.size()); - memoryAddresses = addresses; - dataPending_ = addresses.size(); +bool Instruction::isStoreAddress() const { + return isInstruction(InsnType::isStore); } -span Instruction::getGeneratedAddresses() const { - return {memoryAddresses.data(), memoryAddresses.size()}; +bool Instruction::isStoreData() const { + return isInstruction(InsnType::isStore); } -std::tuple Instruction::checkEarlyBranchMisprediction() const { - assert( - !executed_ && - "Early branch misprediction check shouldn't be called after execution"); +bool Instruction::isLoad() const { return isInstruction(InsnType::isLoad); } - if (!isBranch()) { - // Instruction isn't a branch; if predicted as taken, it will require a - // flush - return {prediction_.taken, instructionAddress_ + 4}; - } - - // Not enough information to determine this was a misprediction - return {false, 0}; -} - -BranchType Instruction::getBranchType() const { return branchType_; } - -int64_t Instruction::getKnownOffset() const { return knownOffset_; } +bool Instruction::isBranch() const { return isInstruction(InsnType::isBranch); } uint16_t Instruction::getGroup() const { uint16_t base = InstructionGroups::INT; - if (isBranch()) return InstructionGroups::BRANCH; - if (isLoad()) return base + 8; - if (isStoreAddress()) return base + 9; - if (isDivide_) return base + 7; - if (isMultiply_) return base + 6; - if (isShift_) return base + 5; - if (isLogical_) return base + 4; - if (isCompare_) return base + 3; + if (isInstruction(InsnType::isFloat)) { + base = InstructionGroups::FLOAT; + } + + if (isInstruction(InsnType::isBranch)) return InstructionGroups::BRANCH; + if (isInstruction(InsnType::isLoad)) return base + 8; + if (isInstruction(InsnType::isStore)) return base + 9; + if (isInstruction(InsnType::isDivide)) return base + 7; + if (isInstruction(InsnType::isMultiply)) return base + 6; + if (isInstruction(InsnType::isShift) || isInstruction(InsnType::isConvert)) + return base + 5; + if (isInstruction(InsnType::isLogical)) return base + 4; + if (isInstruction(InsnType::isCompare)) return base + 3; return base + 2; // Default return is {Data type}_SIMPLE_ARTH } -void Instruction::setExecutionInfo(const executionInfo& info) { - if (isLoad_ || isStore_) { +bool Instruction::canExecute() const 
{ return (sourceOperandsPending_ == 0); } + +const std::vector& Instruction::getSupportedPorts() { + if (supportedPorts_.size() == 0) { + exception_ = InstructionException::NoAvailablePort; + exceptionEncountered_ = true; + } + return supportedPorts_; +} + +void Instruction::setExecutionInfo(const ExecutionInfo& info) { + if (isInstruction(InsnType::isLoad) || isInstruction(InsnType::isStore)) { lsqExecutionLatency_ = info.latency; } else { latency_ = info.latency; @@ -155,15 +150,15 @@ void Instruction::setExecutionInfo(const executionInfo& info) { supportedPorts_ = info.ports; } -const std::vector& Instruction::getSupportedPorts() { - if (supportedPorts_.size() == 0) { - exception_ = InstructionException::NoAvailablePort; - exceptionEncountered_ = true; - } - return supportedPorts_; +const InstructionMetadata& Instruction::getMetadata() const { + return metadata_; } -const InstructionMetadata& Instruction::getMetadata() const { return metadata; } +const Architecture& Instruction::getArchitecture() const { + return architecture_; +} + +InstructionException Instruction::getException() const { return exception_; } } // namespace riscv } // namespace arch diff --git a/src/lib/arch/riscv/InstructionMetadata.cc b/src/lib/arch/riscv/InstructionMetadata.cc index 595f5f6ece..0d31ec00e0 100644 --- a/src/lib/arch/riscv/InstructionMetadata.cc +++ b/src/lib/arch/riscv/InstructionMetadata.cc @@ -1,9 +1,10 @@ #include "InstructionMetadata.hh" -#include #include #include +#include "simeng/arch/riscv/Architecture.hh" + namespace simeng { namespace arch { namespace riscv { @@ -14,7 +15,12 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) implicitSourceCount(insn.detail->regs_read_count), implicitDestinationCount(insn.detail->regs_write_count), operandCount(insn.detail->riscv.op_count) { - std::memcpy(encoding, insn.bytes, sizeof(encoding)); + // Populate 'encoding' field with correct bytes dependent on whether this is a + // compressed instruction + insnLengthBytes_ = insn.size; + std::memset(encoding, 0, 4); + std::memcpy(encoding, insn.bytes, insnLengthBytes_); + // Copy printed output std::strncpy(mnemonic, insn.mnemonic, CS_MNEMONIC_SIZE); operandStr = std::string(insn.op_str); @@ -27,6 +33,7 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) std::memcpy(operands, insn.detail->riscv.operands, sizeof(cs_riscv_op) * operandCount); + convertCompressedInstruction(insn); alterPseudoInstructions(insn); } @@ -36,7 +43,8 @@ InstructionMetadata::InstructionMetadata(const uint8_t* invalidEncoding, opcode(Opcode::RISCV_INSTRUCTION_LIST_END), implicitSourceCount(0), implicitDestinationCount(0), - operandCount(0) { + operandCount(0), + insnLengthBytes_(bytes) { assert(bytes <= sizeof(encoding)); std::memcpy(encoding, invalidEncoding, bytes); mnemonic[0] = '\0'; @@ -56,10 +64,10 @@ void InstructionMetadata::alterPseudoInstructions(const cs_insn& insn) { // ADDI _, _, _-> ADDI x0, x0, 0 // reg set to 1 to reflect capstones 1 indexed output operands[0].type = RISCV_OP_REG; - operands[0].reg = 1; + operands[0].reg = RISCV_REG_ZERO; operands[1].type = RISCV_OP_REG; - operands[1].reg = 1; + operands[1].reg = RISCV_REG_ZERO; operands[2].type = RISCV_OP_IMM; operands[2].imm = 0; @@ -137,7 +145,7 @@ void InstructionMetadata::alterPseudoInstructions(const cs_insn& insn) { // sltz Rd, Rs is pseudo of SLT Rd, Rs, x0 // SLT Rd, Rs, _ -> SLT Rd, Rs, x0 operands[2].type = RISCV_OP_REG; - operands[2].reg = 1; + operands[2].reg = RISCV_REG_ZERO; operandCount = 3; } else if (operandCount == 2 && 
strcmp(mnemonic, "sgtz") == 0) { @@ -152,10 +160,10 @@ void InstructionMetadata::alterPseudoInstructions(const cs_insn& insn) { // ret is pseudo of JALR x0, x1, 0 // JALR _, _, _ -> JALR x0, x1, 0 operands[0].type = RISCV_OP_REG; - operands[0].reg = 1; + operands[0].reg = RISCV_REG_ZERO; operands[1].type = RISCV_OP_REG; - operands[1].reg = 2; + operands[1].reg = RISCV_REG_RA; operands[2].type = RISCV_OP_IMM; operands[2].imm = 0; @@ -165,7 +173,7 @@ void InstructionMetadata::alterPseudoInstructions(const cs_insn& insn) { // jr Rs is pseudo of JALR x0, Rs, 0 // JALR Rs, _, _ -> JALR x0, Rs, 0 operands[0].type = RISCV_OP_REG; - operands[0].reg = 1; + operands[0].reg = RISCV_REG_ZERO; operands[1] = insn.detail->riscv.operands[0]; @@ -177,7 +185,7 @@ void InstructionMetadata::alterPseudoInstructions(const cs_insn& insn) { // jalr Rs is pseudo of JALR x1, Rs, 0 // JALR Rs, _, _ -> JALR x1, Rs, 0 operands[0].type = RISCV_OP_REG; - operands[0].reg = 2; + operands[0].reg = RISCV_REG_RA; operands[1] = insn.detail->riscv.operands[0]; @@ -193,7 +201,7 @@ void InstructionMetadata::alterPseudoInstructions(const cs_insn& insn) { // jal offset is pseudo of JAL x1, offset // JAL offset, _ -> JAL x1, offset operands[0].type = RISCV_OP_REG; - operands[0].reg = 2; + operands[0].reg = RISCV_REG_RA; operands[1].type = RISCV_OP_IMM; operands[1].imm = insn.detail->riscv.operands[0].imm; @@ -203,7 +211,7 @@ void InstructionMetadata::alterPseudoInstructions(const cs_insn& insn) { // j offset is pseudo of JAL x0, offset // JAL offset, _ -> JAL x0, offset operands[0].type = RISCV_OP_REG; - operands[0].reg = 1; + operands[0].reg = RISCV_REG_ZERO; operands[1].type = RISCV_OP_IMM; operands[1].imm = insn.detail->riscv.operands[0].imm; @@ -252,17 +260,180 @@ void InstructionMetadata::alterPseudoInstructions(const cs_insn& insn) { } break; } + + case Opcode::RISCV_CSRRS: { + if (operandCount == 1 && strcmp(mnemonic, "frflags") == 0) { + // frflags Rs is pseudo of CSRRS Rs, fflags, zero (Read FP exception + // flags) CSRRS Rs, _, _ -> CSRRS Rs, fflags, zero + operands[1].type = + RISCV_OP_IMM; // TODO needs to become reg when Capstone updated + operands[1].imm = RISCV_SYSREG_FFLAGS; // fflags address + + operands[2].type = RISCV_OP_REG; + operands[2].reg = RISCV_REG_ZERO; + + operandCount = 3; + } else if (strcmp(mnemonic, "rdinstret") == 0) { + return aliasNYI(); + } else if (strcmp(mnemonic, "rdcycle") == 0) { + return aliasNYI(); + } else if (strcmp(mnemonic, "rdtime") == 0) { + return aliasNYI(); + } else if (strcmp(mnemonic, "csrr") == 0) { + return aliasNYI(); + } else if (strcmp(mnemonic, "csrs") == 0) { + return aliasNYI(); + } else if (strcmp(mnemonic, "frcsr") == 0) { + return aliasNYI(); + } else if (operandCount == 1 && strcmp(mnemonic, "frrm") == 0) { + // frrm Rs is pseudo of CSRRS Rs, frm, zero (Read FP rounding mode) + // CSRRS Rs, _, _ -> CSRRS Rs, frm, zero + operands[1].type = + RISCV_OP_IMM; // TODO needs to become reg when Capstone updated + operands[1].imm = RISCV_SYSREG_FRM; // frm address + + operands[2].type = RISCV_OP_REG; + operands[2].reg = RISCV_REG_ZERO; + + operandCount = 3; + } + break; + } + case Opcode::RISCV_CSRRW: { + if (operandCount == 1 && strcmp(mnemonic, "fsflags") == 0) { + // fsflags Rs is pseudo of CSRRW zero, fflags, rs (Write FP exception + // flags) + // CSRRW Rs, _, _ -> CSRRW zero, fflags, Rs + operands[2] = operands[0]; + + operands[0].type = RISCV_OP_REG; + operands[0].reg = RISCV_REG_ZERO; + + operands[1].type = + RISCV_OP_IMM; // TODO needs to become reg when Capstone 
updated + operands[1].imm = RISCV_SYSREG_FFLAGS; // fflags address + + operandCount = 3; + } else if (operandCount == 2 && strcmp(mnemonic, "fsflags") == 0) { + // fsflags R1, R2 is pseudo of CSRRW r1, fflags, rs (Write FP exception + // flags) + // CSRRW R1, R2, _ -> CSRRW R1, fflags, R2 + operands[2] = operands[1]; + + operands[1].type = + RISCV_OP_IMM; // TODO needs to become reg when Capstone updated + operands[1].imm = RISCV_SYSREG_FFLAGS; // fflags address + + operandCount = 3; + } else if (strcmp(mnemonic, "csrw") == 0) { + return aliasNYI(); + } else if (operandCount == 1 && strcmp(mnemonic, "fscsr") == 0) { + return aliasNYI(); + } else if (operandCount == 2 && strcmp(mnemonic, "fscsr") == 0) { + return aliasNYI(); + // 2 pseudoinstructions with same name but different number of registers + } else if (operandCount == 1 && strcmp(mnemonic, "fsrm") == 0) { + // fsrm Rs is pseudo of CSRRW zero, frm, rs (Write FP rounding mode) + // CSRRW Rs, _, _ -> CSRRW zero, frm, Rs + operands[2] = operands[0]; + + operands[0].type = RISCV_OP_REG; + operands[0].reg = RISCV_REG_ZERO; + + operands[1].type = + RISCV_OP_IMM; // TODO needs to become reg when Capstone updated + operands[1].imm = RISCV_SYSREG_FRM; // frm address + + operandCount = 3; + } else if (operandCount == 2 && strcmp(mnemonic, "fsrm") == 0) { + // fsrm R1, R2 is pseudo of CSRRW R1, frm, R2 (Write FP rounding mode) + // CSRRW R1, R2, _ -> CSRRW R1, frm, R2 + operands[2] = operands[1]; + + operands[1].type = RISCV_OP_IMM; + operands[1].imm = RISCV_SYSREG_FRM; + + operandCount = 3; + } + break; + } + + case Opcode::RISCV_FSGNJ_S: { + if (operandCount == 2 && strcmp(mnemonic, "fmv.s") == 0) { + // fmv.s rd, rs is pseudo of fsgnj.s rd, rs, rs (Copy single-precision + // register) + // fsgnj.s Rd, Rs, _ -> fsgnj.s Rd, Rs, Rs + operands[2] = operands[1]; + operandCount = 3; + } + break; + } + case Opcode::RISCV_FSGNJX_S: { + if (operandCount == 2 && strcmp(mnemonic, "fabs.s") == 0) { + // fabs.s rd, rs is pseudo of fsgnjx.s rd, rs, rs (Single-precision + // absolute value) + // fsgnjx.s rd, rs, _ -> fsgnjx.s rd, rs, rs + operands[2] = operands[1]; + operandCount = 3; + } + break; + } + case Opcode::RISCV_FSGNJN_S: { + if (operandCount == 2 && strcmp(mnemonic, "fneg.s") == 0) { + // fneg.s rd, rs is pseudo of fsgnjn.s rd, rs, rs (Single-precision + // negate) + // fsgnjn.s rd, rs, _ -> fsgnjn.s rd, rs, rs + operands[2] = operands[1]; + operandCount = 3; + } + break; + } + + case Opcode::RISCV_FSGNJ_D: { + if (operandCount == 2 && strcmp(mnemonic, "fmv.d") == 0) { + // fmv.d rd, rs is pseudo of fsgnj.d rd, rs, rs (Copy double-precision + // register) + // fsgnj.d Rd, Rs, _ -> fsgnj.d Rd, Rs, Rs + operands[2] = operands[1]; + operandCount = 3; + } + break; + } + case Opcode::RISCV_FSGNJX_D: { + if (operandCount == 2 && strcmp(mnemonic, "fabs.d") == 0) { + // fabs.d rd, rs is pseudo of fsgnjx.d rd, rs, rs (Double-precision + // absolute value) + // fsgnjx.d rd, rs, _ -> fsgnjx.d rd, rs, rs + operands[2] = operands[1]; + operandCount = 3; + } + break; + } + case Opcode::RISCV_FSGNJN_D: { + // fneg.d rd, rs, fsgnjn.d rd, rs, rs, Double-precision negate + if (operandCount == 2 && strcmp(mnemonic, "fneg.d") == 0) { + // fneg.d rd, rs is pseudo of fsgnjn.d rd, rs, rs (Double-precision + // negate) + // fsgnjn.d rd, rs, _ -> fsgnjn.d rd, rs, rs + operands[2] = operands[1]; + operandCount = 3; + } + break; + } + } } -void InstructionMetadata::aliasNYI() { id = RISCV_INS_INVALID; } +void InstructionMetadata::aliasNYI() { + 
metadataExceptionEncountered_ = true; + metadataException_ = InstructionException::AliasNotYetImplemented; +} void InstructionMetadata::includeZeroRegisterPosOne() { // Given register sequence {Op_a, Op_b , _} return {Op_a, x0, Op_b} operands[2] = operands[1]; operands[1].type = RISCV_OP_REG; - operands[1].reg = 1; + operands[1].reg = RISCV_REG_ZERO; operandCount = 3; } @@ -273,11 +444,470 @@ void InstructionMetadata::includeZeroRegisterPosZero() { operands[1] = operands[0]; operands[0].type = RISCV_OP_REG; - operands[0].reg = 1; + operands[0].reg = RISCV_REG_ZERO; operandCount = 3; } +void InstructionMetadata::duplicateFirstOp() { + // Given register sequence {Op_a, Op_b, _} return {Op_a, Op_a, Op_b} + operands[2] = operands[1]; + operands[1] = operands[0]; + + operandCount = 3; +} + +void InstructionMetadata::convertCompressedInstruction(const cs_insn& insn) { + if (insnLengthBytes_ != 2) { + return; + } + + switch (insn.opcode) { + case Opcode::RISCV_C_JR: + // jalr x0, 0(rs1) + // C.JR rs1, _, _ -> JALR x0, rs1, 0 + + // rs1=zero is reserved + if (operands[0].type != RISCV_OP_REG || + operands[0].reg == RISCV_REG_ZERO) { + illegalAlias("C.JR has rs1=x0 which is reserved"); + } + + opcode = Opcode::RISCV_JALR; + + operands[0].type = RISCV_OP_REG; + operands[0].reg = RISCV_REG_ZERO; + + operands[1] = insn.detail->riscv.operands[0]; + + operands[2].type = RISCV_OP_IMM; + operands[2].imm = 0; + + operandCount = 3; + + break; + case Opcode::RISCV_C_MV: + // add rd, x0, rs2 + // C.MV rd, rs2, _ -> ADD rd, zero, rs2 + + // rs2 != zero and rd == zero are hints + + // rs2 = zero corresponds to C.JR + if (operands[1].type != RISCV_OP_REG || + operands[1].reg == RISCV_REG_ZERO) { + illegalAlias("C.MV has rs2=x0 which is invalid"); + } + + opcode = Opcode::RISCV_ADD; + + includeZeroRegisterPosOne(); + + break; + case Opcode::RISCV_C_LDSP: { + // TODO valid for RV64 only. Make this check once RV32 implemented + // ld rd, offset[8:3](x2) + // offset is immediate scaled by 8. Capstone does scaling for us + + // rd = zero is reserved + if (operands[0].type != RISCV_OP_REG || + operands[0].reg == RISCV_REG_ZERO) { + illegalAlias("C.LDSP has rd=x0 which is reserved"); + } + + opcode = Opcode::RISCV_LD; + + break; + } + case Opcode::RISCV_C_ADDI4SPN: + // addi rd ′ , x2, nzuimm[9:2] + + // nzuimm = zero is reserved + if (operands[2].type != RISCV_OP_IMM || operands[2].imm == 0) { + illegalAlias("C.ADDI4SPN has nzuimm=0 which is reserved"); + } + + opcode = Opcode::RISCV_ADDI; + // All operands correct + break; + case Opcode::RISCV_C_LI: + // addi rd, x0, imm[5:0] + // C.LI rd, imm, _ -> addi rd, zero, imm + + // rd = zero encodes hints + + opcode = Opcode::RISCV_ADDI; + + includeZeroRegisterPosOne(); + + break; + case Opcode::RISCV_C_ADDI16SP: + // Opcode shared with C.LUI but has Rd = x2 + // addi x2, x2, nzimm[9:4] + // C.ADDI16SP sp, imm, _ -> addi sp, sp, imm + + // nzimm = zero is reserved + if (operands[1].type != RISCV_OP_IMM || operands[1].imm == 0) { + illegalAlias("C.ADDI16SP has nzimm=0 which is reserved"); + } + + opcode = Opcode::RISCV_ADDI; + + duplicateFirstOp(); + + break; + case Opcode::RISCV_C_SLLI: + // slli rd, rd, shamt[5:0] + // + // "For RV32C, shamt[5] must be zero; the code points with shamt[5]=1 are + // reserved for custom extensions. For RV32C and RV64C, the shift amount + // must be non-zero; the code points with shamt=0 are HINTs. For all base + // ISAs, the code points with rd=x0 are HINTs, except those with + // shamt[5]=1 in RV32C." 
- Spec page 107 + // + // C.SLLI rd, shamt, _ -> slli rd, rd, shamt + + // shamt = zero is reserved for hints + // rd = zero encodes hints + + opcode = Opcode::RISCV_SLLI; + + duplicateFirstOp(); + + break; + case Opcode::RISCV_C_SDSP: { + // TODO rv64 ONLY, make check for this once RV32 implemented + // sd rs2, offset[8:3](x2) + + opcode = Opcode::RISCV_SD; + + break; + } + case Opcode::RISCV_C_SWSP: { + // sw rs2, offset[7:2](x2) + opcode = Opcode::RISCV_SW; + + break; + } + case Opcode::RISCV_C_ADD: + // add rd, rd, rs2 + // + // "code points with rs2=x0 correspond + // to the C.JALR and C.EBREAK + // instructions. The code points with + // rs2̸=x0 and rd=x0 are HINTs." - Spec page 108 + // + // C.ADD rd, rs2, _ -> add rd, rd, rs2 + + // rs2 = zero corresponds to C.JALR and C.EBREAK + if (operands[1].type != RISCV_OP_REG || + operands[1].reg == RISCV_REG_ZERO) { + illegalAlias("C.ADD has rs2=x0 which is invalid"); + } + + // rs2 != zero AND rd = zero are reserved for hints + + opcode = Opcode::RISCV_ADD; + + duplicateFirstOp(); + + break; + case Opcode::RISCV_C_LD: { + // TODO rv64 ONLY, make check for this once RV32 implemented + // ld rd ′ , offset[7:3](rs1 ′) + + opcode = Opcode::RISCV_LD; + + break; + } + case Opcode::RISCV_C_ADDI: { + // addi rd, rd, nzimm[5:0] + // C.ADDI rd, imm, _ -> addi rd, rd, imm + + // rd = zero encodes C.NOP + if (operands[0].type != RISCV_OP_REG || + operands[0].reg == RISCV_REG_ZERO) { + illegalAlias("C.ADDI has rd=x0 which is invalid"); + } + + // nzimm = zero is reserved for hints + + opcode = Opcode::RISCV_ADDI; + + duplicateFirstOp(); + + break; + } + case Opcode::RISCV_C_BNEZ: + // bne rs1 ′ , x0, offset[8:1] + // C.BNEZ rs1, imm, _ -> bne rs1, zero, imm + opcode = Opcode::RISCV_BNE; + + includeZeroRegisterPosOne(); + + break; + case Opcode::RISCV_C_SD: { + // TODO rv64 ONLY, make check for this once RV32 implemented + // sd rs2 ′ , offset[7:3](rs1 ′) + + opcode = Opcode::RISCV_SD; + + break; + } + case Opcode::RISCV_C_BEQZ: + // beq rs1 ′ , x0, offset[8:1] + // C.BEQZ rs1, imm, _ -> beq rs1, zero, imm + opcode = Opcode::RISCV_BEQ; + + includeZeroRegisterPosOne(); + + break; + case Opcode::RISCV_C_ANDI: + // andi rd ′, rd ′ , imm[5:0] + // C.ANDI rd, imm, _ -> andi rd, rd, imm + opcode = Opcode::RISCV_ANDI; + + duplicateFirstOp(); + + break; + case Opcode::RISCV_C_LUI: + // lui rd, nzimm[17:12] + + // nzimm = zero is reserved + if (operands[1].type != RISCV_OP_IMM || operands[1].imm == 0) { + illegalAlias("C.LUI has nzimm=0 which is reserved"); + } + + // rd = zero is reserved for hints + + // rd = x2 encodes C.ADDI16SP + if (operands[0].type != RISCV_OP_REG || operands[0].reg == RISCV_REG_SP) { + illegalAlias("C.LUI has rd=x2 which is invalid"); + } + + opcode = Opcode::RISCV_LUI; + // All operands correct + break; + case Opcode::RISCV_C_LWSP: { + // lw rd, offset[7:2](x2) + + // rd = zero is reserved + if (operands[0].type != RISCV_OP_REG || + operands[0].reg == RISCV_REG_ZERO) { + illegalAlias("C.LWSP has rd=x0 which is reserved"); + } + + opcode = Opcode::RISCV_LW; + + break; + } + case Opcode::RISCV_C_FLDSP: + // TODO RV32DC/RV64DC-only once RV32 implemented + // fld rd, offset[8:3](x2) + opcode = Opcode::RISCV_FLD; + + break; + case Opcode::RISCV_C_SW: { + // sw rs2 ′, offset[6:2](rs1 ′) + + opcode = Opcode::RISCV_SW; + + break; + } + case Opcode::RISCV_C_J: + // jal x0, offset[11:1] + // C.J imm, _ -> jal zero, imm + opcode = Opcode::RISCV_JAL; + + operands[1] = operands[0]; + + operands[0].type = RISCV_OP_REG; + operands[0].reg = 
RISCV_REG_ZERO; + + operandCount = 2; + + break; + case Opcode::RISCV_C_ADDIW: + // TODO rv64 ONLY, make check for this once RV32 implemented + // addiw rd, rd, imm[5:0] + // C.ADDIW rd, imm, _ -> addiw rd, rd, imm + + // "The immediate can be zero for C.ADDIW, where this corresponds to + // [pseudoinstruction] sext.w rd" - Spec page 106 + // rd = zero is reserved + if (operands[0].type != RISCV_OP_REG || + operands[0].reg == RISCV_REG_ZERO) { + illegalAlias("C.ADDIW has rd=x0 which is reserved"); + } + + opcode = Opcode::RISCV_ADDIW; + + duplicateFirstOp(); + + break; + case Opcode::RISCV_C_SUB: + // sub rd ′ , rd ′ , rs2 ′ + // C.SUB rd, rs2, -> sub rd, rd, rs2 + opcode = Opcode::RISCV_SUB; + + duplicateFirstOp(); + + break; + case Opcode::RISCV_C_LW: + // lw rd ′ , offset[6:2](rs1 ′ ) + + opcode = Opcode::RISCV_LW; + + break; + case Opcode::RISCV_C_SRLI: + // srli rd ′ , rd ′ , shamt[5:0] + // C.SRLI rd, imm, _ -> srli rd, rd, imm + + // shamt = zero is reserved for hints + + opcode = Opcode::RISCV_SRLI; + + duplicateFirstOp(); + + break; + case Opcode::RISCV_C_ADDW: + // TODO rv64 ONLY, make check for this once RV32 implemented + // addw rd ′ , rd ′ , rs2 ′ + // C.ADDW rd, rs2, _ -> addw rd, rd, rs2 + opcode = Opcode::RISCV_ADDW; + + duplicateFirstOp(); + + break; + case Opcode::RISCV_C_AND: + // and rd ′ , rd ′ , rs2 ′ + // C.AND rd, rs2, _ -> and rd, rd, rs2 + opcode = Opcode::RISCV_AND; + + duplicateFirstOp(); + + break; + case Opcode::RISCV_C_OR: + // or rd ′ , rd ′ , rs2 ′ + // C.OR rd, rs2, _ -> or rd, rd, rs2 + + opcode = Opcode::RISCV_OR; + + duplicateFirstOp(); + + break; + case Opcode::RISCV_C_JALR: + // jalr x1, 0(rs1) + // C.JALR rs1, _, _ -> jalr x1, rs1, 0 + + // rs1=zero corresponds to C.EBREAK instruction + if (operands[0].type != RISCV_OP_REG || + operands[0].reg == RISCV_REG_ZERO) { + illegalAlias("C.JALR has rs1=x0 which is invalid"); + } + + opcode = Opcode::RISCV_JALR; + + operands[1] = operands[0]; + + operands[0].reg = RISCV_REG_RA; + + operands[2].type = RISCV_OP_IMM; + operands[2].imm = 0; + + operandCount = 3; + + break; + case Opcode::RISCV_C_XOR: + // xor rd ′ , rd ′ , rs2 ′ + // C.XOR rd, rs2, _ -> xor rd, rd, rs2 + + opcode = Opcode::RISCV_XOR; + + duplicateFirstOp(); + + break; + case Opcode::RISCV_C_SRAI: + // srai rd ′ , rd ′ , shamt[5:0] + // C.SRAI rd, imm, _ -> srai rd, rd, imm + + // shamt = zero is reserved for hints + + opcode = Opcode::RISCV_SRAI; + + duplicateFirstOp(); + + break; + case Opcode::RISCV_C_FSD: + // TODO rv64dc ONLY, make check for this once RV32 implemented + // fsd rs2 ′, offset[7:3](rs1 ′) + + opcode = Opcode::RISCV_FSD; + + break; + case Opcode::RISCV_C_FLD: + // TODO rv64dc ONLY, make check for this once RV32 implemented + // fld rd ′, offset[7:3](rs1 ′) + + opcode = Opcode::RISCV_FLD; + + break; + case Opcode::RISCV_C_FSDSP: + // TODO rv64dc ONLY, make check for this once RV32 implemented + // fsd rs2, offset[8:3](x2) + + opcode = Opcode::RISCV_FSD; + + break; + case Opcode::RISCV_C_SUBW: + // TODO rv64 ONLY, make check for this once RV32 implemented + // subw rd ′ , rd ′ , rs2 ′ + // C.SUBW rd, rs2, _ -> subw rd, rd, rs2 + + opcode = Opcode::RISCV_SUBW; + + duplicateFirstOp(); + + break; + case Opcode::RISCV_C_NOP: + // nop + // C.NOP _, _, _-> addi x0, x0, 0 + + // TODO imm != zero is reserved for hints. 
Capstone doesn't give this + // value so can't be checked + + opcode = Opcode::RISCV_ADDI; + + // Duplicate implementation of nop pseudoinstruction + operands[0].type = RISCV_OP_REG; + operands[0].reg = RISCV_REG_ZERO; + + operands[1].type = RISCV_OP_REG; + operands[1].reg = RISCV_REG_ZERO; + + operands[2].type = RISCV_OP_IMM; + operands[2].imm = 0; + + operandCount = 3; + + break; + case Opcode::RISCV_C_EBREAK: + // ebreak + + opcode = Opcode::RISCV_EBREAK; + + break; + default: + // Unimplemented compressed instruction, raise exception + aliasNYI(); + break; + } +} +void InstructionMetadata::illegalAlias(std::string info) { + metadataExceptionEncountered_ = true; + metadataException_ = InstructionException::IllegalInstruction; + exceptionString_ = info; +} + } // namespace riscv } // namespace arch } // namespace simeng \ No newline at end of file diff --git a/src/lib/arch/riscv/InstructionMetadata.hh b/src/lib/arch/riscv/InstructionMetadata.hh index af5bebf815..d5f2e81edb 100644 --- a/src/lib/arch/riscv/InstructionMetadata.hh +++ b/src/lib/arch/riscv/InstructionMetadata.hh @@ -2,7 +2,8 @@ #include -#include "capstone/capstone.h" +#include "simeng/arch/riscv/Architecture.hh" +#include "simeng/arch/riscv/Instruction.hh" namespace simeng { namespace arch { @@ -24,6 +25,22 @@ struct InstructionMetadata { /** Constructs an invalid metadata object containing the invalid encoding. */ InstructionMetadata(const uint8_t* invalidEncoding, uint8_t bytes = 4); + /* Returns the current exception state of the metadata */ + InstructionException getMetadataException() const { + return metadataException_; + } + + /* Returns a bool stating whether an exception has been encountered. */ + bool getMetadataExceptionEncountered() const { + return metadataExceptionEncountered_; + } + + /* Return extra information about the exception */ + std::string getExceptionString() const { return exceptionString_; } + + /* Returns the length of the instruction in bytes. */ + uint8_t getInsnLength() const { return insnLengthBytes_; } + /** The maximum operand string length as defined in Capstone */ static const size_t MAX_OPERAND_STR_LENGTH = sizeof(cs_insn::op_str) / sizeof(char); @@ -52,21 +69,25 @@ struct InstructionMetadata { /** The instruction's mnemonic. */ char mnemonic[CS_MNEMONIC_SIZE]; + /** The remainder of the instruction's assembly representation. */ std::string operandStr; /** The implicitly referenced registers. */ uint16_t implicitSources[MAX_IMPLICIT_SOURCES]; + /** The number of implicitly referenced registers. */ uint8_t implicitSourceCount; /** The implicitly referenced destination registers. */ uint16_t implicitDestinations[MAX_IMPLICIT_DESTINATIONS]; + /** The number of implicitly referenced destination registers. */ uint8_t implicitDestinationCount; /** The explicit operands. */ cs_riscv_op operands[MAX_OPERANDS]; + /** The number of explicit operands. */ uint8_t operandCount; @@ -75,9 +96,17 @@ struct InstructionMetadata { * instruction. */ void alterPseudoInstructions(const cs_insn& insn); - /** Flag the instruction as invalid due to a detected unsupported alias. */ + /** Detect compressed instructions and update metadata to match the + * non-compressed instruction expansion. */ + void convertCompressedInstruction(const cs_insn& insn); + + /** Flag the instruction as aliasNYI due to a detected unsupported alias. 
*/ void aliasNYI(); + /** Flag the instruction as illegal and provide some extra information via a + * string */ + void illegalAlias(std::string info); + /** RISC-V helper function * Use register zero as operands[1] and immediate value as operands[2] */ void includeZeroRegisterPosOne(); @@ -85,6 +114,27 @@ struct InstructionMetadata { /** RISC-V helper function * Use register zero as operands[0] and immediate value as operands[2] */ void includeZeroRegisterPosZero(); + + /** RISC-V helper function + * Duplicate operands[0] and move operands[1] to operands[2] */ + void duplicateFirstOp(); + + /** RISC-V helper function + * Combine operands[1] and operands[2] which are of type imm and reg + * respectively into a single mem type operand */ + void createMemOpPosOne(); + + /** The current exception state of this instruction. */ + InstructionException metadataException_ = InstructionException::None; + + /** Whether an exception has been encountered. */ + bool metadataExceptionEncountered_ = false; + + /** Additional information to print to the user */ + std::string exceptionString_ = ""; + + /** The length of the instruction encoding in bytes. */ + uint8_t insnLengthBytes_; }; } // namespace riscv diff --git a/src/lib/arch/riscv/Instruction_address.cc b/src/lib/arch/riscv/Instruction_address.cc index e893ce3644..0d6bbb9843 100644 --- a/src/lib/arch/riscv/Instruction_address.cc +++ b/src/lib/arch/riscv/Instruction_address.cc @@ -7,25 +7,53 @@ namespace simeng { namespace arch { namespace riscv { -span Instruction::generateAddresses() { - assert((isLoad() || isStoreAddress()) && - "generateAddresses called on non-load-or-store instruction"); +span Instruction::generateAddresses() { + assert( + (isInstruction(InsnType::isLoad) || isInstruction(InsnType::isStore)) && + "generateAddresses called on non-load-or-store instruction"); uint64_t address; - if (isLoad() && isStoreAddress() && isAtomic()) { + if (isInstruction(InsnType::isLoad) && isInstruction(InsnType::isStore) && + isInstruction(InsnType::isAtomic)) { // Atomics - address = operands[1].get(); - } else if (isLoad()) { - address = operands[0].get() + metadata.operands[1].mem.disp; + // Metadata operands[2] corresponds to instruction sourceRegValues[1] + assert(metadata_.operands[2].type == RISCV_OP_MEM && + "metadata_ operand not of correct type during RISC-V address " + "generation"); + address = sourceValues_[1].get(); + } else if (isInstruction(InsnType::isLoad) && + isInstruction(InsnType::isAtomic)) { + // Load reserved + // Metadata operands[1] corresponds to instruction sourceRegValues[0] + assert(metadata_.operands[1].type == RISCV_OP_MEM && + "metadata_ operand not of correct type during RISC-V address " + "generation"); + address = sourceValues_[0].get(); + } else if (isInstruction(InsnType::isStore) && + isInstruction(InsnType::isAtomic)) { + // Store conditional + assert(metadata_.operands[2].type == RISCV_OP_MEM && + "metadata_ operand not of correct type during RISC-V address " + "generation"); + address = sourceValues_[1].get(); + } else if (isInstruction(InsnType::isLoad)) { + assert(metadata_.operands[1].type == RISCV_OP_MEM && + "metadata_ operand not of correct type during RISC-V address " + "generation"); + address = sourceValues_[0].get() + sourceImm_; } else { - address = operands[1].get() + metadata.operands[1].mem.disp; + assert((metadata_.operands[1].type == RISCV_OP_MEM) && + "metadata_ operand not of correct type during RISC-V address " + "generation"); + + address = sourceValues_[1].get() + sourceImm_; } // 
Atomics - if (Opcode::RISCV_AMOADD_D <= metadata.opcode && - metadata.opcode <= Opcode::RISCV_AMOXOR_W_RL) { // Atomics + if (Opcode::RISCV_AMOADD_D <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_AMOXOR_W_RL) { // Atomics // THIS IS DEPENDENT ON CAPSTONE ENCODING AND COULD BREAK IF CHANGED - int size = ((metadata.opcode - 182) / 4) % 2; // 1 = Word, 0 = Double + int size = ((metadata_.opcode - 182) / 4) % 2; // 1 = Word, 0 = Double if (size == 1) { // Word setMemoryAddresses({{address, 4}}); @@ -36,10 +64,14 @@ span Instruction::generateAddresses() { return getGeneratedAddresses(); } - switch (metadata.opcode) { + switch (metadata_.opcode) { case Opcode::RISCV_SD: [[fallthrough]]; - case Opcode::RISCV_LD: { + case Opcode::RISCV_LD: + [[fallthrough]]; + case Opcode::RISCV_FSD: + [[fallthrough]]; + case Opcode::RISCV_FLD: { setMemoryAddresses({{address, 8}}); break; } @@ -47,7 +79,11 @@ span Instruction::generateAddresses() { [[fallthrough]]; case Opcode::RISCV_LW: [[fallthrough]]; - case Opcode::RISCV_LWU: { + case Opcode::RISCV_LWU: + [[fallthrough]]; + case Opcode::RISCV_FSW: + [[fallthrough]]; + case Opcode::RISCV_FLW: { setMemoryAddresses({{address, 4}}); break; } @@ -76,7 +112,7 @@ span Instruction::generateAddresses() { case Opcode::RISCV_LR_W_RL: [[fallthrough]]; case Opcode::RISCV_LR_W_AQ_RL: { - setMemoryAddresses({{operands[0].get(), 4}}); + setMemoryAddresses({{sourceValues_[0].get(), 4}}); break; } case Opcode::RISCV_LR_D: @@ -86,7 +122,7 @@ span Instruction::generateAddresses() { case Opcode::RISCV_LR_D_RL: [[fallthrough]]; case Opcode::RISCV_LR_D_AQ_RL: { - setMemoryAddresses({{operands[0].get(), 8}}); + setMemoryAddresses({{sourceValues_[0].get(), 8}}); break; } case Opcode::RISCV_SC_W: @@ -96,7 +132,7 @@ span Instruction::generateAddresses() { case Opcode::RISCV_SC_W_RL: [[fallthrough]]; case Opcode::RISCV_SC_W_AQ_RL: { - setMemoryAddresses({{operands[1].get(), 4}}); + setMemoryAddresses({{sourceValues_[1].get(), 4}}); break; } case Opcode::RISCV_SC_D: @@ -106,7 +142,7 @@ span Instruction::generateAddresses() { case Opcode::RISCV_SC_D_RL: [[fallthrough]]; case Opcode::RISCV_SC_D_AQ_RL: { - setMemoryAddresses({{operands[1].get(), 8}}); + setMemoryAddresses({{sourceValues_[1].get(), 8}}); break; } default: diff --git a/src/lib/arch/riscv/Instruction_decode.cc b/src/lib/arch/riscv/Instruction_decode.cc index a058a2bfe9..e8145d4c11 100644 --- a/src/lib/arch/riscv/Instruction_decode.cc +++ b/src/lib/arch/riscv/Instruction_decode.cc @@ -19,6 +19,32 @@ namespace riscv { Register csRegToRegister(unsigned int reg) { // Check from top of the range downwards + // Metadata could produce either 64-bit floating point register or 32-bit + // floating point register. Map both encodings to the same SimEng register. + // Only 64-bit registers are supported + + // Modulus ensures only 64 bit registers are recognised + if (RISCV_REG_F31_64 >= reg && reg >= RISCV_REG_F0_64 && reg % 2 == 0) { + // Register ft0.64 has encoding 34 with subsequent encodings interleaved + // with 32 bit floating point registers. 
See + // capstone-lib-src/include/riscv.h + // Division always results in integer as reg is required to be even by if + // condition and ft0.64 is also even + return {RegisterType::FLOAT, + static_cast((reg - RISCV_REG_F0_64) / 2)}; + } + + // Modulus ensures only 32 bit registers are recognised + if (RISCV_REG_F31_32 >= reg && reg >= RISCV_REG_F0_32 && reg % 2 == 1) { + // Register ft0.32 has encoding 33 with subsequent encodings interleaved + // with 64 bit floating point registers. See + // capstone-lib-src/include/riscv.h + // Division always results in integer as reg is required to be odd by if + // condition and ft0.32 is also odd + return {RegisterType::FLOAT, + static_cast((reg - RISCV_REG_F0_32) / 2)}; + } + if (RISCV_REG_X31 >= reg && reg >= RISCV_REG_X1) { // Capstone produces 1 indexed register operands return {RegisterType::GENERAL, static_cast(reg - 1)}; @@ -26,7 +52,7 @@ Register csRegToRegister(unsigned int reg) { if (reg == RISCV_REG_X0) { // Zero register - return Instruction::ZERO_REGISTER; + return RegisterType::ZERO_REGISTER; } assert(false && "Decoding failed due to unknown register identifier"); @@ -34,51 +60,18 @@ Register csRegToRegister(unsigned int reg) { std::numeric_limits::max()}; } -/** Invalidate instructions that are currently not yet implemented. This - prevents errors during speculated branches with unknown destinations; - non-executable assertions. memory is decoded into valid but not implemented - instructions tripping assertions. - TODO remove once all extensions are supported*/ -void Instruction::invalidateIfNotImplemented() { - if (metadata.opcode >= Opcode::RISCV_ADD && - metadata.opcode <= Opcode::RISCV_BNE) - return; - if (metadata.opcode >= Opcode::RISCV_DIV && - metadata.opcode <= Opcode::RISCV_ECALL) - return; - if (metadata.opcode >= Opcode::RISCV_JAL && - metadata.opcode <= Opcode::RISCV_SD) - return; - if (metadata.opcode >= Opcode::RISCV_SH && - metadata.opcode <= Opcode::RISCV_SRAW) - return; - if (metadata.opcode >= Opcode::RISCV_SRL && - metadata.opcode <= Opcode::RISCV_SW) - return; - if (metadata.opcode >= Opcode::RISCV_XOR && - metadata.opcode <= Opcode::RISCV_XORI) - return; - if (metadata.opcode == Opcode::RISCV_FENCE) return; - - exception_ = InstructionException::EncodingUnallocated; - exceptionEncountered_ = true; - return; -} - /****************** * DECODING LOGIC *****************/ void Instruction::decode() { - invalidateIfNotImplemented(); - if (exceptionEncountered_) return; - if (metadata.id == RISCV_INS_INVALID) { + if (metadata_.id == RISCV_INS_INVALID) { exception_ = InstructionException::EncodingUnallocated; exceptionEncountered_ = true; return; } // Identify branches - switch (metadata.opcode) { + switch (metadata_.opcode) { case Opcode::RISCV_BEQ: case Opcode::RISCV_BNE: case Opcode::RISCV_BLT: @@ -87,7 +80,7 @@ void Instruction::decode() { case Opcode::RISCV_BGEU: case Opcode::RISCV_JAL: case Opcode::RISCV_JALR: - isBranch_ = true; + setInstructionType(InsnType::isBranch); break; // Identify loads/stores case Opcode::RISCV_LR_D: @@ -98,7 +91,7 @@ void Instruction::decode() { case Opcode::RISCV_LR_W_AQ: case Opcode::RISCV_LR_W_RL: case Opcode::RISCV_LR_W_AQ_RL: - isAtomic_ = true; + setInstructionType(InsnType::isAtomic); [[fallthrough]]; case Opcode::RISCV_LB: case Opcode::RISCV_LBU: @@ -107,7 +100,9 @@ void Instruction::decode() { case Opcode::RISCV_LW: case Opcode::RISCV_LWU: case Opcode::RISCV_LD: - isLoad_ = true; + case Opcode::RISCV_FLW: + case Opcode::RISCV_FLD: + setInstructionType(InsnType::isLoad); break; 
case Opcode::RISCV_SC_D: case Opcode::RISCV_SC_D_AQ: @@ -117,27 +112,29 @@ void Instruction::decode() { case Opcode::RISCV_SC_W_AQ: case Opcode::RISCV_SC_W_RL: case Opcode::RISCV_SC_W_AQ_RL: - isAtomic_ = true; + setInstructionType(InsnType::isAtomic); [[fallthrough]]; case Opcode::RISCV_SB: case Opcode::RISCV_SW: case Opcode::RISCV_SH: case Opcode::RISCV_SD: - isStore_ = true; + case Opcode::RISCV_FSW: + case Opcode::RISCV_FSD: + setInstructionType(InsnType::isStore); break; } - if (Opcode::RISCV_AMOADD_D <= metadata.opcode && - metadata.opcode <= Opcode::RISCV_AMOXOR_W_RL) { + if (Opcode::RISCV_AMOADD_D <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_AMOXOR_W_RL) { // Atomics: both load and store - isLoad_ = true; - isStore_ = true; - isAtomic_ = true; + setInstructionType(InsnType::isLoad); + setInstructionType(InsnType::isStore); + setInstructionType(InsnType::isAtomic); } - // Extract explicit register accesses, ignore immediates until execute - for (size_t i = 0; i < metadata.operandCount; i++) { - const auto& op = metadata.operands[i]; + // Extract explicit register accesses and immediates + for (size_t i = 0; i < metadata_.operandCount; i++) { + const auto& op = metadata_.operands[i]; // First operand is always of REG type but could be either source or // destination @@ -145,116 +142,159 @@ void Instruction::decode() { // If opcode is branch or store (but not atomic or jump) the first operand // is a source register, for all other instructions the first operand is a // destination register - if ((isBranch() && metadata.opcode != Opcode::RISCV_JAL && - metadata.opcode != Opcode::RISCV_JALR) || - (isStoreAddress() && !isAtomic())) { - sourceRegisters[sourceRegisterCount] = csRegToRegister(op.reg); + if ((isInstruction(InsnType::isBranch) && + metadata_.opcode != Opcode::RISCV_JAL && + metadata_.opcode != Opcode::RISCV_JALR) || + (isInstruction(InsnType::isStore) && + !isInstruction(InsnType::isAtomic))) { + sourceRegisters_[sourceRegisterCount_] = csRegToRegister(op.reg); - if (sourceRegisters[sourceRegisterCount] == - Instruction::ZERO_REGISTER) { + if (sourceRegisters_[sourceRegisterCount_] == + RegisterType::ZERO_REGISTER) { // Catch zero register references and pre-complete those operands - operands[sourceRegisterCount] = RegisterValue(0, 8); + sourceValues_[sourceRegisterCount_] = RegisterValue(0, 8); } else { - operandsPending++; + sourceOperandsPending_++; } - sourceRegisterCount++; + sourceRegisterCount_++; } else { - if (csRegToRegister(op.reg) != Instruction::ZERO_REGISTER) { - destinationRegisters[destinationRegisterCount] = + /** + * Register writes to x0 are discarded so no destination register is + * set. + * + * While the execution stage may still write to the results array, + * when Instruction::getResults and + * Instruction::getDestinationRegisters are called during writeback, + * zero sized spans are returned (determined by the value of + * destinationRegisterCount). This in turn means no register update is + * performed. + * + * TODO this will break if there are more than 2 destination registers + * with one being the zero register e.g. if an instruction implicitly + * writes to a system register. The current implementation could mean + * that the second result is discarded + * + */ + if (csRegToRegister(op.reg) != RegisterType::ZERO_REGISTER) { + destinationRegisters_[destinationRegisterCount_] = csRegToRegister(op.reg); - destinationRegisterCount++; - } else { - /** - * Register writes to x0 are discarded so no destination register is - * set. 
- * - * While the execution stage may still write to the results array, - * when Instruction::getResults and - * Instruction::getDestinationRegisters are called during writeback, - * zero sized spans are returned (determined by the value of - * destinationRegisterCount). This in turn means no register update is - * performed. - * - * TODO this will break if there are more than 2 destination registers - * with one being the zero register e.g. if an instruction implicitly - * writes to a system register. The current implementation could mean - * that the second result is discarded - * - */ + destinationRegisterCount_++; } } - } + } else if (i > 0) { + // First operand is never of MEM or IMM type, every register operand after + // the first is a source register + if (op.type == RISCV_OP_REG) { + // Second or third register operand + sourceRegisters_[sourceRegisterCount_] = csRegToRegister(op.reg); - // For all instructions, every register operand after the first is a source - // register - else if (i > 0 && op.type == RISCV_OP_REG) { - // Second or third operand - sourceRegisters[sourceRegisterCount] = csRegToRegister(op.reg); + if (sourceRegisters_[sourceRegisterCount_] == + RegisterType::ZERO_REGISTER) { + // Catch zero register references and pre-complete those operands + sourceValues_[sourceRegisterCount_] = RegisterValue(0, 8); + } else { + sourceOperandsPending_++; + } - if (sourceRegisters[sourceRegisterCount] == Instruction::ZERO_REGISTER) { - // Catch zero register references and pre-complete those operands - operands[sourceRegisterCount] = RegisterValue(0, 8); + sourceRegisterCount_++; + } else if (op.type == RISCV_OP_MEM) { + // Memory operand + // Extract reg number from capstone object + sourceRegisters_[sourceRegisterCount_] = csRegToRegister(op.mem.base); + sourceImm_ = op.mem.disp; + sourceRegisterCount_++; + sourceOperandsPending_++; + } else if (op.type == RISCV_OP_IMM) { + // Immediate operand + sourceImm_ = op.imm; } else { - operandsPending++; + // Something has gone wrong + assert(false && + "Unexpected register type in non-first " + "operand position"); } - - sourceRegisterCount++; - } - - // First operand is never MEM type, only check after the first. 
If register - // contains memory address, extract reg number from capstone object - else if (i > 0 && op.type == RISCV_OP_MEM) { - // Memory operand - sourceRegisters[sourceRegisterCount] = csRegToRegister(op.mem.base); - sourceRegisterCount++; - operandsPending++; + } else { + // Something has gone wrong + assert(false && + "Unexpected register type in first " + "operand position"); } } - if ((Opcode::RISCV_SLL <= metadata.opcode && - metadata.opcode <= Opcode::RISCV_SLLW) || - (Opcode::RISCV_SRA <= metadata.opcode && - metadata.opcode <= Opcode::RISCV_SRAW) || - (Opcode::RISCV_SRL <= metadata.opcode && - metadata.opcode <= Opcode::RISCV_SRLW)) { + if ((Opcode::RISCV_SLL <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_SLLW) || + (Opcode::RISCV_SRA <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_SRAW) || + (Opcode::RISCV_SRL <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_SRLW)) { // Shift instructions - isShift_ = true; + setInstructionType(InsnType::isShift); } - if ((Opcode::RISCV_XOR <= metadata.opcode && - metadata.opcode <= Opcode::RISCV_XORI) || - (Opcode::RISCV_OR <= metadata.opcode && - metadata.opcode <= Opcode::RISCV_ORI) || - (Opcode::RISCV_AND <= metadata.opcode && - metadata.opcode <= Opcode::RISCV_ANDI)) { + if ((Opcode::RISCV_XOR <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_XORI) || + (Opcode::RISCV_OR <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_ORI) || + (Opcode::RISCV_AND <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_ANDI) || + (Opcode::RISCV_FSGNJN_D <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_FSGNJ_S)) { // Logical instructions - isLogical_ = true; + setInstructionType(InsnType::isLogical); } - if ((Opcode::RISCV_SLT <= metadata.opcode && - metadata.opcode <= Opcode::RISCV_SLTU)) { + if ((Opcode::RISCV_SLT <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_SLTU) || + (Opcode::RISCV_FEQ_D <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_FEQ_S) || + (Opcode::RISCV_FLE_D <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_FLT_S) || + (Opcode::RISCV_FMAX_D <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_FMIN_S)) { // Compare instructions - isCompare_ = true; + setInstructionType(InsnType::isCompare); } - if ((Opcode::RISCV_MUL <= metadata.opcode && - metadata.opcode <= Opcode::RISCV_MULW)) { + if ((Opcode::RISCV_MUL <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_MULW) || + (Opcode::RISCV_FMADD_D <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_FMADD_S) || + (Opcode::RISCV_FMSUB_D <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_FMUL_S) || + (Opcode::RISCV_FNMADD_D <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_FNMSUB_S)) { // Multiply instructions - isMultiply_ = true; + setInstructionType(InsnType::isMultiply); } - if (((Opcode::RISCV_REM <= metadata.opcode && - metadata.opcode <= Opcode::RISCV_REMW) || - (Opcode::RISCV_DIV <= metadata.opcode && - metadata.opcode <= Opcode::RISCV_DIVW))) { + if ((Opcode::RISCV_REM <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_REMW) || + (Opcode::RISCV_DIV <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_DIVW) || + (Opcode::RISCV_FDIV_D <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_FDIV_S) || + (Opcode::RISCV_FSQRT_D <= metadata_.opcode && + metadata_.opcode <= Opcode::RISCV_FSQRT_S)) { // Divide instructions - isDivide_ = true; + setInstructionType(InsnType::isDivide); + } + + if ((metadata_.opcode >= Opcode::RISCV_FADD_D && + 
metadata_.opcode <= Opcode::RISCV_FDIV_S) || + (metadata_.opcode >= Opcode::RISCV_FEQ_D && + metadata_.opcode <= Opcode::RISCV_FSW)) { + // Floating point operation + setInstructionType(InsnType::isFloat); + if ((metadata_.opcode >= Opcode::RISCV_FCVT_D_L && + metadata_.opcode <= Opcode::RISCV_FCVT_W_S)) { + setInstructionType(InsnType::isConvert); + } } // Set branch type - switch (metadata.opcode) { + switch (metadata_.opcode) { case Opcode::RISCV_BEQ: case Opcode::RISCV_BNE: case Opcode::RISCV_BLT: @@ -262,12 +302,12 @@ void Instruction::decode() { case Opcode::RISCV_BGE: case Opcode::RISCV_BGEU: branchType_ = BranchType::Conditional; - knownOffset_ = metadata.operands[2].imm; + knownOffset_ = sourceImm_; break; case Opcode::RISCV_JAL: case Opcode::RISCV_JALR: branchType_ = BranchType::Unconditional; - knownOffset_ = metadata.operands[1].imm; + knownOffset_ = sourceImm_; break; } } diff --git a/src/lib/arch/riscv/Instruction_execute.cc b/src/lib/arch/riscv/Instruction_execute.cc index 005982a9fc..cacce63561 100644 --- a/src/lib/arch/riscv/Instruction_execute.cc +++ b/src/lib/arch/riscv/Instruction_execute.cc @@ -1,14 +1,40 @@ +#include #include +#include #include #include "InstructionMetadata.hh" +#include "simeng/arch/riscv/Architecture.hh" #include "simeng/arch/riscv/Instruction.hh" namespace simeng { namespace arch { namespace riscv { +/** NaN box single precision floating point values as defined in + * riscv-spec-20191213 page 73 */ +uint64_t NanBoxFloat(float f) { + static_assert(sizeof(float) == 4 && "Float not of size 4 bytes"); + + uint64_t box = 0xffffffff00000000; + std::memcpy(reinterpret_cast(&box), reinterpret_cast(&f), + sizeof(float)); + + return box; +} + +float checkNanBox(RegisterValue operand) { + // Ensure NaN box is correct + if ((operand.get() & 0xffffffff00000000) == 0xffffffff00000000) { + // Correct + return operand.get(); + } else { + // Not correct + return std::nanf(""); + } +} + /** Multiply unsigned `a` and unsigned `b`, and return the high 64 bits of the * result. https://stackoverflow.com/a/28904636 */ uint64_t mulhiuu(uint64_t a, uint64_t b) { @@ -62,6 +88,83 @@ uint64_t zeroExtend(uint64_t bits, uint64_t msb) { return rightShift; } +void Instruction::setStaticRoundingModeThen( + std::function operation) { + // Extract rounding mode (rm) from raw bytes + // The 3 relevant bits are always in positions 12-14. Take second byte from + // encoding and mask with 01110000. Shift right by 4 to remove trailing 0's + // and improve readability + uint8_t rm = (metadata_.encoding[1] & 0x70) >> 4; + + /** A variable to hold the current fenv rounding mode/architectural dynamic + * rounding mode. Used to restore the rounding mode after the architecturally + * static rounding mode is used. */ + int currRM_ = fegetround(); + + switch (rm) { + case 0x00: // RNE, Round to nearest, ties to even + fesetround(FE_TONEAREST); + break; + case 0x01: // RTZ Round towards zero + fesetround(FE_TOWARDZERO); + break; + case 0x02: // RDN Round down (-infinity) + fesetround(FE_DOWNWARD); + break; + case 0x03: // RUP Round up (+infinity) + fesetround(FE_UPWARD); + break; + case 0x04: // RMM Round to nearest, ties to max magnitude + // FE_TONEAREST ties towards even but no other options available in fenv + fesetround(FE_TONEAREST); + break; + case 0x05: + // If frm is set to an invalid value (101–111), any subsequent attempt to + // execute a floating-point operation with a dynamic rounding mode will + // raise an illegal instruction exception. 
+ // Reserved + std::cout << "[SimEng:Instruction_execute] Invalid static rounding mode " + "5 used, " + "instruction address:" + << instructionAddress_ << std::endl; + exceptionEncountered_ = true; + exception_ = InstructionException::IllegalInstruction; + break; + case 0x06: + // Reserved + std::cout << "[SimEng:Instruction_execute] Invalid static rounding mode " + "6 used, " + "instruction address:" + << instructionAddress_ << std::endl; + exceptionEncountered_ = true; + exception_ = InstructionException::IllegalInstruction; + break; + case 0x07: + // Use dynamic rounding mode e.g. that which is already set + // TODO check the dynamic rounding mode value in the CSR here. If set to + // invalid value raise an illegal instruction exception. From spec "any + // subsequent attempt to execute a floating-point operation with a dynamic + // rounding mode will raise an illegal instruction exception". Requires + // full Zicsr implementation + break; + default: + std::cout + << "[SimEng:Instruction_execute] Invalid static rounding mode out of " + "range, instruction address:" + << instructionAddress_ << std::endl; + exceptionEncountered_ = true; + exception_ = InstructionException::IllegalInstruction; + } + + operation(); + + fesetround(currRM_); + + // TODO if it appears that repeated rounding mode changes are slow, could + // set target rounding mode variable and only update if different to currentRM + return; +} + void Instruction::executionNYI() { exceptionEncountered_ = true; exception_ = InstructionException::ExecutionNotYetImplemented; @@ -70,46 +173,46 @@ void Instruction::executionNYI() { void Instruction::execute() { assert(!executed_ && "Attempted to execute an instruction more than once"); - assert( - canExecute() && - "Attempted to execute an instruction before all operands were provided"); + assert(canExecute() && + "Attempted to execute an instruction before all source operands were " + "provided"); - // Implementation of rv64iam according to the v. 20191213 unprivileged spec + // Implementation of rv64imafdc according to the v. 
20191213 unprivileged spec executed_ = true; - switch (metadata.opcode) { + switch (metadata_.opcode) { case Opcode::RISCV_LB: { // LB rd,rs1,imm - results[0] = RegisterValue(bitExtend(memoryData[0].get(), 8), 8); + results_[0] = + RegisterValue(bitExtend(memoryData_[0].get(), 8), 8); break; } case Opcode::RISCV_LBU: { // LBU rd,rs1,imm - results[0] = - RegisterValue(zeroExtend(memoryData[0].get(), 8), 8); + results_[0] = + RegisterValue(zeroExtend(memoryData_[0].get(), 8), 8); break; } case Opcode::RISCV_LH: { // LH rd,rs1,imm - results[0] = - RegisterValue(bitExtend(memoryData[0].get(), 16), 8); + results_[0] = + RegisterValue(bitExtend(memoryData_[0].get(), 16), 8); break; } case Opcode::RISCV_LHU: { // LHU rd,rs1,imm - results[0] = - RegisterValue(zeroExtend(memoryData[0].get(), 16), 8); + results_[0] = + RegisterValue(zeroExtend(memoryData_[0].get(), 16), 8); break; } case Opcode::RISCV_LW: { // LW rd,rs1,imm - results[0] = - RegisterValue(bitExtend(memoryData[0].get(), 32), 8); + results_[0] = + RegisterValue(bitExtend(memoryData_[0].get(), 32), 8); break; } case Opcode::RISCV_LWU: { // LWU rd,rs1,imm - results[0] = - RegisterValue(zeroExtend(memoryData[0].get(), 32), 8); + results_[0] = + RegisterValue(zeroExtend(memoryData_[0].get(), 32), 8); break; } case Opcode::RISCV_LD: { // LD rd,rs1,imm - // Note: elements of memory data are RegisterValue's - results[0] = memoryData[0]; + results_[0] = RegisterValue(memoryData_[0].get(), 8); break; } case Opcode::RISCV_SB: // SB rs1,rs2,imm @@ -119,334 +222,322 @@ void Instruction::execute() { case Opcode::RISCV_SW: // SW rs1,rs2,imm [[fallthrough]]; case Opcode::RISCV_SD: { // SD rs1,rs2,imm - memoryData[0] = operands[0]; + memoryData_[0] = sourceValues_[0]; break; } case Opcode::RISCV_SLL: { // SLL rd,rs1,rs2 - const int64_t rs1 = operands[0].get(); + const int64_t rs1 = sourceValues_[0].get(); const int64_t rs2 = - operands[1].get() & 63; // Only use lowest 6 bits + sourceValues_[1].get() & 63; // Only use lowest 6 bits int64_t out = static_cast(rs1 << rs2); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_SLLI: { // SLLI rd,rs1,shamt - const int64_t rs1 = operands[0].get(); - const int64_t shamt = - metadata.operands[2].imm & 63; // Only use lowest 6 bits + const int64_t rs1 = sourceValues_[0].get(); + const int64_t shamt = sourceImm_ & 63; // Only use lowest 6 bits int64_t out = static_cast(rs1 << shamt); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_SLLW: { // SLLW rd,rs1,rs2 - const int32_t rs1 = operands[0].get(); + const int32_t rs1 = sourceValues_[0].get(); const int32_t rs2 = - operands[1].get() & 63; // Only use lowest 6 bits + sourceValues_[1].get() & 31; // Only use lowest 5 bits int64_t out = signExtendW(static_cast(rs1 << rs2)); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_SLLIW: { // SLLIW rd,rs1,shamt - const int32_t rs1 = operands[0].get(); - const int32_t shamt = - metadata.operands[2].imm & 63; // Only use lowest 6 bits + const int32_t rs1 = sourceValues_[0].get(); + const int32_t shamt = sourceImm_ & 31; // Only use lowest 5 bits uint64_t out = signExtendW(static_cast(rs1 << shamt)); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_SRL: { // SRL rd,rs1,rs2 - const uint64_t rs1 = operands[0].get(); + const uint64_t rs1 = sourceValues_[0].get(); const uint64_t rs2 = - operands[1].get() & 63; // 
Only use lowest 6 bits + sourceValues_[1].get() & 63; // Only use lowest 6 bits uint64_t out = static_cast(rs1 >> rs2); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_SRLI: { // SRLI rd,rs1,shamt - const uint64_t rs1 = operands[0].get(); - const uint64_t shamt = - metadata.operands[2].imm & 63; // Only use lowest 6 bits + const uint64_t rs1 = sourceValues_[0].get(); + const uint64_t shamt = sourceImm_ & 63; // Only use lowest 6 bits uint64_t out = static_cast(rs1 >> shamt); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_SRLW: { // SRLW rd,rs1,rs2 - const uint32_t rs1 = operands[0].get(); + const uint32_t rs1 = sourceValues_[0].get(); const uint32_t rs2 = - operands[1].get() & 63; // Only use lowest 6 bits + sourceValues_[1].get() & 31; // Only use lowest 5 bits uint64_t out = signExtendW(static_cast(rs1 >> rs2)); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_SRLIW: { // SRLIW rd,rs1,shamt - const uint32_t rs1 = operands[0].get(); - const uint32_t shamt = - metadata.operands[2].imm & 63; // Only use lowest 6 bits + const uint32_t rs1 = sourceValues_[0].get(); + const uint32_t shamt = sourceImm_ & 31; // Only use lowest 5 bits uint64_t out = signExtendW(static_cast(rs1 >> shamt)); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_SRA: { // SRA rd,rs1,rs2 - const int64_t rs1 = operands[0].get(); + const int64_t rs1 = sourceValues_[0].get(); const int64_t rs2 = - operands[1].get() & 63; // Only use lowest 6 bits + sourceValues_[1].get() & 63; // Only use lowest 6 bits int64_t out = static_cast(rs1 >> rs2); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_SRAI: { // SRAI rd,rs1,shamt - const int64_t rs1 = operands[0].get(); - const int64_t shamt = - metadata.operands[2].imm & 63; // Only use lowest 6 bits + const int64_t rs1 = sourceValues_[0].get(); + const int64_t shamt = sourceImm_ & 63; // Only use lowest 6 bits int64_t out = static_cast(rs1 >> shamt); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_SRAW: { // SRAW rd,rs1,rs2 - const int32_t rs1 = operands[0].get(); + const int32_t rs1 = sourceValues_[0].get(); const int32_t rs2 = - operands[1].get() & 63; // Only use lowest 6 bits + sourceValues_[1].get() & 31; // Only use lowest 5 bits int64_t out = static_cast(rs1 >> rs2); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_SRAIW: { // SRAIW rd,rs1,shamt - const int32_t rs1 = operands[0].get(); - const int32_t shamt = - metadata.operands[2].imm & 63; // Only use lowest 6 bits + const int32_t rs1 = sourceValues_[0].get(); + const int32_t shamt = sourceImm_ & 31; // Only use lowest 5 bits int64_t out = static_cast(rs1 >> shamt); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_ADD: { // ADD rd,rs1,rs2 - const uint64_t rs1 = operands[0].get(); - const uint64_t rs2 = operands[1].get(); + const uint64_t rs1 = sourceValues_[0].get(); + const uint64_t rs2 = sourceValues_[1].get(); uint64_t out = static_cast(rs1 + rs2); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_ADDW: { // ADDW rd,rs1,rs2 - const int32_t rs1 = operands[0].get(); - const int32_t rs2 = operands[1].get(); + const int32_t rs1 = 
sourceValues_[0].get(); + const int32_t rs2 = sourceValues_[1].get(); int64_t out = static_cast(static_cast(rs1 + rs2)); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_ADDI: { // ADDI rd,rs1,imm - const uint64_t rs1 = operands[0].get(); - const uint64_t rs2 = metadata.operands[2].imm; - uint64_t out = static_cast(rs1 + rs2); - results[0] = RegisterValue(out, 8); + const uint64_t rs1 = sourceValues_[0].get(); + uint64_t out = static_cast(rs1 + sourceImm_); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_ADDIW: { // ADDIW rd,rs1,imm - const int32_t rs1 = operands[0].get(); - const int32_t imm = metadata.operands[2].imm; - uint64_t out = signExtendW(rs1 + imm); - results[0] = RegisterValue(out, 8); + const int32_t rs1 = sourceValues_[0].get(); + uint64_t out = signExtendW(rs1 + sourceImm_); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_SUB: { // SUB rd,rs1,rs2 - const uint64_t rs1 = operands[0].get(); - const uint64_t rs2 = operands[1].get(); + const uint64_t rs1 = sourceValues_[0].get(); + const uint64_t rs2 = sourceValues_[1].get(); uint64_t out = static_cast(rs1 - rs2); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_SUBW: { // SUBW rd,rs1,rs2 - const int32_t rs1 = operands[0].get(); - const int32_t rs2 = operands[1].get(); + const int32_t rs1 = sourceValues_[0].get(); + const int32_t rs2 = sourceValues_[1].get(); int64_t out = static_cast(static_cast(rs1 - rs2)); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } - case Opcode::RISCV_LUI: { // LUI rd,imm - uint64_t out = signExtendW(metadata.operands[1].imm - << 12); // Shift into upper 20 bits - results[0] = RegisterValue(out, 8); + case Opcode::RISCV_LUI: { // LUI rd,imm + uint64_t out = signExtendW(sourceImm_ << 12); // Shift into upper 20 bits + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_AUIPC: { // AUIPC rd,imm const int64_t pc = instructionAddress_; - const int64_t uimm = signExtendW(metadata.operands[1].imm - << 12); // Shift into upper 20 bits + const int64_t uimm = + signExtendW(sourceImm_ << 12); // Shift into upper 20 bits uint64_t out = static_cast(pc + uimm); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_XOR: { // XOR rd,rs1,rs2 - const uint64_t rs1 = operands[0].get(); - const uint64_t rs2 = operands[1].get(); + const uint64_t rs1 = sourceValues_[0].get(); + const uint64_t rs2 = sourceValues_[1].get(); uint64_t out = static_cast(rs1 ^ rs2); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_XORI: { // XORI rd,rs1,imm - const uint64_t rs1 = operands[0].get(); - const uint64_t imm = metadata.operands[2].imm; - uint64_t out = static_cast(rs1 ^ imm); - results[0] = RegisterValue(out, 8); + const uint64_t rs1 = sourceValues_[0].get(); + uint64_t out = static_cast(rs1 ^ sourceImm_); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_OR: { // OR rd,rs1,rs2 - const uint64_t rs1 = operands[0].get(); - const uint64_t rs2 = operands[1].get(); + const uint64_t rs1 = sourceValues_[0].get(); + const uint64_t rs2 = sourceValues_[1].get(); uint64_t out = static_cast(rs1 | rs2); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_ORI: { // ORI rd,rs1,imm - const uint64_t rs1 = operands[0].get(); - const uint64_t imm = metadata.operands[2].imm; - uint64_t 
out = static_cast(rs1 | imm); - results[0] = RegisterValue(out, 8); + const uint64_t rs1 = sourceValues_[0].get(); + uint64_t out = static_cast(rs1 | sourceImm_); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_AND: { // AND rd,rs1,rs2 - const uint64_t rs1 = operands[0].get(); - const uint64_t rs2 = operands[1].get(); + const uint64_t rs1 = sourceValues_[0].get(); + const uint64_t rs2 = sourceValues_[1].get(); uint64_t out = static_cast(rs1 & rs2); - results[0] = RegisterValue(out, 8); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_ANDI: { // ANDI rd,rs1,imm - const uint64_t rs1 = operands[0].get(); - const uint64_t imm = metadata.operands[2].imm; - uint64_t out = static_cast(rs1 & imm); - results[0] = RegisterValue(out, 8); + const uint64_t rs1 = sourceValues_[0].get(); + uint64_t out = static_cast(rs1 & sourceImm_); + results_[0] = RegisterValue(out, 8); break; } case Opcode::RISCV_SLT: { // SLT rd,rs1,rs2 - const int64_t rs1 = operands[0].get(); - const int64_t rs2 = operands[1].get(); + const int64_t rs1 = sourceValues_[0].get(); + const int64_t rs2 = sourceValues_[1].get(); if (rs1 < rs2) { - results[0] = RegisterValue(static_cast(1), 8); + results_[0] = RegisterValue(static_cast(1), 8); } else { - results[0] = RegisterValue(static_cast(0), 8); + results_[0] = RegisterValue(static_cast(0), 8); } break; } case Opcode::RISCV_SLTU: { // SLTU rd,rs1,rs2 - const uint64_t rs1 = operands[0].get(); - const uint64_t rs2 = operands[1].get(); + const uint64_t rs1 = sourceValues_[0].get(); + const uint64_t rs2 = sourceValues_[1].get(); if (rs1 < rs2) { - results[0] = RegisterValue(static_cast(1), 8); + results_[0] = RegisterValue(static_cast(1), 8); } else { - results[0] = RegisterValue(static_cast(0), 8); + results_[0] = RegisterValue(static_cast(0), 8); } break; } case Opcode::RISCV_SLTI: { // SLTI rd,rs1,imm - const int64_t rs1 = operands[0].get(); - const int64_t imm = metadata.operands[2].imm; - if (rs1 < imm) { - results[0] = RegisterValue(static_cast(1), 8); + const int64_t rs1 = sourceValues_[0].get(); + if (rs1 < sourceImm_) { + results_[0] = RegisterValue(static_cast(1), 8); } else { - results[0] = RegisterValue(static_cast(0), 8); + results_[0] = RegisterValue(static_cast(0), 8); } break; } case Opcode::RISCV_SLTIU: { // SLTIU rd,rs1,imm - const uint64_t rs1 = operands[0].get(); - const uint64_t imm = static_cast(metadata.operands[2].imm); - if (rs1 < imm) { - results[0] = RegisterValue(static_cast(1), 8); + const uint64_t rs1 = sourceValues_[0].get(); + if (rs1 < static_cast(sourceImm_)) { + results_[0] = RegisterValue(static_cast(1), 8); } else { - results[0] = RegisterValue(static_cast(0), 8); + results_[0] = RegisterValue(static_cast(0), 8); } break; } case Opcode::RISCV_BEQ: { // BEQ rs1,rs2,imm - const uint64_t rs1 = operands[0].get(); - const uint64_t rs2 = operands[1].get(); + const uint64_t rs1 = sourceValues_[0].get(); + const uint64_t rs2 = sourceValues_[1].get(); if (rs1 == rs2) { - branchAddress_ = instructionAddress_ + - metadata.operands[2].imm; // Set LSB of result to 0 + branchAddress_ = + instructionAddress_ + sourceImm_; // Set LSB of result to 0 branchTaken_ = true; } else { - branchAddress_ = instructionAddress_ + 4; + branchAddress_ = instructionAddress_ + metadata_.getInsnLength(); branchTaken_ = false; } break; } case Opcode::RISCV_BNE: { // BNE rs1,rs2,imm - const uint64_t rs1 = operands[0].get(); - const uint64_t rs2 = operands[1].get(); + const uint64_t rs1 = sourceValues_[0].get(); + const uint64_t rs2 = 
sourceValues_[1].get(); if (rs1 != rs2) { - branchAddress_ = instructionAddress_ + - metadata.operands[2].imm; // Set LSB of result to 0 + branchAddress_ = + instructionAddress_ + sourceImm_; // Set LSB of result to 0 branchTaken_ = true; } else { - branchAddress_ = instructionAddress_ + 4; + // Increase by instruction size to account for compressed instructions + branchAddress_ = instructionAddress_ + metadata_.getInsnLength(); branchTaken_ = false; } break; } case Opcode::RISCV_BLT: { // BLT rs1,rs2,imm - const int64_t rs1 = operands[0].get(); - const int64_t rs2 = operands[1].get(); + const int64_t rs1 = sourceValues_[0].get(); + const int64_t rs2 = sourceValues_[1].get(); if (rs1 < rs2) { - branchAddress_ = instructionAddress_ + - metadata.operands[2].imm; // Set LSB of result to 0 + branchAddress_ = + instructionAddress_ + sourceImm_; // Set LSB of result to 0 branchTaken_ = true; } else { - branchAddress_ = instructionAddress_ + 4; + branchAddress_ = instructionAddress_ + metadata_.getInsnLength(); branchTaken_ = false; } break; } case Opcode::RISCV_BLTU: { // BLTU rs1,rs2,imm - const uint64_t rs1 = operands[0].get(); - const uint64_t rs2 = operands[1].get(); + const uint64_t rs1 = sourceValues_[0].get(); + const uint64_t rs2 = sourceValues_[1].get(); if (rs1 < rs2) { - branchAddress_ = instructionAddress_ + - metadata.operands[2].imm; // Set LSB of result to 0 + branchAddress_ = + instructionAddress_ + sourceImm_; // Set LSB of result to 0 branchTaken_ = true; } else { - branchAddress_ = instructionAddress_ + 4; + branchAddress_ = instructionAddress_ + metadata_.getInsnLength(); branchTaken_ = false; } break; } case Opcode::RISCV_BGE: { // BGE rs1,rs2,imm - const int64_t rs1 = operands[0].get(); - const int64_t rs2 = operands[1].get(); + const int64_t rs1 = sourceValues_[0].get(); + const int64_t rs2 = sourceValues_[1].get(); if (rs1 >= rs2) { - branchAddress_ = instructionAddress_ + - metadata.operands[2].imm; // Set LSB of result to 0 + branchAddress_ = + instructionAddress_ + sourceImm_; // Set LSB of result to 0 branchTaken_ = true; } else { - branchAddress_ = instructionAddress_ + 4; + branchAddress_ = instructionAddress_ + metadata_.getInsnLength(); branchTaken_ = false; } break; } case Opcode::RISCV_BGEU: { // BGEU rs1,rs2,imm - const uint64_t rs1 = operands[0].get(); - const uint64_t rs2 = operands[1].get(); + const uint64_t rs1 = sourceValues_[0].get(); + const uint64_t rs2 = sourceValues_[1].get(); if (rs1 >= rs2) { - branchAddress_ = instructionAddress_ + - metadata.operands[2].imm; // Set LSB of result to 0 + branchAddress_ = + instructionAddress_ + sourceImm_; // Set LSB of result to 0 branchTaken_ = true; } else { - branchAddress_ = instructionAddress_ + 4; + branchAddress_ = instructionAddress_ + metadata_.getInsnLength(); branchTaken_ = false; } break; } case Opcode::RISCV_JAL: { // JAL rd,imm - branchAddress_ = instructionAddress_ + - metadata.operands[1].imm; // Set LSB of result to 0 + branchAddress_ = + instructionAddress_ + sourceImm_; // Set LSB of result to 0 branchTaken_ = true; - results[0] = RegisterValue(instructionAddress_ + 4, 8); + results_[0] = + RegisterValue(instructionAddress_ + metadata_.getInsnLength(), 8); break; } case Opcode::RISCV_JALR: { // JALR rd,rs1,imm - branchAddress_ = - (operands[0].get() + metadata.operands[2].imm) & - ~1; // Set LSB of result to 0 + branchAddress_ = (sourceValues_[0].get() + sourceImm_) & + ~1; // Set LSB of result to 0 branchTaken_ = true; - results[0] = RegisterValue(instructionAddress_ + 4, 8); + results_[0] = + 
RegisterValue(instructionAddress_ + metadata_.getInsnLength(), 8); break; } // TODO EBREAK @@ -459,8 +550,8 @@ void Instruction::execute() { case Opcode::RISCV_FENCE: { // FENCE // TODO currently modelled as a NOP as all codes are currently single // threaded "Informally, no other RISC-V hart or external device can - // observe any operation in the successor set following a FENCE before any - // operation in the predecessor set preceding the FENCE." + // observe any operation in the successor set following a FENCE before + // any operation in the predecessor set preceding the FENCE." // https://msyksphinz-self.github.io/riscv-isadoc/html/rvi.html#fence /* "a simple implementation ... might be able to implement the FENCE @@ -474,21 +565,21 @@ void Instruction::execute() { case Opcode::RISCV_LR_W_AQ: case Opcode::RISCV_LR_W_RL: case Opcode::RISCV_LR_W_AQ_RL: { - // TODO set "reservation set" in memory, currently not needed as all codes - // are single threaded + // TODO set "reservation set" in memory, currently not needed as all + // codes are single threaded // TODO check that address is naturally aligned to operand size, // if not raise address-misaligned/access-fault exception // TODO use aq and rl bits to prevent reordering with other memory // operations - results[0] = - RegisterValue(bitExtend(memoryData[0].get(), 32), 8); + results_[0] = + RegisterValue(bitExtend(memoryData_[0].get(), 32), 8); break; } case Opcode::RISCV_LR_D: // LR.D rd,rs1 case Opcode::RISCV_LR_D_AQ: case Opcode::RISCV_LR_D_RL: case Opcode::RISCV_LR_D_AQ_RL: { - results[0] = RegisterValue(memoryData[0].get(), 8); + results_[0] = RegisterValue(memoryData_[0].get(), 8); break; } case Opcode::RISCV_SC_W: // SC.W rd,rs1,rs2 @@ -506,8 +597,8 @@ void Instruction::execute() { // if not raise address-misaligned/access-fault exception // TODO use aq and rl bits to prevent reordering with other memory // operations - memoryData[0] = operands[0]; - results[0] = RegisterValue(static_cast(0), 8); + memoryData_[0] = sourceValues_[0]; + results_[0] = RegisterValue(static_cast(0), 8); break; } case Opcode::RISCV_AMOSWAP_W: // AMOSWAP.W rd,rs1,rs2 @@ -519,92 +610,100 @@ void Instruction::execute() { // Store rd to memory at address rs1 // TODO raise address misaligned or access-fault errors // TODO account for AQ and RL bits - int64_t rd = signExtendW(memoryData[0].get()); - int32_t rs2 = operands[0].get(); - results[0] = RegisterValue(rd, 8); - memoryData[0] = rs2; + int64_t rd = signExtendW(memoryData_[0].get()); + int32_t rs2 = sourceValues_[0].get(); + results_[0] = RegisterValue(rd, 8); + memoryData_[0] = rs2; break; } case Opcode::RISCV_AMOSWAP_D: // AMOSWAP.D rd,rs1,rs2 case Opcode::RISCV_AMOSWAP_D_AQ: case Opcode::RISCV_AMOSWAP_D_RL: case Opcode::RISCV_AMOSWAP_D_AQ_RL: { - uint64_t rd = memoryData[0].get(); - uint64_t rs2 = operands[0].get(); - results[0] = RegisterValue(rd, 8); - memoryData[0] = rs2; + uint64_t rd = memoryData_[0].get(); + uint64_t rs2 = sourceValues_[0].get(); + results_[0] = RegisterValue(rd, 8); + memoryData_[0] = rs2; break; } case Opcode::RISCV_AMOADD_W: // AMOADD.W rd,rs1,rs2 case Opcode::RISCV_AMOADD_W_AQ: case Opcode::RISCV_AMOADD_W_RL: case Opcode::RISCV_AMOADD_W_AQ_RL: { - int64_t rd = signExtendW(memoryData[0].get()); - results[0] = RegisterValue(rd, 8); - memoryData[0] = static_cast(rd + operands[0].get()); + int64_t rd = signExtendW(memoryData_[0].get()); + results_[0] = RegisterValue(rd, 8); + memoryData_[0] = + static_cast(rd + sourceValues_[0].get()); break; } case Opcode::RISCV_AMOADD_D: 
// AMOADD.D rd,rs1,rs2 case Opcode::RISCV_AMOADD_D_AQ: case Opcode::RISCV_AMOADD_D_RL: case Opcode::RISCV_AMOADD_D_AQ_RL: { - int64_t rd = memoryData[0].get(); - results[0] = RegisterValue(rd, 8); - memoryData[0] = static_cast(rd + operands[0].get()); + int64_t rd = memoryData_[0].get(); + results_[0] = RegisterValue(rd, 8); + memoryData_[0] = + static_cast(rd + sourceValues_[0].get()); break; } case Opcode::RISCV_AMOAND_W: // AMOAND.W rd,rs1,rs2 case Opcode::RISCV_AMOAND_W_AQ: case Opcode::RISCV_AMOAND_W_RL: case Opcode::RISCV_AMOAND_W_AQ_RL: { - int64_t rd = signExtendW(memoryData[0].get()); - results[0] = RegisterValue(rd, 8); - memoryData[0] = static_cast(rd & operands[0].get()); + int64_t rd = signExtendW(memoryData_[0].get()); + results_[0] = RegisterValue(rd, 8); + memoryData_[0] = + static_cast(rd & sourceValues_[0].get()); break; } case Opcode::RISCV_AMOAND_D: // AMOAND.D rd,rs1,rs2 case Opcode::RISCV_AMOAND_D_AQ: case Opcode::RISCV_AMOAND_D_RL: case Opcode::RISCV_AMOAND_D_AQ_RL: { - int64_t rd = memoryData[0].get(); - results[0] = RegisterValue(rd, 8); - memoryData[0] = static_cast(rd & operands[0].get()); + int64_t rd = memoryData_[0].get(); + results_[0] = RegisterValue(rd, 8); + memoryData_[0] = + static_cast(rd & sourceValues_[0].get()); break; } case Opcode::RISCV_AMOOR_W: // AMOOR.W rd,rs1,rs2 case Opcode::RISCV_AMOOR_W_AQ: case Opcode::RISCV_AMOOR_W_RL: case Opcode::RISCV_AMOOR_W_AQ_RL: { - int64_t rd = signExtendW(memoryData[0].get()); - results[0] = RegisterValue(rd, 8); - memoryData[0] = static_cast(rd | operands[0].get()); + int64_t rd = signExtendW(memoryData_[0].get()); + results_[0] = RegisterValue(rd, 8); + memoryData_[0] = + static_cast(rd | sourceValues_[0].get()); break; } case Opcode::RISCV_AMOOR_D: // AMOOR.D rd,rs1,rs2 case Opcode::RISCV_AMOOR_D_AQ: case Opcode::RISCV_AMOOR_D_RL: case Opcode::RISCV_AMOOR_D_AQ_RL: { - int64_t rd = memoryData[0].get(); - results[0] = RegisterValue(rd, 8); - memoryData[0] = static_cast(rd | operands[0].get()); + int64_t rd = memoryData_[0].get(); + results_[0] = RegisterValue(rd, 8); + memoryData_[0] = + static_cast(rd | sourceValues_[0].get()); break; } case Opcode::RISCV_AMOXOR_W: // AMOXOR.W rd,rs1,rs2 case Opcode::RISCV_AMOXOR_W_AQ: case Opcode::RISCV_AMOXOR_W_RL: case Opcode::RISCV_AMOXOR_W_AQ_RL: { - int64_t rd = signExtendW(memoryData[0].get()); - results[0] = RegisterValue(rd, 8); - memoryData[0] = static_cast(rd ^ operands[0].get()); + int64_t rd = signExtendW(memoryData_[0].get()); + results_[0] = RegisterValue(rd, 8); + memoryData_[0] = + static_cast(rd ^ sourceValues_[0].get()); break; } case Opcode::RISCV_AMOXOR_D: // AMOXOR.D rd,rs1,rs2 case Opcode::RISCV_AMOXOR_D_AQ: case Opcode::RISCV_AMOXOR_D_RL: case Opcode::RISCV_AMOXOR_D_AQ_RL: { - int64_t rd = memoryData[0].get(); - results[0] = RegisterValue(rd, 8); - memoryData[0] = static_cast(rd ^ operands[0].get()); + int64_t rd = memoryData_[0].get(); + results_[0] = RegisterValue(rd, 8); + memoryData_[0] = + static_cast(rd ^ sourceValues_[0].get()); break; } @@ -612,38 +711,40 @@ void Instruction::execute() { case Opcode::RISCV_AMOMIN_W_AQ: case Opcode::RISCV_AMOMIN_W_RL: case Opcode::RISCV_AMOMIN_W_AQ_RL: { - results[0] = RegisterValue(signExtendW(memoryData[0].get()), 8); - memoryData[0] = - std::min(memoryData[0].get(), operands[0].get()); + results_[0] = + RegisterValue(signExtendW(memoryData_[0].get()), 8); + memoryData_[0] = std::min(memoryData_[0].get(), + sourceValues_[0].get()); break; } case Opcode::RISCV_AMOMIN_D: // AMOMIN.D rd,rs1,rs2 case 
Opcode::RISCV_AMOMIN_D_AQ: case Opcode::RISCV_AMOMIN_D_RL: case Opcode::RISCV_AMOMIN_D_AQ_RL: { - int64_t rd = memoryData[0].get(); - results[0] = RegisterValue(rd, 8); - memoryData[0] = - static_cast(std::min(rd, operands[0].get())); + int64_t rd = memoryData_[0].get(); + results_[0] = RegisterValue(rd, 8); + memoryData_[0] = + static_cast(std::min(rd, sourceValues_[0].get())); break; } case Opcode::RISCV_AMOMINU_W: // AMOMINU.W rd,rs1,rs2 case Opcode::RISCV_AMOMINU_W_AQ: case Opcode::RISCV_AMOMINU_W_RL: case Opcode::RISCV_AMOMINU_W_AQ_RL: { - results[0] = RegisterValue(signExtendW(memoryData[0].get()), 8); - memoryData[0] = - std::min(memoryData[0].get(), operands[0].get()); + results_[0] = + RegisterValue(signExtendW(memoryData_[0].get()), 8); + memoryData_[0] = std::min(memoryData_[0].get(), + sourceValues_[0].get()); break; } case Opcode::RISCV_AMOMINU_D: // AMOMINU.D rd,rs1,rs2 case Opcode::RISCV_AMOMINU_D_AQ: case Opcode::RISCV_AMOMINU_D_RL: case Opcode::RISCV_AMOMINU_D_AQ_RL: { - uint64_t rd = memoryData[0].get(); - results[0] = RegisterValue(rd, 8); - memoryData[0] = - static_cast(std::min(rd, operands[0].get())); + uint64_t rd = memoryData_[0].get(); + results_[0] = RegisterValue(rd, 8); + memoryData_[0] = + static_cast(std::min(rd, sourceValues_[0].get())); break; } @@ -651,46 +752,48 @@ void Instruction::execute() { case Opcode::RISCV_AMOMAX_W_AQ: case Opcode::RISCV_AMOMAX_W_RL: case Opcode::RISCV_AMOMAX_W_AQ_RL: { - results[0] = RegisterValue(signExtendW(memoryData[0].get()), 8); - memoryData[0] = - std::max(memoryData[0].get(), operands[0].get()); + results_[0] = + RegisterValue(signExtendW(memoryData_[0].get()), 8); + memoryData_[0] = std::max(memoryData_[0].get(), + sourceValues_[0].get()); break; } case Opcode::RISCV_AMOMAX_D: // AMOMAX.D rd,rs1,rs2 case Opcode::RISCV_AMOMAX_D_AQ: case Opcode::RISCV_AMOMAX_D_RL: case Opcode::RISCV_AMOMAX_D_AQ_RL: { - int64_t rd = memoryData[0].get(); - results[0] = RegisterValue(rd, 8); - memoryData[0] = - static_cast(std::max(rd, operands[0].get())); + int64_t rd = memoryData_[0].get(); + results_[0] = RegisterValue(rd, 8); + memoryData_[0] = + static_cast(std::max(rd, sourceValues_[0].get())); break; } case Opcode::RISCV_AMOMAXU_W: // AMOMAXU.W rd,rs1,rs2 case Opcode::RISCV_AMOMAXU_W_AQ: case Opcode::RISCV_AMOMAXU_W_RL: case Opcode::RISCV_AMOMAXU_W_AQ_RL: { - results[0] = RegisterValue(signExtendW(memoryData[0].get()), 8); - memoryData[0] = - std::max(memoryData[0].get(), operands[0].get()); + results_[0] = + RegisterValue(signExtendW(memoryData_[0].get()), 8); + memoryData_[0] = std::max(memoryData_[0].get(), + sourceValues_[0].get()); break; } case Opcode::RISCV_AMOMAXU_D: // AMOMAXU.D rd,rs1,rs2 case Opcode::RISCV_AMOMAXU_D_AQ: case Opcode::RISCV_AMOMAXU_D_RL: case Opcode::RISCV_AMOMAXU_D_AQ_RL: { - uint64_t rd = memoryData[0].get(); - results[0] = RegisterValue(rd, 8); - memoryData[0] = - static_cast(std::max(rd, operands[0].get())); + uint64_t rd = memoryData_[0].get(); + results_[0] = RegisterValue(rd, 8); + memoryData_[0] = + static_cast(std::max(rd, sourceValues_[0].get())); break; } // Integer multiplication division extension (M) case Opcode::RISCV_MUL: { // MUL rd,rs1,rs2 - const int64_t rs1 = operands[0].get(); - const int64_t rs2 = operands[1].get(); - results[0] = RegisterValue(static_cast(rs1 * rs2), 8); + const int64_t rs1 = sourceValues_[0].get(); + const int64_t rs2 = sourceValues_[1].get(); + results_[0] = RegisterValue(static_cast(rs1 * rs2), 8); break; } // case Opcode::RISCV_MULH: {//MULH rd,rs1,rs2 @@ -702,9 +805,9 
@@ void Instruction::execute() { // break; // } case Opcode::RISCV_MULHU: { // MULHU rd,rs1,rs2 - const uint64_t rs1 = operands[0].get(); - const uint64_t rs2 = operands[1].get(); - results[0] = RegisterValue(mulhiuu(rs1, rs2), 8); + const uint64_t rs1 = sourceValues_[0].get(); + const uint64_t rs2 = sourceValues_[1].get(); + results_[0] = RegisterValue(mulhiuu(rs1, rs2), 8); break; } // case Opcode::RISCV_MULHSU: {//MULHSU rd,rs1,rs2 @@ -716,117 +819,874 @@ void Instruction::execute() { // break; // } case Opcode::RISCV_MULW: { // MULW rd,rs1,rs2 - const uint32_t rs1 = operands[0].get(); - const uint32_t rs2 = operands[1].get(); - results[0] = RegisterValue(signExtendW(rs1 * rs2), 8); + const uint32_t rs1 = sourceValues_[0].get(); + const uint32_t rs2 = sourceValues_[1].get(); + results_[0] = RegisterValue(signExtendW(rs1 * rs2), 8); break; } case Opcode::RISCV_DIV: { // DIV rd,rs1,rs2 - const int64_t rs1 = operands[0].get(); - const int64_t rs2 = operands[1].get(); + const int64_t rs1 = sourceValues_[0].get(); + const int64_t rs2 = sourceValues_[1].get(); if (rs2 == 0) { // divide by zero - results[0] = RegisterValue(static_cast(-1), 8); + results_[0] = RegisterValue(static_cast(-1), 8); } else if (rs1 == static_cast(0x8000000000000000) && rs2 == -1) { // division overflow - results[0] = RegisterValue(rs1, 8); + results_[0] = RegisterValue(rs1, 8); } else { - results[0] = RegisterValue(static_cast(rs1 / rs2), 8); + results_[0] = RegisterValue(static_cast(rs1 / rs2), 8); } break; } case Opcode::RISCV_DIVW: { // DIVW rd,rs1,rs2 - const int32_t rs1 = operands[0].get(); - const int32_t rs2 = operands[1].get(); + const int32_t rs1 = sourceValues_[0].get(); + const int32_t rs2 = sourceValues_[1].get(); if (rs2 == 0) { // divide by zero - results[0] = RegisterValue(static_cast(-1), 8); + results_[0] = RegisterValue(static_cast(-1), 8); } else if (rs1 == static_cast(0x80000000) && rs2 == -1) { // division overflow - results[0] = RegisterValue(static_cast(signExtendW(rs1)), 8); + results_[0] = RegisterValue(static_cast(signExtendW(rs1)), 8); } else { - results[0] = + results_[0] = RegisterValue(static_cast(signExtendW(rs1 / rs2)), 8); } break; } case Opcode::RISCV_DIVU: { // DIVU rd,rs1,rs2 - const uint64_t rs1 = operands[0].get(); - const uint64_t rs2 = operands[1].get(); + const uint64_t rs1 = sourceValues_[0].get(); + const uint64_t rs2 = sourceValues_[1].get(); if (rs2 == 0) { // divide by zero - results[0] = RegisterValue(static_cast(-1), 8); + results_[0] = RegisterValue(static_cast(-1), 8); } else { - results[0] = RegisterValue(static_cast(rs1 / rs2), 8); + results_[0] = RegisterValue(static_cast(rs1 / rs2), 8); } break; } case Opcode::RISCV_DIVUW: { // DIVUW rd,rs1,rs2 - const uint32_t rs1 = operands[0].get(); - const uint32_t rs2 = operands[1].get(); + const uint32_t rs1 = sourceValues_[0].get(); + const uint32_t rs2 = sourceValues_[1].get(); if (rs2 == 0) { // divide by zero - results[0] = RegisterValue(static_cast(-1), 8); + results_[0] = RegisterValue(static_cast(-1), 8); } else { - results[0] = + results_[0] = RegisterValue(static_cast(signExtendW(rs1 / rs2)), 8); } break; } case Opcode::RISCV_REM: { // REM rd,rs1,rs2 - const int64_t rs1 = operands[0].get(); - const int64_t rs2 = operands[1].get(); + const int64_t rs1 = sourceValues_[0].get(); + const int64_t rs2 = sourceValues_[1].get(); if (rs2 == 0) { // divide by zero - results[0] = RegisterValue(static_cast(rs1), 8); + results_[0] = RegisterValue(static_cast(rs1), 8); } else if (rs1 == static_cast(0x8000000000000000) && rs2 == 
-1) { // division overflow - results[0] = RegisterValue(static_cast(0), 8); + results_[0] = RegisterValue(static_cast(0), 8); } else { - results[0] = RegisterValue(static_cast(rs1 % rs2), 8); + results_[0] = RegisterValue(static_cast(rs1 % rs2), 8); } break; } case Opcode::RISCV_REMW: { // REMW rd,rs1,rs2 - const int32_t rs1 = operands[0].get(); - const int32_t rs2 = operands[1].get(); + const int32_t rs1 = sourceValues_[0].get(); + const int32_t rs2 = sourceValues_[1].get(); if (rs2 == 0) { // divide by zero - results[0] = RegisterValue(static_cast(signExtendW(rs1)), 8); + results_[0] = RegisterValue(static_cast(signExtendW(rs1)), 8); } else if (rs1 == static_cast(0x80000000) && rs2 == -1) { // division overflow - results[0] = RegisterValue(static_cast(0), 8); + results_[0] = RegisterValue(static_cast(0), 8); } else { - results[0] = + results_[0] = RegisterValue(static_cast(signExtendW(rs1 % rs2)), 8); } break; } case Opcode::RISCV_REMU: { // REMU rd,rs1,rs2 - const uint64_t rs1 = operands[0].get(); - const uint64_t rs2 = operands[1].get(); + const uint64_t rs1 = sourceValues_[0].get(); + const uint64_t rs2 = sourceValues_[1].get(); if (rs2 == 0) { // divide by zero - results[0] = RegisterValue(rs1, 8); + results_[0] = RegisterValue(rs1, 8); } else { - results[0] = RegisterValue(static_cast(rs1 % rs2), 8); + results_[0] = RegisterValue(static_cast(rs1 % rs2), 8); } break; } case Opcode::RISCV_REMUW: { // REMUW rd,rs1,rs2 - const uint32_t rs1 = operands[0].get(); - const uint32_t rs2 = operands[1].get(); + const uint32_t rs1 = sourceValues_[0].get(); + const uint32_t rs2 = sourceValues_[1].get(); if (rs2 == 0) { // divide by zero - results[0] = RegisterValue(static_cast(signExtendW(rs1)), 8); + results_[0] = RegisterValue(static_cast(signExtendW(rs1)), 8); } else { - results[0] = + results_[0] = RegisterValue(static_cast(signExtendW(rs1 % rs2)), 8); } break; } + // Control and Status Register extension (Zicsr) + + // Currently do not read-modify-write ATOMICALLY + // Left mostly unimplemented due to Capstone being unable to disassemble + // CSR addresses. Some partial functionality is implemented for + // correctness of other extensions + case Opcode::RISCV_CSRRW: { // CSRRW rd,csr,rs1 + // TODO dummy implementation to allow progression and correct setting of + // floating point rounding modes. Full functionality to be implemented + // with Zicsr implementation + + // Raise exception to force pipeline flush and commit of all older + // instructions in program order before execution. 
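For context, the CSR that matters most for this dummy CSRRW path is fcsr; a minimal sketch of its field layout per riscv-spec-20191213, using hypothetical helpers rather than SimEng code.

```cpp
#include <cstdint>

// Hypothetical helpers: fcsr field layout. fflags occupies bits 4:0
// (NV|DZ|OF|UF|NX) and frm occupies bits 7:5; writes to frm/fcsr are what
// ultimately drive the rounding-mode handling above.
uint8_t getFflags(uint32_t fcsr) { return fcsr & 0x1f; }
uint8_t getFrm(uint32_t fcsr) { return (fcsr >> 5) & 0x7; }
uint32_t setFrm(uint32_t fcsr, uint8_t frm) {
  return (fcsr & ~0xe0u) | (uint32_t(frm & 0x7) << 5);
}
```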
Execution + // logic in ExceptionHandler.cc + exceptionEncountered_ = true; + exception_ = InstructionException::PipelineFlush; + + break; + } + case Opcode::RISCV_CSRRWI: { // CSRRWI rd,csr,imm + executionNYI(); + break; + } + case Opcode::RISCV_CSRRS: { // CSRRS rd,csr,rs1 + // dummy implementation to allow progression + // TODO implement fully when Zicsr extension is supported + results_[0] = RegisterValue(static_cast(0), 8); + break; + } + case Opcode::RISCV_CSRRSI: { // CSRRSI rd,csr,imm + executionNYI(); + break; + } + case Opcode::RISCV_CSRRC: { // CSRRC rd,csr,rs1 + executionNYI(); + break; + } + case Opcode::RISCV_CSRRCI: { // CSRRCI rd,csr,imm + executionNYI(); + break; + } + + // Single-Precision Floating-Point (F) + // Double-Precision Floating-Point (D) + case Opcode::RISCV_FSD: { // FSD rs1,rs2,imm + memoryData_[0] = sourceValues_[0]; + break; + } + case Opcode::RISCV_FSW: { // FSW rs1,rs2,imm + memoryData_[0] = sourceValues_[0]; + break; + } + case Opcode::RISCV_FLD: { // FLD rd,rs1,imm + results_[0] = memoryData_[0].get(); + break; + } + case Opcode::RISCV_FLW: { // FLW rd,rs1,imm + const float memSingle = memoryData_[0].get(); + + results_[0] = RegisterValue(NanBoxFloat(memSingle), 8); + break; + } + + case Opcode::RISCV_FADD_D: { // FADD.D rd,rs1,rs2 + setStaticRoundingModeThen([&] { + const double rs1 = sourceValues_[0].get(); + const double rs2 = sourceValues_[1].get(); + + results_[0] = RegisterValue(rs1 + rs2, 8); + }); + break; + } + case Opcode::RISCV_FADD_S: { // FADD.S rd,rs1,rs2 + + setStaticRoundingModeThen([&] { + const float rs1 = checkNanBox(sourceValues_[0]); + const float rs2 = checkNanBox(sourceValues_[1]); + + results_[0] = RegisterValue(NanBoxFloat(rs1 + rs2), 8); + }); + break; + } + case Opcode::RISCV_FSUB_D: { // FSUB.D rd,rs1,rs2 + + setStaticRoundingModeThen([&] { + const double rs1 = sourceValues_[0].get(); + const double rs2 = sourceValues_[1].get(); + + results_[0] = RegisterValue(rs1 - rs2, 8); + }); + + break; + } + case Opcode::RISCV_FSUB_S: { // FSUB.S rd,rs1,rs2 + + setStaticRoundingModeThen([&] { + const float rs1 = checkNanBox(sourceValues_[0]); + const float rs2 = checkNanBox(sourceValues_[1]); + + results_[0] = RegisterValue(NanBoxFloat(rs1 - rs2), 8); + }); + + break; + } + case Opcode::RISCV_FDIV_D: { // FDIV.D rd,rs1,rs2 + + setStaticRoundingModeThen([&] { + const double rs1 = sourceValues_[0].get(); + const double rs2 = sourceValues_[1].get(); + + results_[0] = RegisterValue(rs1 / rs2, 8); + }); + + break; + } + case Opcode::RISCV_FDIV_S: { // FDIV.S rd,rs1,rs2 + + setStaticRoundingModeThen([&] { + const float rs1 = checkNanBox(sourceValues_[0]); + const float rs2 = checkNanBox(sourceValues_[1]); + + results_[0] = RegisterValue(NanBoxFloat(rs1 / rs2), 8); + }); + + break; + } + case Opcode::RISCV_FMUL_D: { // FMUL.D rd,rs1,rs2 + + setStaticRoundingModeThen([&] { + const double rs1 = sourceValues_[0].get(); + const double rs2 = sourceValues_[1].get(); + + results_[0] = RegisterValue(rs1 * rs2, 8); + }); + + break; + } + case Opcode::RISCV_FMUL_S: { // FMUL.S rd,rs1,rs2 + + setStaticRoundingModeThen([&] { + const float rs1 = checkNanBox(sourceValues_[0]); + const float rs2 = checkNanBox(sourceValues_[1]); + + results_[0] = RegisterValue(NanBoxFloat(rs1 * rs2), 8); + }); + + break; + } + case Opcode::RISCV_FSQRT_D: { // FSQRT.D rd,rs1 + + setStaticRoundingModeThen([&] { + const double rs1 = sourceValues_[0].get(); + + const double sqrtAns = sqrt(rs1); + + // With -ve rs1, sqrt = -NaN, but qemu returns canonical (+)NaN. 
Adjust + // for this here + const double res = std::isnan(sqrtAns) ? nanf("0") : sqrtAns; + + results_[0] = RegisterValue(res, 8); + }); + + break; + } + case Opcode::RISCV_FSQRT_S: { // FSQRT.S rd,rs1 + + setStaticRoundingModeThen([&] { + const float rs1 = checkNanBox(sourceValues_[0]); + + const float sqrtAns = sqrtf(rs1); + + // With -ve rs1, sqrt = -NaN, but qemu returns canonical (+)NaN. Adjust + // for this here + const float res = std::isnan(sqrtAns) ? nanf("0") : sqrtAns; + + results_[0] = RegisterValue(NanBoxFloat(res), 8); + }); + + break; + } + + case Opcode::RISCV_FMIN_D: { // FMIN.D rd,rs1,rs2 + const double rs1 = sourceValues_[0].get(); + const double rs2 = sourceValues_[1].get(); + + // cpp fmin reference: This function is not required to be sensitive to + // the sign of zero, although some implementations additionally enforce + // that if one argument is +0 and the other is -0, then +0 is returned. + // But RISC-V spec requires -0.0 to be considered < +0.0 + if (rs1 == 0 && rs2 == 0) { + results_[0] = RegisterValue(0x8000000000000000, 8); + } else { + results_[0] = RegisterValue(fmin(rs1, rs2), 8); + } + + break; + } + case Opcode::RISCV_FMIN_S: { // FMIN.S rd,rs1,rs2 + const float rs1 = checkNanBox(sourceValues_[0]); + const float rs2 = checkNanBox(sourceValues_[1]); + + // Comments regarding fminf similar to RISCV_FMIN_D + if (rs1 == 0 && rs2 == 0) { + results_[0] = RegisterValue(0xffffffff80000000, 8); + } else { + results_[0] = RegisterValue(NanBoxFloat(fminf(rs1, rs2)), 8); + } + + break; + } + case Opcode::RISCV_FMAX_D: { // FMAX.D rd,rs1,rs2 + + const double rs1 = sourceValues_[0].get(); + const double rs2 = sourceValues_[1].get(); + + // cpp fmax reference: This function is not required to be sensitive to + // the sign of zero, although some implementations additionally enforce + // that if one argument is +0 and the other is -0, then +0 is returned. + // But RISC-V spec requires this to be the case + if (rs1 == 0 && rs2 == 0) { + results_[0] = RegisterValue(0x0000000000000000, 8); + } else { + results_[0] = RegisterValue(fmax(rs1, rs2), 8); + } + break; + } + case Opcode::RISCV_FMAX_S: { // FMAX.S rd,rs1,rs2 + const float rs1 = checkNanBox(sourceValues_[0]); + const float rs2 = checkNanBox(sourceValues_[1]); + + // Comments regarding fmaxf similar to RISCV_FMAX_D + if (rs1 == 0 && rs2 == 0) { + results_[0] = RegisterValue(0xffffffff00000000, 8); + } else { + results_[0] = RegisterValue(NanBoxFloat(fmaxf(rs1, rs2)), 8); + } + break; + } + + // TODO "The fused multiply-add instructions must set the invalid + // operation exception flag when the multiplicands are ∞ and zero, even + // when the addend is a quiet NaN." pg69, require Zicsr extension + case Opcode::RISCV_FMADD_D: { // FMADD.D rd,rs1,rs2,rs3 + // The fused multiply-add instructions must set the invalid operation + // exception flag when the multiplicands are infinity and zero, even when + // the addend is a quiet NaN. 
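The FMADD.D/FMADD.S cases below call fma/fmaf so the product is rounded only once; a standalone illustration of the difference from the unfused form, using assumed values rather than SimEng code.

```cpp
#include <cmath>
#include <cstdio>

// Illustration only: std::fma rounds once, while (a * b) + c rounds the
// product to double first, losing the low-order bits of the result.
int main() {
  const double a = 1.0 + std::ldexp(1.0, -30);  // 1 + 2^-30
  const double b = 1.0 - std::ldexp(1.0, -30);  // 1 - 2^-30
  const double c = -1.0;
  // Exact product is 1 - 2^-60, which rounds to 1.0 in the unfused form
  std::printf("fused:   %.3g\n", std::fma(a, b, c));  // about -8.67e-19
  std::printf("unfused: %.3g\n", a * b + c);          // 0
  return 0;
}
```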
+ + setStaticRoundingModeThen([&] { + const double rs1 = sourceValues_[0].get(); + const double rs2 = sourceValues_[1].get(); + const double rs3 = sourceValues_[2].get(); + + results_[0] = RegisterValue(fma(rs1, rs2, rs3), 8); + }); + + break; + } + case Opcode::RISCV_FMADD_S: { // FMADD.S rd,rs1,rs2,rs3 + + setStaticRoundingModeThen([&] { + const float rs1 = checkNanBox(sourceValues_[0]); + const float rs2 = checkNanBox(sourceValues_[1]); + const float rs3 = checkNanBox(sourceValues_[2]); + + if (std::isnan(rs1) || std::isnan(rs2) || std::isnan(rs3)) { + results_[0] = RegisterValue(NanBoxFloat(std::nanf("")), 8); + } else { + results_[0] = RegisterValue(NanBoxFloat(fmaf(rs1, rs2, rs3)), 8); + } + }); + + break; + } + case Opcode::RISCV_FNMSUB_D: { // FNMSUB.D rd,rs1,rs2,rs3 + + setStaticRoundingModeThen([&] { + const double rs1 = sourceValues_[0].get(); + const double rs2 = sourceValues_[1].get(); + const double rs3 = sourceValues_[2].get(); + + results_[0] = RegisterValue(-(rs1 * rs2) + rs3, 8); + }); + + break; + } + case Opcode::RISCV_FNMSUB_S: { // FNMSUB.S rd,rs1,rs2,rs3 + + setStaticRoundingModeThen([&] { + const float rs1 = checkNanBox(sourceValues_[0]); + const float rs2 = checkNanBox(sourceValues_[1]); + const float rs3 = checkNanBox(sourceValues_[2]); + + if (std::isnan(rs1) || std::isnan(rs2) || std::isnan(rs3)) { + results_[0] = RegisterValue(NanBoxFloat(std::nanf("")), 8); + } else { + results_[0] = RegisterValue(NanBoxFloat(-(rs1 * rs2) + rs3), 8); + } + }); + + break; + } + case Opcode::RISCV_FMSUB_D: { // FMSUB.D rd,rs1,rs2,rs3 + + setStaticRoundingModeThen([&] { + const double rs1 = sourceValues_[0].get(); + const double rs2 = sourceValues_[1].get(); + const double rs3 = sourceValues_[2].get(); + + results_[0] = RegisterValue((rs1 * rs2) - rs3, 8); + }); + + break; + } + case Opcode::RISCV_FMSUB_S: { // FMSUB.S rd,rs1,rs2,rs3 + + setStaticRoundingModeThen([&] { + const float rs1 = checkNanBox(sourceValues_[0]); + const float rs2 = checkNanBox(sourceValues_[1]); + const float rs3 = checkNanBox(sourceValues_[2]); + + if (std::isnan(rs1) || std::isnan(rs2) || std::isnan(rs3)) { + results_[0] = RegisterValue(NanBoxFloat(std::nanf("")), 8); + } else { + results_[0] = RegisterValue(NanBoxFloat((rs1 * rs2) - rs3), 8); + } + }); + + break; + } + case Opcode::RISCV_FNMADD_D: { // FNMADD.D rd,rs1,rs2,rs3 + + setStaticRoundingModeThen([&] { + const double rs1 = sourceValues_[0].get(); + const double rs2 = sourceValues_[1].get(); + const double rs3 = sourceValues_[2].get(); + + results_[0] = RegisterValue(-(rs1 * rs2) - rs3, 8); + }); + + break; + } + case Opcode::RISCV_FNMADD_S: { // FNMADD.S rd,rs1,rs2,rs3 + + setStaticRoundingModeThen([&] { + const float rs1 = checkNanBox(sourceValues_[0]); + const float rs2 = checkNanBox(sourceValues_[1]); + const float rs3 = checkNanBox(sourceValues_[2]); + + // Some implementations return -NaN if certain inputs are NaN but spec + // requires +NaN. 
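A quick standalone check (not SimEng code) of the canonical NaN mentioned here: the spec's canonical single-precision quiet NaN is 0x7fc00000, and on common IEEE 754 targets `std::nanf("")` yields exactly that pattern, which is what the NaN-input guards in these cases return after NaN boxing.

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

int main() {
  const float qnan = std::nanf("");
  uint32_t bits = 0;
  std::memcpy(&bits, &qnan, sizeof(bits));
  // Holds on typical IEEE 754 implementations; not guaranteed by the C++
  // standard, hence this is only an illustrative check.
  assert(bits == 0x7fc00000u);
  return 0;
}
```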
Ensure this happens + if (std::isnan(rs1) || std::isnan(rs2) || std::isnan(rs3)) { + results_[0] = RegisterValue(NanBoxFloat(std::nanf("")), 8); + } else { + results_[0] = RegisterValue(NanBoxFloat(-(rs1 * rs2) - rs3), 8); + } + }); + + break; + } + case Opcode::RISCV_FCVT_D_L: { // FCVT.D.L rd,rs1 + + setStaticRoundingModeThen([&] { + const int64_t rs1 = sourceValues_[0].get(); + + results_[0] = RegisterValue((double)rs1, 8); + }); + + break; + } + case Opcode::RISCV_FCVT_D_W: { // FCVT.D.W rd,rs1 + + setStaticRoundingModeThen([&] { + const int32_t rs1 = sourceValues_[0].get(); + + results_[0] = RegisterValue((double)rs1, 8); + }); + + break; + } + case Opcode::RISCV_FCVT_S_L: { // FCVT.S.L rd,rs1 + + setStaticRoundingModeThen([&] { + const int64_t rs1 = sourceValues_[0].get(); + + results_[0] = RegisterValue(NanBoxFloat((float)rs1), 8); + }); + + break; + } + case Opcode::RISCV_FCVT_S_W: { // FCVT.S.W rd,rs1 + + setStaticRoundingModeThen([&] { + const int32_t rs1 = sourceValues_[0].get(); + + results_[0] = RegisterValue(NanBoxFloat((float)rs1), 8); + }); + + break; + } + case Opcode::RISCV_FCVT_W_D: { // FCVT.W.D rd,rs1 + + setStaticRoundingModeThen([&] { + const double rs1 = sourceValues_[0].get(); + + if (std::isnan(rs1)) { + results_[0] = RegisterValue(0x7FFFFFFF, 8); + } else { + results_[0] = RegisterValue(signExtendW((int32_t)rint(rs1)), 8); + } + }); + + break; + } + case Opcode::RISCV_FCVT_W_S: { // FCVT.W.S rd,rs1 + + setStaticRoundingModeThen([&] { + const float rs1 = checkNanBox(sourceValues_[0]); + + if (std::isnan(rs1)) { + results_[0] = RegisterValue(0x7FFFFFFF, 8); + } else { + results_[0] = RegisterValue(signExtendW((int32_t)rintf(rs1)), 8); + } + }); + + break; + } + case Opcode::RISCV_FCVT_L_D: { // FCVT.L.D rd,rs1 + + setStaticRoundingModeThen([&] { + const double rs1 = sourceValues_[0].get(); + + if (std::isnan(rs1)) { + results_[0] = RegisterValue(0x7FFFFFFFFFFFFFFF, 8); + } else { + results_[0] = RegisterValue((int64_t)rint(rs1), 8); + } + }); + + break; + } + case Opcode::RISCV_FCVT_L_S: { // FCVT.L.S rd,rs1 + + setStaticRoundingModeThen([&] { + const float rs1 = checkNanBox(sourceValues_[0]); + + if (std::isnan(rs1)) { + results_[0] = RegisterValue(0x7FFFFFFFFFFFFFFF, 8); + } else { + results_[0] = RegisterValue((int64_t)rintf(rs1), 8); + } + }); + + break; + } + case Opcode::RISCV_FCVT_WU_D: { // FCVT.WU.D rd,rs1 + + setStaticRoundingModeThen([&] { + const double rs1 = sourceValues_[0].get(); + + if (std::isnan(rs1) || rs1 >= pow(2, 32) - 1) { + results_[0] = RegisterValue(0xFFFFFFFFFFFFFFFF, 8); + } else { + if (rs1 < 0) { + // TODO: set csr flag when Zicsr implementation is complete + results_[0] = RegisterValue((uint64_t)0, 8); + } else { + results_[0] = RegisterValue(signExtendW((uint32_t)rint(rs1)), 8); + } + } + }); + + break; + } + case Opcode::RISCV_FCVT_WU_S: { // FCVT.WU.S rd,rs1 + + setStaticRoundingModeThen([&] { + const float rs1 = checkNanBox(sourceValues_[0]); + + if (std::isnan(rs1) || rs1 >= pow(2, 32) - 1) { + results_[0] = RegisterValue(0xFFFFFFFFFFFFFFFF, 8); + } else { + if (rs1 < 0) { + // TODO: set csr flag when Zicsr implementation is complete + results_[0] = RegisterValue((uint64_t)0, 8); + } else { + results_[0] = RegisterValue(signExtendW((uint32_t)rintf(rs1)), 8); + } + } + }); + + break; + } + case Opcode::RISCV_FCVT_LU_D: { // FCVT.LU.D rd,rs1 + + setStaticRoundingModeThen([&] { + const double rs1 = sourceValues_[0].get(); + + if (std::isnan(rs1) || rs1 >= pow(2, 64) - 1) { + results_[0] = RegisterValue(0xFFFFFFFFFFFFFFFF, 8); + } 
else { + if (rs1 < 0) { + // TODO: set csr flag when Zicsr implementation is complete + results_[0] = RegisterValue((uint64_t)0, 8); + } else { + results_[0] = RegisterValue((uint64_t)rint(rs1), 8); + } + } + }); + + break; + } + case Opcode::RISCV_FCVT_LU_S: { // FCVT.LU.S rd,rs1 + + setStaticRoundingModeThen([&] { + const float rs1 = checkNanBox(sourceValues_[0]); + + if (std::isnan(rs1) || rs1 >= pow(2, 64) - 1) { + results_[0] = RegisterValue(0xFFFFFFFFFFFFFFFF, 8); + } else { + if (rs1 < 0) { + // TODO: set csr flag when Zicsr implementation is complete + results_[0] = RegisterValue((uint64_t)0, 8); + } else { + results_[0] = RegisterValue((uint64_t)rintf(rs1), 8); + } + } + }); + + break; + } + case Opcode::RISCV_FCVT_D_LU: { // FCVT.D.LU rd,rs1 + + setStaticRoundingModeThen([&] { + const uint64_t rs1 = sourceValues_[0].get(); + + results_[0] = RegisterValue((double)rs1, 8); + }); + + break; + } + case Opcode::RISCV_FCVT_D_WU: { // FCVT.D.WU rd,rs1 + + setStaticRoundingModeThen([&] { + const uint32_t rs1 = sourceValues_[0].get(); + + results_[0] = RegisterValue((double)rs1, 8); + }); + + break; + } + case Opcode::RISCV_FCVT_S_LU: { // FCVT.S.LU rd,rs1 + + setStaticRoundingModeThen([&] { + const uint64_t rs1 = sourceValues_[0].get(); + + results_[0] = RegisterValue(NanBoxFloat((float)rs1), 8); + }); + + break; + } + case Opcode::RISCV_FCVT_S_WU: { // FCVT.S.WU rd,rs1 + + setStaticRoundingModeThen([&] { + const uint32_t rs1 = sourceValues_[0].get(); + + results_[0] = RegisterValue(NanBoxFloat((float)rs1), 8); + }); + + break; + } + + case Opcode::RISCV_FCVT_D_S: { // FCVT.D.S rd,rs1 + const float rs1 = checkNanBox(sourceValues_[0]); + + results_[0] = RegisterValue((double)rs1, 8); + break; + } + case Opcode::RISCV_FCVT_S_D: { // FCVT.S.D rd,rs1 + const double rs1 = sourceValues_[0].get(); + + results_[0] = RegisterValue(NanBoxFloat((float)rs1), 8); + break; + } + + case Opcode::RISCV_FSGNJ_D: { // FSGNJ.D rd,rs1,rs2 + const double rs1 = sourceValues_[0].get(); + const double rs2 = sourceValues_[1].get(); + + results_[0] = RegisterValue(std::copysign(rs1, rs2), 8); + break; + } + case Opcode::RISCV_FSGNJ_S: { // FSGNJ.S rd,rs1,rs2 + const float rs1 = checkNanBox(sourceValues_[0]); + const float rs2 = checkNanBox(sourceValues_[1]); + + results_[0] = RegisterValue(NanBoxFloat(std::copysign(rs1, rs2)), 8); + break; + } + case Opcode::RISCV_FSGNJN_D: { // FSGNJN.D rd,rs1,rs2 + const double rs1 = sourceValues_[0].get(); + const double rs2 = sourceValues_[1].get(); + + results_[0] = RegisterValue(std::copysign(rs1, -rs2), 8); + break; + } + + case Opcode::RISCV_FSGNJN_S: { // FSGNJN.S rd,rs1,rs2 + const float rs1 = checkNanBox(sourceValues_[0]); + const float rs2 = checkNanBox(sourceValues_[1]); + + results_[0] = RegisterValue(NanBoxFloat(std::copysign(rs1, -rs2)), 8); + break; + } + case Opcode::RISCV_FSGNJX_D: { // FSGNJX.D rd,rs1,rs2 + const double rs1 = sourceValues_[0].get(); + const double rs2 = sourceValues_[1].get(); + + const double xorSign = pow(-1, std::signbit(rs1) ^ std::signbit(rs2)); + + results_[0] = RegisterValue(std::copysign(rs1, xorSign), 8); + break; + } + case Opcode::RISCV_FSGNJX_S: { // FSGNJX.S rd,rs1,rs2 + const float rs1 = checkNanBox(sourceValues_[0]); + const float rs2 = checkNanBox(sourceValues_[1]); + + const float xorSign = pow(-1, std::signbit(rs1) ^ std::signbit(rs2)); + + results_[0] = RegisterValue(NanBoxFloat(std::copysign(rs1, xorSign)), 8); + break; + } + + case Opcode::RISCV_FMV_D_X: { // FMV.D.X rd,rs1 + const double rs1 = sourceValues_[0].get(); + + 
results_[0] = RegisterValue(rs1, 8); + break; + } + case Opcode::RISCV_FMV_X_D: { // FMV.X.D rd,rs1 + const double rs1 = sourceValues_[0].get(); + + results_[0] = RegisterValue(rs1, 8); + break; + } + case Opcode::RISCV_FMV_W_X: { // FMV.W.X rd,rs1 + const float rs1 = sourceValues_[0].get(); + + results_[0] = RegisterValue(NanBoxFloat(rs1), 8); + break; + } + case Opcode::RISCV_FMV_X_W: { // FMV.X.W rd,rs1 + const uint64_t rs1 = sourceValues_[0].get(); + + results_[0] = RegisterValue(signExtendW(rs1), 8); + break; + } + + // TODO FLT.S and FLE.S perform what the IEEE 754-2008 standard refers + // to as signaling comparisons: that is, they set the invalid operation + // exception flag if either input is NaN. FEQ.S performs a quiet + // comparison: it only sets the invalid operation exception flag if + // either input is a signaling NaN. For all three instructions, the + // result is 0 if either operand is NaN. This requires a proper + // implementation of the Zicsr extension + case Opcode::RISCV_FEQ_D: { // FEQ.D rd,rs1,rs2 + // TODO FEQ.S performs a quiet + // comparison: it only sets the invalid operation exception flag if + // either input is a signaling NaN. Qemu doesn't seem to set CSR flags + // with sNANs so unsure of correct implementation. Also require proper + // Zicsr implementation + const double rs1 = sourceValues_[0].get(); + const double rs2 = sourceValues_[1].get(); + + if (rs1 == rs2 && !std::isnan(rs1) && !std::isnan(rs2)) { + results_[0] = RegisterValue(static_cast(1), 8); + } else { + results_[0] = RegisterValue(static_cast(0), 8); + } + break; + } + case Opcode::RISCV_FEQ_S: { // FEQ.S rd,rs1,rs2 + const float rs1 = checkNanBox(sourceValues_[0]); + const float rs2 = checkNanBox(sourceValues_[1]); + + if (rs1 == rs2 && !std::isnan(rs1) && !std::isnan(rs2)) { + results_[0] = RegisterValue(static_cast(1), 8); + } else { + results_[0] = RegisterValue(static_cast(0), 8); + } + break; + } + case Opcode::RISCV_FLT_D: { // FLT.D rd,rs1,rs2 + const double rs1 = sourceValues_[0].get(); + const double rs2 = sourceValues_[1].get(); + + if (std::isnan(rs1) || std::isnan(rs2)) { + // TODO: set csr flag when Zicsr implementation is complete + } + if (rs1 < rs2 && !std::isnan(rs1) && !std::isnan(rs2)) { + results_[0] = RegisterValue(static_cast(1), 8); + } else { + results_[0] = RegisterValue(static_cast(0), 8); + } + break; + } + case Opcode::RISCV_FLT_S: { // FLT.S rd,rs1,rs2 + const float rs1 = checkNanBox(sourceValues_[0]); + const float rs2 = checkNanBox(sourceValues_[1]); + + if (std::isnan(rs1) || std::isnan(rs2)) { + // TODO: set csr flag when Zicsr implementation is complete + } + if (rs1 < rs2 && !std::isnan(rs1) && !std::isnan(rs2)) { + results_[0] = RegisterValue(static_cast(1), 8); + } else { + results_[0] = RegisterValue(static_cast(0), 8); + } + break; + } + case Opcode::RISCV_FLE_D: { // FLE.D rd,rs1,rs2 + const double rs1 = sourceValues_[0].get(); + const double rs2 = sourceValues_[1].get(); + + if (std::isnan(rs1) || std::isnan(rs2)) { + // TODO: set csr flag when Zicsr implementation is complete + } + if (rs1 <= rs2 && !std::isnan(rs1) && !std::isnan(rs2)) { + results_[0] = RegisterValue(static_cast(1), 8); + } else { + results_[0] = RegisterValue(static_cast(0), 8); + } + break; + } + case Opcode::RISCV_FLE_S: { // FLE.S rd,rs1,rs2 + const float rs1 = checkNanBox(sourceValues_[0]); + const float rs2 = checkNanBox(sourceValues_[1]); + + if (std::isnan(rs1) || std::isnan(rs2)) { + // TODO: set csr flag when Zicsr implementation is complete + } + if (rs1 <= rs2 && 
!std::isnan(rs1) && !std::isnan(rs2)) { + results_[0] = RegisterValue(static_cast(1), 8); + } else { + results_[0] = RegisterValue(static_cast(0), 8); + } + break; + } + case Opcode::RISCV_FCLASS_S: { + executionNYI(); + break; + } + case Opcode::RISCV_FCLASS_D: { + executionNYI(); + break; + } + default: return executionNYI(); } diff --git a/src/lib/branchpredictors/AlwaysNotTakenPredictor.cc b/src/lib/branchpredictors/AlwaysNotTakenPredictor.cc new file mode 100644 index 0000000000..f9ccb416bc --- /dev/null +++ b/src/lib/branchpredictors/AlwaysNotTakenPredictor.cc @@ -0,0 +1,14 @@ +#include "simeng/branchpredictors/AlwaysNotTakenPredictor.hh" + +namespace simeng { +BranchPrediction AlwaysNotTakenPredictor::predict( + [[maybe_unused]] uint64_t address, BranchType type, int64_t knownOffset) { + return {false, 0}; +} + +void AlwaysNotTakenPredictor::update(uint64_t address, bool taken, + uint64_t targetAddress, BranchType type, + uint64_t instructionId) {} + +void AlwaysNotTakenPredictor::flush(uint64_t address) {} +} // namespace simeng diff --git a/src/lib/branchpredictors/GenericPredictor.cc b/src/lib/branchpredictors/GenericPredictor.cc new file mode 100644 index 0000000000..fce3fd2b1d --- /dev/null +++ b/src/lib/branchpredictors/GenericPredictor.cc @@ -0,0 +1,156 @@ +#include "simeng/branchpredictors/GenericPredictor.hh" + +#include + +namespace simeng { + +GenericPredictor::GenericPredictor(ryml::ConstNodeRef config) + : btbBits_(config["Branch-Predictor"]["BTB-Tag-Bits"].as()), + satCntBits_( + config["Branch-Predictor"]["Saturating-Count-Bits"].as()), + globalHistoryLength_( + config["Branch-Predictor"]["Global-History-Length"].as()), + rasSize_(config["Branch-Predictor"]["RAS-entries"].as()) { + // Calculate the saturation counter boundary between weakly taken and + // not-taken. `(2 ^ num_sat_cnt_bits) / 2` gives the weakly taken state + // value + uint8_t weaklyTaken = 1 << (satCntBits_ - 1); + uint8_t satCntVal = (config["Branch-Predictor"]["Fallback-Static-Predictor"] + .as() == "Always-Taken") + ? weaklyTaken + : (weaklyTaken - 1); + // Create branch prediction structures + btb_ = std::vector>(1ull << btbBits_, + {satCntVal, 0}); + + // Generate a bitmask that is used to ensure only the relevant number of + // bits are stored in the global history. This is two times the + // globalHistoryLength_ to allow rolling back of the speculatively updated + // global history in the event of a misprediction. + globalHistoryMask_ = (1ull << (globalHistoryLength_ * 2)) - 1; +} + +GenericPredictor::~GenericPredictor() { + btb_.clear(); + ras_.clear(); + rasHistory_.clear(); + ftq_.clear(); +} + +BranchPrediction GenericPredictor::predict(uint64_t address, BranchType type, + int64_t knownOffset) { + // Get index via an XOR hash between the global history and the instruction + // address. This hash is then ANDed to keep it within bounds of the btb. + // The address is shifted to remove the two least-significant bits as these + // are always 0 in an ISA with 4-byte aligned instructions. + uint64_t hashedIndex = + ((address >> 2) ^ globalHistory_) & ((1ull << btbBits_) - 1); + + // Get prediction from BTB + bool direction = btb_[hashedIndex].first >= (1ull << (satCntBits_ - 1)); + uint64_t target = + (knownOffset != 0) ? 
address + knownOffset : btb_[hashedIndex].second; + BranchPrediction prediction = {direction, target}; + + // Amend prediction based on branch type + if (type == BranchType::Unconditional) { + prediction.isTaken = true; + } else if (type == BranchType::Return) { + prediction.isTaken = true; + // Return branches can use the RAS if an entry is available + if (ras_.size() > 0) { + prediction.target = ras_.back(); + // Record top of RAS used for target prediction + rasHistory_[address] = ras_.back(); + ras_.pop_back(); + } + } else if (type == BranchType::SubroutineCall) { + prediction.isTaken = true; + // Subroutine call branches must push their associated return address to RAS + if (ras_.size() >= rasSize_) { + ras_.pop_front(); + } + ras_.push_back(address + 4); + // Record that this address is a branch-and-link instruction + rasHistory_[address] = 0; + } else if (type == BranchType::Conditional) { + if (!prediction.isTaken) prediction.target = address + 4; + } + + // Store the hashed index for correct hashing in update() + ftq_.emplace_back(prediction.isTaken, hashedIndex); + + // Speculatively update the global history + globalHistory_ = + ((globalHistory_ << 1) | prediction.isTaken) & globalHistoryMask_; + + return prediction; +} + +void GenericPredictor::update(uint64_t address, bool isTaken, + uint64_t targetAddress, BranchType type, + uint64_t instructionId) { + // Make sure that this function is called in program order; and then update + // the lastUpdatedInstructionId variable + assert(instructionId >= lastUpdatedInstructionId_ && + (lastUpdatedInstructionId_ = instructionId) >= 0 && + "Update not called on branch instructions in program order"); + + // Get previous prediction and index calculated from the FTQ + bool prevPrediction = ftq_.front().first; + uint64_t hashedIndex = ftq_.front().second; + ftq_.pop_front(); + + // Calculate 2-bit saturating counter value + uint8_t satCntVal = btb_[hashedIndex].first; + // Only alter value if it would transition to a valid state + if (!((satCntVal == (1 << satCntBits_) - 1) && isTaken) && + !(satCntVal == 0 && !isTaken)) { + satCntVal += isTaken ? 
1 : -1; + } + + // Update BTB entry + btb_[hashedIndex].first = satCntVal; + if (isTaken) { + btb_[hashedIndex].second = targetAddress; + } + + // Update global history if prediction was incorrect + if (prevPrediction != isTaken) { + // Bit-flip the global history bit corresponding to this prediction + // We know how many predictions there have since been by the size of the FTQ + globalHistory_ ^= (1ull << (ftq_.size())); + } +} + +void GenericPredictor::flush(uint64_t address) { + // If address interacted with RAS, rewind entry + auto it = rasHistory_.find(address); + if (it != rasHistory_.end()) { + uint64_t target = it->second; + if (target != 0) { + // If history entry belongs to a return instruction, push target back onto + // stack + if (ras_.size() >= rasSize_) { + ras_.pop_front(); + } + ras_.push_back(target); + } else { + // If history entry belongs to a branch-and-link instruction, pop target + // off of stack + if (ras_.size()) { + ras_.pop_back(); + } + } + rasHistory_.erase(it); + } + + assert((ftq_.size() > 0) && + "Cannot flush instruction from Branch Predictor " + "when the ftq is empty"); + ftq_.pop_back(); + + // Roll back global history + globalHistory_ >>= 1; +} +} // namespace simeng diff --git a/src/lib/branchpredictors/PerceptronPredictor.cc b/src/lib/branchpredictors/PerceptronPredictor.cc new file mode 100644 index 0000000000..2e517939eb --- /dev/null +++ b/src/lib/branchpredictors/PerceptronPredictor.cc @@ -0,0 +1,201 @@ +#include "simeng/branchpredictors/PerceptronPredictor.hh" + +namespace simeng { + +PerceptronPredictor::PerceptronPredictor(ryml::ConstNodeRef config) + : btbBits_(config["Branch-Predictor"]["BTB-Tag-Bits"].as()), + globalHistoryLength_( + config["Branch-Predictor"]["Global-History-Length"].as()), + rasSize_(config["Branch-Predictor"]["RAS-entries"].as()) { + // Build BTB based on config options + uint32_t btbSize = (1ul << btbBits_); + btb_.resize(btbSize); + + // Initialise perceptron values with 0 for the global history weights, and 1 + // for the bias weight; and initialise the target with 0 (i.e., unknown) + for (uint32_t i = 0; i < btbSize; i++) { + btb_[i].first.assign(globalHistoryLength_, 0); + btb_[i].first.push_back(1); + btb_[i].second = 0; + } + + // Set up training threshold according to empirically determined formula + trainingThreshold_ = (uint64_t)((1.93 * globalHistoryLength_) + 14); + + // Generate a bitmask that is used to ensure only the relevant number of + // bits are stored in the global history. This is two times the + // globalHistoryLength_ to allow rolling back of the speculatively updated + // global history in the event of a misprediction. + globalHistoryMask_ = (1ull << (globalHistoryLength_ * 2)) - 1; +} + +PerceptronPredictor::~PerceptronPredictor() { + ras_.clear(); + rasHistory_.clear(); + ftq_.clear(); +} + +BranchPrediction PerceptronPredictor::predict(uint64_t address, BranchType type, + int64_t knownOffset) { + // Get the hashed index for the prediction table. XOR the global history with + // the non-zero bits of the address, and then keep only the btbBits_ bits of + // the output to keep it in bounds of the prediction table. + // The address is shifted to remove the two least-significant bits as these + // are always 0 in an ISA with 4-byte aligned instructions. 
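Both GenericPredictor and PerceptronPredictor form their table index the same way: the branch address is shifted right by two (the bits that are always zero for 4-byte aligned instructions), XORed with the global history, and masked down to `btbBits_` bits. Below is a minimal standalone sketch of that hash with invented values (`btbBits = 8` and the address/history used are illustrative only, not taken from any SimEng config); the predictor's real index computation follows straight after this note.

```cpp
#include <cstdint>
#include <iostream>

int main() {
  const uint64_t btbBits = 8;                 // assumed 2^8-entry table
  const uint64_t address = 0x400a10;          // hypothetical branch address
  const uint64_t globalHistory = 0b10110100;  // 1 = taken, 0 = not taken

  // Drop the two always-zero bits of a 4-byte aligned PC, fold in the
  // history, then mask down to the table size.
  const uint64_t hashedIndex =
      ((address >> 2) ^ globalHistory) & ((1ull << btbBits) - 1);

  std::cout << std::hex << hashedIndex << std::endl;  // prints 30 (i.e. 0x30)
  return 0;
}
```

Folding the history into the index means the same static branch can land in different table entries depending on the path taken to reach it, which is what gives both predictors their per-path behaviour.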
+ uint64_t hashedIndex = + ((address >> 2) ^ globalHistory_) & ((1ull << btbBits_) - 1); + + // Retrieve the perceptron from the BTB + std::vector perceptron = btb_[hashedIndex].first; + + // Get dot product of perceptron and history + int64_t Pout = getDotProduct(perceptron, globalHistory_); + + // Determine direction prediction based on its sign + bool direction = (Pout >= 0); + + // If there is a known offset then calculate target accordingly, otherwise + // retrieve the target prediction from the btb. + uint64_t target = + (knownOffset != 0) ? address + knownOffset : btb_[hashedIndex].second; + + BranchPrediction prediction = {direction, target}; + + // Amend prediction based on branch type + if (type == BranchType::Unconditional) { + prediction.isTaken = true; + } else if (type == BranchType::Return) { + prediction.isTaken = true; + // Return branches can use the RAS if an entry is available + if (ras_.size() > 0) { + prediction.target = ras_.back(); + // Record top of RAS used for target prediction + rasHistory_[address] = ras_.back(); + ras_.pop_back(); + } + } else if (type == BranchType::SubroutineCall) { + prediction.isTaken = true; + // Subroutine call branches must push their associated return address to RAS + if (ras_.size() >= rasSize_) { + ras_.pop_front(); + } + ras_.push_back(address + 4); + // Record that this address is a branch-and-link instruction + rasHistory_[address] = 0; + } else if (type == BranchType::Conditional) { + if (!prediction.isTaken) prediction.target = address + 4; + } + + // Store the Pout and global history for correct update() -- + // needs to be global history and not the hashed index as hashing loses + // information and the global history is required for updating perceptrons. + ftq_.emplace_back(Pout, globalHistory_); + + // Speculatively update the global history based on the direction + // prediction being made + globalHistory_ = + ((globalHistory_ << 1) | prediction.isTaken) & globalHistoryMask_; + + return prediction; +} + +void PerceptronPredictor::update(uint64_t address, bool isTaken, + uint64_t targetAddress, BranchType type, + uint64_t instructionId) { + // Make sure that this function is called in program order; and then update + // the lastUpdatedInstructionId variable + assert(instructionId >= lastUpdatedInstructionId_ && + (lastUpdatedInstructionId_ = instructionId) >= 0 && + "Update not called on branch instructions in program order"); + + // Retrieve the previous global history and branch direction prediction from + // the front of the ftq (assumes branches are updated in program order). + int64_t prevPout = ftq_.front().first; + uint64_t prevGlobalHistory = ftq_.front().second; + ftq_.pop_front(); + + // Work out hashed index + uint64_t hashedIndex = + ((address >> 2) ^ prevGlobalHistory) & ((1ull << btbBits_) - 1); + + std::vector perceptron = btb_[hashedIndex].first; + + // Work out the most recent prediction + bool directionPrediction = (prevPout >= 0); + + // Update the perceptron if the prediction was wrong, or the dot product's + // magnitude was not greater than the training threshold + if ((directionPrediction != isTaken) || + (static_cast(std::abs(prevPout)) < trainingThreshold_)) { + int8_t t = (isTaken) ? 1 : -1; + + for (uint64_t i = 0; i < globalHistoryLength_; i++) { + int8_t xi = ((prevGlobalHistory & + (1ull << ((globalHistoryLength_ - 1) - i))) == 0) + ? 
-1 + : 1; + int8_t product_xi_t = xi * t; + // Make sure no overflow (+-127) + if (!(perceptron[i] == 127 && product_xi_t == 1) && + !(perceptron[i] == -127 && product_xi_t == -1)) { + perceptron[i] += product_xi_t; + } + } + perceptron[globalHistoryLength_] += t; + } + + btb_[hashedIndex].first = perceptron; + if (isTaken) { + btb_[hashedIndex].second = targetAddress; + } + + // Update global history if prediction was incorrect + // Bit-flip the global history bit corresponding to this prediction + // We know how many predictions there have since been by the size of the FTQ + if (directionPrediction != isTaken) globalHistory_ ^= (1ull << (ftq_.size())); +} + +void PerceptronPredictor::flush(uint64_t address) { + // If address interacted with RAS, rewind entry + auto it = rasHistory_.find(address); + if (it != rasHistory_.end()) { + uint64_t target = it->second; + if (target != 0) { + // If history entry belongs to a return instruction, push target back onto + // stack + if (ras_.size() >= rasSize_) { + ras_.pop_front(); + } + ras_.push_back(target); + } else { + // If history entry belongs to a branch-and-link instruction, pop target + // off of stack + if (ras_.size()) { + ras_.pop_back(); + } + } + rasHistory_.erase(it); + } + + assert((ftq_.size() > 0) && + "Cannot flush instruction from Branch Predictor " + "when the ftq is empty"); + ftq_.pop_back(); + + // Roll back global history + globalHistory_ >>= 1; +} + +int64_t PerceptronPredictor::getDotProduct( + const std::vector& perceptron, uint64_t history) { + int64_t Pout = perceptron[globalHistoryLength_]; + for (uint64_t i = 0; i < globalHistoryLength_; i++) { + // Get branch direction for ith entry in the history + bool historyTaken = + ((history & (1ull << ((globalHistoryLength_ - 1) - i))) != 0); + Pout += historyTaken ? perceptron[i] : (0 - perceptron[i]); + } + return Pout; +} + +} // namespace simeng diff --git a/src/lib/config/ModelConfig.cc b/src/lib/config/ModelConfig.cc new file mode 100644 index 0000000000..6d54b50d06 --- /dev/null +++ b/src/lib/config/ModelConfig.cc @@ -0,0 +1,1248 @@ +#define RYML_SINGLE_HDR_DEFINE_NOW +#include "simeng/config/ModelConfig.hh" + +#include + +#include "arch/aarch64/InstructionMetadata.hh" +#include "arch/riscv/InstructionMetadata.hh" + +namespace simeng { +namespace config { + +/** RISC-V opcodes. Each opcode represents a unique RISC-V operation. */ +namespace RISCVOpcode { +#define GET_INSTRINFO_ENUM +#include "RISCVGenInstrInfo.inc" +} // namespace RISCVOpcode + +/** AArch64 opcodes. Each opcode represents a unique AArch64 operation. 
*/ +namespace AARCH64Opcode { +#define GET_INSTRINFO_ENUM +#include "AArch64GenInstrInfo.inc" +} // namespace AARCH64Opcode + +ModelConfig::ModelConfig(std::string path) { + // Reset ryml::Tree used to represent the config file + configTree_.clear(); + configTree_.rootref() |= ryml::MAP; + isDefault_ = false; + + std::ifstream file(path, std::ios::binary); + // Check for file existence + if (!file.is_open()) { + std::cerr << "[SimEng:ModelConfig] Could not read " << path << std::endl; + exit(1); + } + // Read in the contents of the file and create a ryml:Tree from it + std::stringstream buffer; + buffer << file.rdbuf(); + configTree_ = ryml::parse_in_arena(ryml::to_csubstr(buffer.str())); + file.close(); + + // Set the expectations of the config file and validate the config values + // within the passed config file + setExpectations(); + validate(); +} + +ModelConfig::ModelConfig() { + // Generate the default config file + generateDefault(); +} + +void ModelConfig::validate() { + missing_.clear(); + invalid_.clear(); + + recursiveValidate(expectations_, configTree_.rootref()); + postValidation(); + + std::string missingStr = missing_.str(); + std::string invalidStr = invalid_.str(); + // Print all missing fields + if (missingStr.length()) { + std::cerr << "[SimEng:ModelConfig] The following fields are missing from " + "the provided " + "configuration file:\n" + << missingStr << std::endl; + } + // Print all invalid values + if (invalidStr.length()) { + std::cerr << "[SimEng:ModelConfig] The following values are invalid for " + "their associated field:\n" + << invalidStr << std::endl; + } + // Stop execution if the config file didn't pass checks + if (missingStr.length() || invalidStr.length()) exit(1); +} + +void ModelConfig::reGenerateDefault(ISA isa, bool force) { + // Only re-generate the default config file if it hasn't already been + // generated for the specified ISA + if (!force && (isa_ == isa && isDefault_)) return; + isa_ = isa; + generateDefault(); +} + +void ModelConfig::generateDefault() { + // Reset ryml::Tree used to represent the config file + configTree_.clear(); + configTree_.rootref() |= ryml::MAP; + isDefault_ = true; + + // Set the expectations for the default config file, construct it, and + // validate it to ensure correctness for the simulation + setExpectations(true); + constructDefault(expectations_, configTree_.root_id()); + validate(); +} + +void ModelConfig::constructDefault(ExpectationNode expectations, + size_t root_id) { + // Iterate over the expectations supplied + for (const auto& child : expectations.getChildren()) { + std::string key = child.getKey(); + ExpectedType type = child.getType(); + // If the key is a wildcard, then change it to be an appropriate value + // in the resultant config file and its type to be valueless + if (key == wildcard) { + key = "0"; + type = ExpectedType::Valueless; + } + // Create the ryml::NodeRef representing a config option + ryml::NodeRef node = configTree_.ref(root_id).append_child() + << ryml::key(key); + // If the expectation is a sequence, then add an additional ryml::NodeRef as + // a child to the former to act as the sequence of values when read in later + if (child.isSequence()) { + node |= ryml::SEQ; + node = configTree_.ref(node.id()).append_child(); + } + // Set the value of the ryml::NodeRef based on the type. 
A valueless + // expectation informs that an additional level of the YAML hierarchy is + // required, thus call constructDefault again with the new ryml::NodeRef's + // id as the root id + switch (type) { + case ExpectedType::Bool: + node << child.getDefault(); + break; + case ExpectedType::Double: + node << child.getDefault(); + break; + case ExpectedType::Float: + node << child.getDefault(); + break; + case ExpectedType::Integer8: + node << child.getDefault(); + break; + case ExpectedType::Integer16: + node << child.getDefault(); + break; + case ExpectedType::Integer32: + node << child.getDefault(); + break; + case ExpectedType::Integer64: + node << child.getDefault(); + break; + case ExpectedType::String: + node << child.getDefault(); + break; + case ExpectedType::UInteger8: + node << child.getDefault(); + break; + case ExpectedType::UInteger16: + node << child.getDefault(); + break; + case ExpectedType::UInteger32: + node << child.getDefault(); + break; + case ExpectedType::UInteger64: + node << child.getDefault(); + break; + case ExpectedType::Valueless: + node |= ryml::MAP; + constructDefault(expectations[key], node.id()); + break; + } + } +} + +void ModelConfig::addConfigOptions(std::string config) { + // Construct a temporary ryml:Tree so that the values held in the passed + // config string can be appropriately extracted + ryml::Tree tree = ryml::parse_in_arena(ryml::to_csubstr(config)); + + // Add/replace the passed config options in `configTree_` and re-run + // validation/checks + recursiveAdd(tree.rootref(), configTree_.root_id()); + setExpectations(); + + // If the config additions result in a smaller config tree then errors can + // occur where node ids are greater than the size of the ryml::Tree. To + // combat this, a new tree is created to reassign node ids such that none are + // greater than the size of the ryml::Tree + ryml::Tree tmp; + // Copy all config values over to a temporary tree + tmp.rootref() |= ryml::MAP; + tmp.duplicate_children(&configTree_, configTree_.root_id(), tmp.root_id(), + -1); + // Clear configTree_ and copy config values back over but with new assigned + // node ids + configTree_.clear(); + configTree_.rootref() |= ryml::MAP; + configTree_.duplicate_children(&tmp, tmp.root_id(), tmp.root_id(), -1); + + validate(); +} + +void ModelConfig::recursiveAdd(ryml::NodeRef node, size_t id) { + // Iterate over the config options supplied + for (ryml::NodeRef child : node.children()) { + ryml::NodeRef ref; + // If the config option doesn't already exists, add it. Otherwise get the + // reference to it + if (!configTree_.ref(id).has_child(child.key())) { + std::string key = std::string(child.key().data(), child.key().size()); + ref = configTree_.ref(id).append_child() << ryml::key(key); + // Set any appropriate ryml::NodeRef types + if (child.is_map()) { + ref |= ryml::MAP; + } + if (child.is_seq()) { + ref |= ryml::SEQ; + } + } else { + ref = configTree_.ref(id)[child.key()]; + } + if (child.is_map()) { + // If the config option had children, iterate through them. 
+ recursiveAdd(child, ref.id()); + } else if (child.is_seq()) { + // If the config option is a sequence, then add the sequence of values + // held within the config option (its children) as children to the current + // ryml::Tree node identified by `id` + ref.clear_children(); + for (size_t entry = 0; entry < child.num_children(); entry++) { + ref.append_child(); + ref[entry] << child[entry].val(); + } + } else { + // If the config option is neither a map nor a sequence, simply add its + // value to the ryml::Tree node reference + ref << child.val(); + } + } +} + +void ModelConfig::setExpectations(bool isDefault) { + // Reset expectations + expectations_ = {}; + + // Core + expectations_.addChild(ExpectationNode::createExpectation("Core")); + + if (isa_ == ISA::AArch64) + expectations_["Core"].addChild( + ExpectationNode::createExpectation("AArch64", "ISA")); + else if (isa_ == ISA::RV64) + expectations_["Core"].addChild( + ExpectationNode::createExpectation("rv64", "ISA")); + expectations_["Core"]["ISA"].setValueSet( + std::vector{"AArch64", "rv64"}); + + // Early check on [Core][ISA] as its value is needed to inform the + // expectations of other config options + if (!isDefault) { + // Ensure the key "Core" exists before querying the associated YAML node + if (configTree_.rootref().has_child(ryml::to_csubstr("Core"))) { + // Ensure the key "Core:ISA" exists before querying the associated YAML + // node + if (configTree_["Core"].has_child(ryml::to_csubstr("ISA"))) { + ValidationResult result = + expectations_["Core"]["ISA"].validateConfigNode( + configTree_["Core"]["ISA"]); + std::string ISA = configTree_["Core"]["ISA"].as(); + if (!result.valid) { + std::cerr + << "[SimEng:ModelConfig] Invalid ISA value of \"" << ISA + << "\" passed in config file due to \"" << result.message + << "\" error. Cannot continue with config validation. Exiting." + << std::endl; + exit(1); + } + // Set isa_ + if (ISA == "AArch64") { + isa_ = ISA::AArch64; + } else if (ISA == "rv64") { + isa_ = ISA::RV64; + } + } else { + std::cerr + << "[SimEng:ModelConfig] Attempted to access config key " + "\"Core:ISA\" but it doesn't exist. Cannot continue with config " + "validation. Exiting." + << std::endl; + exit(1); + } + } else { + std::cerr << "[SimEng:ModelConfig] Attempted to access config key " + "\"Core\" but it doesn't exist. Cannot continue with config " + "validation. Exiting." 
+ << std::endl; + exit(1); + } + } + createGroupMapping(); + + if (isa_ == ISA::RV64) { + expectations_["Core"].addChild( + ExpectationNode::createExpectation(false, "Compressed")); + expectations_["Core"]["Compressed"].setValueSet(std::vector{false, true}); + } + + expectations_["Core"].addChild( + ExpectationNode::createExpectation("emulation", + "Simulation-Mode")); + expectations_["Core"]["Simulation-Mode"].setValueSet( + std::vector{"emulation", "inorderpipelined", "outoforder"}); + + const float clockFreqUpperBound = 10.f; + expectations_["Core"].addChild( + ExpectationNode::createExpectation(1.f, "Clock-Frequency-GHz")); + expectations_["Core"]["Clock-Frequency-GHz"].setValueBounds( + 0.001f, clockFreqUpperBound); + + // Early check on ["Core"]["Clock-Frequency-GHz"] as values are needed to + // inform the expected lower bound of the ["Core"]["Timer-Frequency-MHz"] + // value + uint64_t tFreqUpperBound = clockFreqUpperBound * 1000; + if (!isDefault) { + // Ensure the key "Core" exists before querying the associated YAML node + if (configTree_.rootref().has_child(ryml::to_csubstr("Core"))) { + // Ensure the key "Core:Clock-Frequency-GHz" exists before querying the + // associated YAML node + if (configTree_["Core"].has_child( + ryml::to_csubstr("Clock-Frequency-GHz"))) { + ValidationResult result = + expectations_["Core"]["Clock-Frequency-GHz"].validateConfigNode( + configTree_["Core"]["Clock-Frequency-GHz"]); + float clockFreq = + configTree_["Core"]["Clock-Frequency-GHz"].as(); + if (!result.valid) { + std::cerr + << "[SimEng:ModelConfig] Invalid Clock-Frequency-GHz value of \"" + << clockFreq << "\" passed in config file due to \"" + << result.message + << "\" error. Cannot continue with config validation. Exiting." + << std::endl; + exit(1); + } + + tFreqUpperBound = clockFreq * 1000; + } else { + std::cerr << "[SimEng:ModelConfig] Attempted to access config key " + "\"Core:Clock-Frequency-GHz\" but it doesn't exist. " + "Cannot continue with config " + "validation. Exiting." + << std::endl; + exit(1); + } + } else { + std::cerr << "[SimEng:ModelConfig] Attempted to access config key " + "\"Core\" but it doesn't exist. Cannot continue with config " + "validation. Exiting." 
+ << std::endl; + exit(1); + } + } + + expectations_["Core"].addChild( + ExpectationNode::createExpectation(100, "Timer-Frequency-MHz")); + expectations_["Core"]["Timer-Frequency-MHz"].setValueBounds( + 1, tFreqUpperBound); + + expectations_["Core"].addChild(ExpectationNode::createExpectation( + false, "Micro-Operations", true)); + expectations_["Core"]["Micro-Operations"].setValueSet( + std::vector{false, true}); + + if (isa_ == ISA::AArch64) { + expectations_["Core"].addChild(ExpectationNode::createExpectation( + 128, "Vector-Length", true)); + expectations_["Core"]["Vector-Length"].setValueSet( + std::vector{128, 256, 384, 512, 640, 768, 896, 1024, 1152, + 1280, 1408, 1536, 1664, 1792, 1920, 2048}); + + expectations_["Core"].addChild(ExpectationNode::createExpectation( + 128, "Streaming-Vector-Length", true)); + expectations_["Core"]["Streaming-Vector-Length"].setValueSet( + std::vector{128, 256, 512, 1024, 2048}); + } + + // Fetch + expectations_.addChild(ExpectationNode::createExpectation("Fetch")); + + expectations_["Fetch"].addChild( + ExpectationNode::createExpectation(32, "Fetch-Block-Size")); + expectations_["Fetch"]["Fetch-Block-Size"].setValueSet(std::vector{ + 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768}); + + expectations_["Fetch"].addChild( + ExpectationNode::createExpectation(32, "Loop-Buffer-Size")); + expectations_["Fetch"]["Loop-Buffer-Size"].setValueBounds( + 0, UINT16_MAX); + + expectations_["Fetch"].addChild(ExpectationNode::createExpectation( + 5, "Loop-Detection-Threshold")); + expectations_["Fetch"]["Loop-Detection-Threshold"].setValueBounds( + 0, UINT16_MAX); + + // Process-Image + expectations_.addChild(ExpectationNode::createExpectation("Process-Image")); + + expectations_["Process-Image"].addChild( + ExpectationNode::createExpectation(100000, "Heap-Size")); + expectations_["Process-Image"]["Heap-Size"].setValueBounds( + 1, UINT64_MAX); + + expectations_["Process-Image"].addChild( + ExpectationNode::createExpectation(100000, "Stack-Size")); + expectations_["Process-Image"]["Stack-Size"].setValueBounds( + 1, UINT64_MAX); + + // Register-Set + expectations_.addChild(ExpectationNode::createExpectation("Register-Set")); + if (isa_ == ISA::AArch64) { + // TODO: Reduce to 32 once renaming issue has been sorted. Also replace in + // ConfigTest. + expectations_["Register-Set"].addChild( + ExpectationNode::createExpectation(38, + "GeneralPurpose-Count")); + expectations_["Register-Set"]["GeneralPurpose-Count"] + .setValueBounds(38, UINT16_MAX); + + // TODO: Reduce to 32 once renaming issue has been sorted. Also replace in + // ConfigTest. 
+ expectations_["Register-Set"].addChild( + ExpectationNode::createExpectation( + 38, "FloatingPoint/SVE-Count")); + expectations_["Register-Set"]["FloatingPoint/SVE-Count"] + .setValueBounds(38, UINT16_MAX); + + expectations_["Register-Set"].addChild( + ExpectationNode::createExpectation(17, "Predicate-Count", + true)); + expectations_["Register-Set"]["Predicate-Count"].setValueBounds( + 17, UINT16_MAX); + + expectations_["Register-Set"].addChild( + ExpectationNode::createExpectation(1, "Conditional-Count")); + expectations_["Register-Set"]["Conditional-Count"].setValueBounds( + 1, UINT16_MAX); + + expectations_["Register-Set"].addChild( + ExpectationNode::createExpectation(1, "SME-Matrix-Count", + true)); + expectations_["Register-Set"]["SME-Matrix-Count"].setValueBounds( + 1, UINT16_MAX); + + expectations_["Register-Set"].addChild( + ExpectationNode::createExpectation( + 1, "SME-Lookup-Table-Count", true)); + expectations_["Register-Set"]["SME-Lookup-Table-Count"] + .setValueBounds(1, UINT16_MAX); + } else if (isa_ == ISA::RV64) { + // TODO: Reduce to 32 once renaming issue has been sorted. Also replace in + // ConfigTest. + expectations_["Register-Set"].addChild( + ExpectationNode::createExpectation(38, + "GeneralPurpose-Count")); + // TODO: Reduce to 32 once renaming issue has been sorted + expectations_["Register-Set"]["GeneralPurpose-Count"] + .setValueBounds(38, UINT16_MAX); + + // TODO: Reduce to 32 once renaming issue has been sorted. Also replace in + // ConfigTest. + expectations_["Register-Set"].addChild( + ExpectationNode::createExpectation(38, + "FloatingPoint-Count")); + // TODO: Reduce to 32 once renaming issue has been sorted + expectations_["Register-Set"]["FloatingPoint-Count"] + .setValueBounds(38, UINT16_MAX); + } + + // Pipeline-Widths + expectations_.addChild(ExpectationNode::createExpectation("Pipeline-Widths")); + + expectations_["Pipeline-Widths"].addChild( + ExpectationNode::createExpectation(1, "Commit")); + expectations_["Pipeline-Widths"]["Commit"].setValueBounds( + 1, UINT16_MAX); + + expectations_["Pipeline-Widths"].addChild( + ExpectationNode::createExpectation(1, "FrontEnd")); + expectations_["Pipeline-Widths"]["FrontEnd"].setValueBounds( + 1, UINT16_MAX); + + expectations_["Pipeline-Widths"].addChild( + ExpectationNode::createExpectation(1, "LSQ-Completion")); + expectations_["Pipeline-Widths"]["LSQ-Completion"].setValueBounds( + 1, UINT16_MAX); + + // Queue-Sizes + expectations_.addChild(ExpectationNode::createExpectation("Queue-Sizes")); + + expectations_["Queue-Sizes"].addChild( + ExpectationNode::createExpectation(32, "ROB")); + expectations_["Queue-Sizes"]["ROB"].setValueBounds(1, UINT32_MAX); + + expectations_["Queue-Sizes"].addChild( + ExpectationNode::createExpectation(16, "Load")); + expectations_["Queue-Sizes"]["Load"].setValueBounds(1, UINT32_MAX); + + expectations_["Queue-Sizes"].addChild( + ExpectationNode::createExpectation(16, "Store")); + expectations_["Queue-Sizes"]["Store"].setValueBounds(1, UINT32_MAX); + + // Port-Allocator + expectations_.addChild(ExpectationNode::createExpectation("Port-Allocator")); + expectations_["Port-Allocator"].addChild( + ExpectationNode::createExpectation("Balanced", "Type")); + expectations_["Port-Allocator"]["Type"].setValueSet( + std::vector{"Balanced", "A64FX", "M1"}); + + // Branch-Predictor + expectations_.addChild( + ExpectationNode::createExpectation("Branch-Predictor")); + + expectations_["Branch-Predictor"].addChild( + ExpectationNode::createExpectation("Perceptron", "Type")); + 
expectations_["Branch-Predictor"]["Type"].setValueSet( + std::vector{"Generic", "Perceptron"}); + + expectations_["Branch-Predictor"].addChild( + ExpectationNode::createExpectation(8, "BTB-Tag-Bits")); + expectations_["Branch-Predictor"]["BTB-Tag-Bits"].setValueBounds(1, + 64); + + expectations_["Branch-Predictor"].addChild( + ExpectationNode::createExpectation(8, "Global-History-Length")); + expectations_["Branch-Predictor"]["Global-History-Length"] + .setValueBounds(1, 32); + + expectations_["Branch-Predictor"].addChild( + ExpectationNode::createExpectation(8, "RAS-entries")); + expectations_["Branch-Predictor"]["RAS-entries"].setValueBounds( + 1, UINT16_MAX); + + // The saturating counter bits and the fallback predictor + // are relevant to the GenericPredictor only + if (!isDefault) { + // Ensure the key "Branch-Predictor" exists before querying the associated + // YAML node + if (configTree_.rootref().has_child(ryml::to_csubstr("Branch-Predictor"))) { + // Ensure the key "Branch-Predictor:Type" exists before querying the + // associated YAML node + if (configTree_["Branch-Predictor"].has_child(ryml::to_csubstr("Type"))) { + if (configTree_["Branch-Predictor"]["Type"].as() == + "Generic") { + expectations_["Branch-Predictor"].addChild( + ExpectationNode::createExpectation( + 2, "Saturating-Count-Bits")); + expectations_["Branch-Predictor"]["Saturating-Count-Bits"] + .setValueBounds(1, 64); + + expectations_["Branch-Predictor"].addChild( + ExpectationNode::createExpectation( + "Always-Taken", "Fallback-Static-Predictor")); + expectations_["Branch-Predictor"]["Fallback-Static-Predictor"] + .setValueSet( + std::vector{"Always-Taken", "Always-Not-Taken"}); + } + } else { + std::cerr << "[SimEng:ModelConfig] Attempted to access config key " + "\"Branch-Predictor:Type\" but it doesn't exist. " + "Cannot continue with config " + "validation. Exiting." + << std::endl; + exit(1); + } + } else { + std::cerr << "[SimEng:ModelConfig] Attempted to access config key " + "\"Branch-Predictor\" but it doesn't exist. Cannot continue " + "with config " + "validation. Exiting." 
+ << std::endl; + exit(1); + } + } + + // L1-Data-Memory + expectations_.addChild(ExpectationNode::createExpectation("L1-Data-Memory")); + + expectations_["L1-Data-Memory"].addChild( + ExpectationNode::createExpectation("Flat", + "Interface-Type")); + expectations_["L1-Data-Memory"]["Interface-Type"].setValueSet( + std::vector{"Flat", "Fixed", "External"}); + + // L1-Instruction-Memory + expectations_.addChild( + ExpectationNode::createExpectation("L1-Instruction-Memory")); + + expectations_["L1-Instruction-Memory"].addChild( + ExpectationNode::createExpectation("Flat", + "Interface-Type")); + expectations_["L1-Instruction-Memory"]["Interface-Type"].setValueSet( + std::vector{"Flat", "Fixed", "External"}); + + // LSQ-L1-Interface + expectations_.addChild( + ExpectationNode::createExpectation("LSQ-L1-Interface")); + + expectations_["LSQ-L1-Interface"].addChild( + ExpectationNode::createExpectation(4, "Access-Latency")); + expectations_["LSQ-L1-Interface"]["Access-Latency"].setValueBounds( + 1, UINT16_MAX); + + expectations_["LSQ-L1-Interface"].addChild( + ExpectationNode::createExpectation(false, "Exclusive")); + expectations_["LSQ-L1-Interface"]["Exclusive"].setValueSet( + std::vector{false, true}); + + expectations_["LSQ-L1-Interface"].addChild( + ExpectationNode::createExpectation(32, "Load-Bandwidth")); + + expectations_["LSQ-L1-Interface"].addChild( + ExpectationNode::createExpectation(32, "Store-Bandwidth")); + + // AArch64 requires a vector length of at least 128, requiring a minimum of 16 + // byte load/store bandwidths + // For RV64, the the minimum required load/store bandwidth is 8 bytes + if (isa_ == ISA::AArch64) { + expectations_["LSQ-L1-Interface"]["Load-Bandwidth"] + .setValueBounds(16, UINT16_MAX); + expectations_["LSQ-L1-Interface"]["Store-Bandwidth"] + .setValueBounds(16, UINT16_MAX); + } else if (isa_ == ISA::RV64) { + expectations_["LSQ-L1-Interface"]["Store-Bandwidth"] + .setValueBounds(8, UINT16_MAX); + expectations_["LSQ-L1-Interface"]["Load-Bandwidth"] + .setValueBounds(8, UINT16_MAX); + } + + expectations_["LSQ-L1-Interface"].addChild( + ExpectationNode::createExpectation( + 1, "Permitted-Requests-Per-Cycle")); + expectations_["LSQ-L1-Interface"]["Permitted-Requests-Per-Cycle"] + .setValueBounds(1, UINT16_MAX); + + expectations_["LSQ-L1-Interface"].addChild( + ExpectationNode::createExpectation( + 1, "Permitted-Loads-Per-Cycle")); + expectations_["LSQ-L1-Interface"]["Permitted-Loads-Per-Cycle"] + .setValueBounds(1, UINT16_MAX); + + expectations_["LSQ-L1-Interface"].addChild( + ExpectationNode::createExpectation( + 1, "Permitted-Stores-Per-Cycle")); + expectations_["LSQ-L1-Interface"]["Permitted-Stores-Per-Cycle"] + .setValueBounds(1, UINT16_MAX); + + // Ports + expectations_.addChild(ExpectationNode::createExpectation("Ports")); + expectations_["Ports"].addChild( + ExpectationNode::createExpectation(0, wildcard)); + + expectations_["Ports"][wildcard].addChild( + ExpectationNode::createExpectation("0", "Portname")); + + expectations_["Ports"][wildcard].addChild( + ExpectationNode::createExpectation( + "ALL", "Instruction-Group-Support", true)); + expectations_["Ports"][wildcard]["Instruction-Group-Support"].setValueSet( + groupOptions_); + expectations_["Ports"][wildcard]["Instruction-Group-Support"].setAsSequence(); + + // Get the upper bound of what the opcode value can be based on the ISA + uint16_t maxOpcode = 0; + if (isa_ == ISA::AArch64) { + maxOpcode = arch::aarch64::Opcode::INSTRUCTION_LIST_END; + } else if (isa_ == ISA::RV64) { + maxOpcode = 
arch::riscv::Opcode::RISCV_INSTRUCTION_LIST_END; + } + expectations_["Ports"][wildcard].addChild( + ExpectationNode::createExpectation( + maxOpcode, "Instruction-Opcode-Support", true)); + expectations_["Ports"][wildcard]["Instruction-Opcode-Support"] + .setValueBounds(0, maxOpcode); + expectations_["Ports"][wildcard]["Instruction-Opcode-Support"] + .setAsSequence(); + + // Early check on [Ports][*][Portname] as the values are needed to inform + // the expectations of the [Reservation-Stations][*][Ports] values + std::vector portnames = {"0"}; + if (!isDefault) { + portnames = {}; + // An index value used in case of error + uint16_t idx = 0; + // Get all portnames defined in the config file and ensure they are unique + if (configTree_.rootref().has_child(ryml::to_csubstr("Ports"))) { + for (ryml::NodeRef child : configTree_["Ports"]) { + ValidationResult result = + expectations_["Ports"][wildcard]["Portname"].validateConfigNode( + child["Portname"]); + std::string portname = child["Portname"].as(); + if (result.valid) { + if (std::find(portnames.begin(), portnames.end(), portname) == + portnames.end()) { + portnames.push_back(portname); + } else { + invalid_ << "\t- duplicate portname \"" << portname << "\"\n"; + } + } else { + std::cerr + << "[SimEng:ModelConfig] Invalid portname for port " << idx + << ", namely \"" << portname + << "\", passed in config file due to \"" << result.message + << "\" error. Cannot continue with config validation. Exiting." + << std::endl; + exit(1); + } + idx++; + } + } + } + + // Reservation-Stations + expectations_.addChild( + ExpectationNode::createExpectation("Reservation-Stations")); + expectations_["Reservation-Stations"].addChild( + ExpectationNode::createExpectation(0, wildcard)); + + expectations_["Reservation-Stations"][wildcard].addChild( + ExpectationNode::createExpectation(32, "Size")); + expectations_["Reservation-Stations"][wildcard]["Size"] + .setValueBounds(1, UINT32_MAX); + + expectations_["Reservation-Stations"][wildcard].addChild( + ExpectationNode::createExpectation(4, "Dispatch-Rate")); + expectations_["Reservation-Stations"][wildcard]["Dispatch-Rate"] + .setValueBounds(1, UINT16_MAX); + + expectations_["Reservation-Stations"][wildcard].addChild( + ExpectationNode::createExpectation("0", "Ports")); + expectations_["Reservation-Stations"][wildcard]["Ports"].setValueSet( + portnames); + expectations_["Reservation-Stations"][wildcard]["Ports"].setAsSequence(); + + // Execution-Units + expectations_.addChild(ExpectationNode::createExpectation("Execution-Units")); + expectations_["Execution-Units"].addChild( + ExpectationNode::createExpectation(0, wildcard)); + + expectations_["Execution-Units"][wildcard].addChild( + ExpectationNode::createExpectation(true, "Pipelined")); + expectations_["Execution-Units"][wildcard]["Pipelined"].setValueSet( + std::vector{false, true}); + + expectations_["Execution-Units"][wildcard].addChild( + ExpectationNode::createExpectation("NONE", "Blocking-Groups", + true)); + expectations_["Execution-Units"][wildcard]["Blocking-Groups"].setValueSet( + groupOptions_); + expectations_["Execution-Units"][wildcard]["Blocking-Groups"].setAsSequence(); + + // Latencies + expectations_.addChild(ExpectationNode::createExpectation("Latencies", true)); + expectations_["Latencies"].addChild( + ExpectationNode::createExpectation(0, wildcard)); + + expectations_["Latencies"][wildcard].addChild( + ExpectationNode::createExpectation( + "NONE", "Instruction-Groups", true)); + 
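`Ports`, `Reservation-Stations`, `Execution-Units`, and `Latencies` are wildcard sections: each child is a numbered entry validated against the same expectations, and fields such as `Instruction-Groups` hold sequences (the value set and bounds for this Latencies field are wired up just below). The sketch here shows how such a section reads back through rapidyaml; it sticks to calls already used in this file, apart from the `>>` scalar read, which is assumed from rapidyaml's deserialisation API, and the YAML fragment and header path are invented for illustration.

```cpp
#include <cstdint>
#include <iostream>
#include <string>

#include "ryml.hpp"  // assumed path for the amalgamated rapidyaml header

int main() {
  // Invented fragment in the shape this file expects: numbered entries, each
  // holding a sequence of instruction groups plus scalar latency fields.
  const char* yaml =
      "Latencies:\n"
      "  0:\n"
      "    Instruction-Groups:\n"
      "      - FLOAT\n"
      "      - FLOAT_DIV_OR_SQRT\n"
      "    Execution-Latency: 9\n";

  ryml::Tree tree = ryml::parse_in_arena(ryml::to_csubstr(yaml));

  // Walk every numbered entry and its group sequence, mirroring the loops
  // used when the validated tree is consumed later on.
  for (ryml::NodeRef entry : tree["Latencies"]) {
    for (ryml::NodeRef group : entry["Instruction-Groups"]) {
      std::cout << std::string(group.val().data(), group.val().size()) << "\n";
    }
    uint16_t latency = 0;
    entry["Execution-Latency"] >> latency;  // assumed rapidyaml scalar read
    std::cout << "latency: " << latency << std::endl;
  }
  return 0;
}
```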
expectations_["Latencies"][wildcard]["Instruction-Groups"].setValueSet( + groupOptions_); + expectations_["Latencies"][wildcard]["Instruction-Groups"].setAsSequence(); + + expectations_["Latencies"][wildcard].addChild( + ExpectationNode::createExpectation( + maxOpcode, "Instruction-Opcodes", true)); + expectations_["Latencies"][wildcard]["Instruction-Opcodes"] + .setValueBounds(0, maxOpcode); + expectations_["Latencies"][wildcard]["Instruction-Opcodes"].setAsSequence(); + + expectations_["Latencies"][wildcard].addChild( + ExpectationNode::createExpectation(1, "Execution-Latency")); + expectations_["Latencies"][wildcard]["Execution-Latency"] + .setValueBounds(1, UINT16_MAX); + + expectations_["Latencies"][wildcard].addChild( + ExpectationNode::createExpectation(1, "Execution-Throughput", + true)); + expectations_["Latencies"][wildcard]["Execution-Throughput"] + .setValueBounds(1, UINT16_MAX); + + // CPU-Info + expectations_.addChild(ExpectationNode::createExpectation("CPU-Info")); + + expectations_["CPU-Info"].addChild(ExpectationNode::createExpectation( + true, "Generate-Special-Dir", true)); + expectations_["CPU-Info"]["Generate-Special-Dir"].setValueSet( + std::vector{false, true}); + + expectations_["CPU-Info"].addChild( + ExpectationNode::createExpectation( + defaultSpecialFilePath_, "Special-File-Dir-Path", true)); + + expectations_["CPU-Info"].addChild( + ExpectationNode::createExpectation(1, "Core-Count", true)); + expectations_["CPU-Info"]["Core-Count"].setValueBounds(1, + UINT16_MAX); + + expectations_["CPU-Info"].addChild( + ExpectationNode::createExpectation(1, "Socket-Count", true)); + expectations_["CPU-Info"]["Socket-Count"].setValueSet({1}); + + expectations_["CPU-Info"].addChild( + ExpectationNode::createExpectation(1, "SMT", true)); + expectations_["CPU-Info"]["SMT"].setValueSet({1}); + + expectations_["CPU-Info"].addChild( + ExpectationNode::createExpectation(0.f, "BogoMIPS", true)); + expectations_["CPU-Info"]["BogoMIPS"].setValueBounds( + 0.f, std::numeric_limits::max()); + + expectations_["CPU-Info"].addChild( + ExpectationNode::createExpectation("", "Features", true)); + + expectations_["CPU-Info"].addChild( + ExpectationNode::createExpectation("0x0", "CPU-Implementer", + true)); + + expectations_["CPU-Info"].addChild( + ExpectationNode::createExpectation(0, "CPU-Architecture", + true)); + expectations_["CPU-Info"]["CPU-Architecture"].setValueBounds( + 0, UINT16_MAX); + + expectations_["CPU-Info"].addChild( + ExpectationNode::createExpectation("0x0", "CPU-Variant", + true)); + + expectations_["CPU-Info"].addChild( + ExpectationNode::createExpectation("0x0", "CPU-Part", true)); + + expectations_["CPU-Info"].addChild( + ExpectationNode::createExpectation(0, "CPU-Revision", true)); + expectations_["CPU-Info"]["CPU-Revision"].setValueBounds( + 0, UINT16_MAX); + + expectations_["CPU-Info"].addChild( + ExpectationNode::createExpectation(1, "Package-Count", true)); + expectations_["CPU-Info"]["Package-Count"].setValueBounds( + 1, UINT16_MAX); +} + +void ModelConfig::recursiveValidate(ExpectationNode expectation, + ryml::NodeRef node, + std::string hierarchyString) { + // Iterate over passed expectations + for (auto& child : expectation.getChildren()) { + std::string nodeKey = child.getKey(); + // If the expectation is a wildcard, then iterate over the associated + // children in the config option using the same expectation(s) + if (nodeKey == wildcard) { + for (ryml::NodeRef rymlChild : node) { + // An index value used in case of error + std::string idx = + 
std::string(rymlChild.key().data(), rymlChild.key().size()); + ValidationResult result = child.validateConfigNode(rymlChild); + if (!result.valid) + invalid_ << "\t- " + << hierarchyString + idx + " " + result.message + "\n"; + recursiveValidate(child, rymlChild, hierarchyString + idx + ":"); + } + } else if (node.has_child(ryml::to_csubstr(nodeKey))) { + // If the config file contains the key of the expectation node, get + // it + ryml::NodeRef rymlChild = node[ryml::to_csubstr(nodeKey)]; + if (child.isSequence()) { + // If the expectation node is a sequence, then treat the ryml::NodeRef + // as a parent and validate all its children against the expectation + // node + int idx = 0; + for (ryml::NodeRef grndchild : rymlChild) { + ValidationResult result = child.validateConfigNode(grndchild); + if (!result.valid) + invalid_ << "\t- " + << hierarchyString + nodeKey + ":" + std::to_string(idx) + + " " + result.message + "\n"; + idx++; + } + } else { + // If the expectation node is not a sequence, validate the config + // option against the current expectations and if it has children, + // validate those recursively + ValidationResult result = child.validateConfigNode(rymlChild); + if (!result.valid) + invalid_ << "\t- " + << hierarchyString + nodeKey + " " + result.message + "\n"; + if (child.getChildren().size()) { + recursiveValidate(child, rymlChild, hierarchyString + nodeKey + ":"); + } + } + } else { + // If the config file doesn't contain the key of the expectation node, + // create it as a child to the config ryml::NodeRef supplied. If the + // config option is optional, a default value will be injected, + // otherwise the validation will fail + ryml::NodeRef rymlChild = node.append_child() << ryml::key(nodeKey); + ValidationResult result = child.validateConfigNode(rymlChild); + if (!result.valid) + invalid_ << "\t- " + << hierarchyString + nodeKey + " " + result.message + "\n"; + } + } +} + +void ModelConfig::postValidation() { + // Ensure package_count size is a less than or equal to the core count, + // and that the core count can be divided by the package count + uint64_t packageCount = + configTree_["CPU-Info"]["Package-Count"].as(); + uint64_t coreCount = configTree_["CPU-Info"]["Core-Count"].as(); + if (!((packageCount <= coreCount) && (coreCount % packageCount == 0))) { + invalid_ << "\t- Package-Count must be a Less-than or equal to Core-Count, " + "and Core-Count must be divisible by Package-Count\n"; + } + + // Convert all instruction group strings to their corresponding group + // numbers into another config option + for (ryml::NodeRef node : configTree_["Ports"]) { + // Clear or create a new Instruction-Group-Support-Nums config option + if (node.has_child("Instruction-Group-Support-Nums")) { + node["Instruction-Group-Support-Nums"].clear_children(); + } else { + node.append_child() << ryml::key("Instruction-Group-Support-Nums") |= + ryml::SEQ; + } + // Read in each group and place its corresponding group number into the + // new config option + for (ryml::NodeRef child : node["Instruction-Group-Support"]) { + ryml::NodeRef newChild = + node["Instruction-Group-Support-Nums"].append_child(); + newChild << groupMapping_[child.as()]; + } + } + for (ryml::NodeRef node : configTree_["Execution-Units"]) { + // Clear or create a new Blocking-Group-Nums config option + if (node.has_child("Blocking-Group-Nums")) { + node["Blocking-Group-Nums"].clear_children(); + } else { + node.append_child() << ryml::key("Blocking-Group-Nums") |= ryml::SEQ; + } + // Read in each bloacking group 
and place its corresponding group number + // into the new config option. + std::queue blockingGroups; + for (ryml::NodeRef child : node["Blocking-Groups"]) { + uint16_t parentGroup = groupMapping_[child.as()]; + blockingGroups.push(parentGroup); + node["Blocking-Group-Nums"].append_child() << parentGroup; + } + // Expand the set of blocking groups to include those that inherit from the + // user defined set + std::unordered_map> groupInheritance; + if (isa_ == ISA::AArch64) { + groupInheritance = arch::aarch64::groupInheritance_; + } else if (isa_ == ISA::RV64) { + groupInheritance = arch::riscv::groupInheritance_; + } + while (blockingGroups.size()) { + // Determine if there's any inheritance + if (groupInheritance.find(blockingGroups.front()) != + groupInheritance.end()) { + std::vector inheritedGroups = + groupInheritance.at(blockingGroups.front()); + for (size_t k = 0; k < inheritedGroups.size(); k++) { + blockingGroups.push(inheritedGroups[k]); + node["Blocking-Group-Nums"].append_child() << inheritedGroups[k]; + } + } + blockingGroups.pop(); + } + } + for (ryml::NodeRef node : configTree_["Latencies"]) { + // Clear or create a new Instruction-Group-Nums config option + if (node.has_child("Instruction-Group-Nums")) { + node["Instruction-Group-Nums"].clear_children(); + } else { + node.append_child() << ryml::key("Instruction-Group-Nums") |= ryml::SEQ; + } + // Read in each group and place its corresponding group number into the + // new config option + for (ryml::NodeRef child : node["Instruction-Groups"]) { + node["Instruction-Group-Nums"].append_child() + << groupMapping_[child.as()]; + } + } + + // Ensure all execution ports have an associated reservation station and + // convert port strings to their associated port indexes + if (configTree_["Ports"].num_children() != + configTree_["Execution-Units"].num_children()) { + invalid_ << "\t- The number of execution units (" + << configTree_["Execution-Units"].num_children() + << ") must match the number of ports (" + << configTree_["Ports"].num_children() << ")\n"; + } + std::vector portnames; + std::unordered_map portIndexes; + uint16_t idx = 0; + // Read all available port names. 
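The blocking-group handling above expands the user-supplied groups into everything that inherits from them by draining a work queue over the ISA's `groupInheritance_` map, i.e. a transitive closure; the port-name bookkeeping then continues below. A standalone sketch of that expansion, using made-up group numbers and an invented inheritance map:

```cpp
#include <cstdint>
#include <iostream>
#include <queue>
#include <unordered_map>
#include <vector>

int main() {
  // Hypothetical map: group -> groups that inherit from it.
  const std::unordered_map<uint16_t, std::vector<uint16_t>> groupInheritance = {
      {0, {1, 2}},  // group 0 is inherited from by groups 1 and 2
      {1, {3, 4}}   // group 1 is inherited from by groups 3 and 4
  };

  // Seed with the user-defined blocking group(s) and expand transitively.
  std::queue<uint16_t> work;
  work.push(0);
  std::vector<uint16_t> expanded = {0};

  while (!work.empty()) {
    auto it = groupInheritance.find(work.front());
    if (it != groupInheritance.end()) {
      for (uint16_t child : it->second) {
        expanded.push_back(child);
        work.push(child);
      }
    }
    work.pop();
  }

  for (uint16_t g : expanded) std::cout << g << " ";  // prints 0 1 2 3 4
  std::cout << std::endl;
  return 0;
}
```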
+ for (ryml::NodeRef node : configTree_["Ports"]) { + std::string portname = node["Portname"].as(); + portnames.push_back(portname); + portIndexes[portname] = idx++; + } + // Iterate over all [Reservation-Stations][Ports] children + for (ryml::NodeRef node : configTree_["Reservation-Stations"]) { + // Clear or create a new Port-Nums config option + if (node.has_child("Port-Nums")) { + node["Port-Nums"].clear_children(); + } else { + node.append_child() << ryml::key("Port-Nums") |= ryml::SEQ; + } + for (size_t i = 0; i < node["Ports"].num_children(); i++) { + std::string portname = node["Ports"][i].as(); + std::vector::iterator itr = + std::find(portnames.begin(), portnames.end(), portname); + // If a port is yet to be marked as linked, remove it from portnames + if (itr != portnames.end()) { + portnames.erase(itr); + } + // Place the port's corresponding index into the new config option + node["Port-Nums"].append_child() << portIndexes[portname]; + } + } + // Record any unlinked port names + for (const auto& prt : portnames) + invalid_ << "\t- " << prt << " has no associated reservation station\n"; + + // Ensure that given special file directory exists iff auto-generation is + // False + if (!configTree_["CPU-Info"]["Generate-Special-Dir"].as() && + !std::ifstream( + configTree_["CPU-Info"]["Special-File-Dir-Path"].as()) + .good()) { + invalid_ + << "\t- Special File Directory '" + << configTree_["CPU-Info"]["Special-File-Dir-Path"].as() + << "' does not exist\n"; + } + + // Ensure the L1-[Data|Instruction]-Memory:Interface-Type restrictions are + // enforced + std::string simMode = + configTree_["Core"]["Simulation-Mode"].as(); + // Currently, only outoforder core types can use non-Flat L1-Data-Memory + // interfaces + if (simMode != "outoforder") { + std::string l1dType = + configTree_["L1-Data-Memory"]["Interface-Type"].as(); + if (l1dType != "Flat") + invalid_ << "\t- Only a Flat L1-Data-Memory Interface-Type may be used " + "with the " + << simMode << " Simulation-Mode. Interface-Type used is " + << l1dType << "\n"; + } + + // Currently, only a Flat L1-Instruction-Memory:Interface-Type is supported + std::string l1iType = + configTree_["L1-Instruction-Memory"]["Interface-Type"].as(); + if (l1iType != "Flat") + invalid_ << "\t- Only a 'Flat' L1-Instruction-Memory Interface-Type is " + "supported. Interface-Type used is " + << l1iType << "\n"; + + if (isa_ == ISA::AArch64) { + // Ensure LSQ-L1-Interface Load/Store Bandwidth is large enough to + // accomodate a full vector load of the specified Vector-Length parameter + if (configTree_["Core"]["Vector-Length"].as() / 8 > + configTree_["LSQ-L1-Interface"]["Load-Bandwidth"].as()) { + invalid_ + << "\t- Load-Bandwidth (bytes) must be greater than Vector-Length " + "(bits). " + "The current Load-Bandwidth is set to " + << configTree_["LSQ-L1-Interface"]["Load-Bandwidth"].as() + << " bytes, when it must be at least " + << configTree_["Core"]["Vector-Length"].as() / 8 << "\n"; + } + if (configTree_["Core"]["Vector-Length"].as() / 8 > + configTree_["LSQ-L1-Interface"]["Store-Bandwidth"].as()) { + invalid_ + << "\t- Store-Bandwidth (bytes) must be greater than Vector-Length " + "(bits). 
" + "The current Store-Bandwidth is set to " + << configTree_["LSQ-L1-Interface"]["Store-Bandwidth"].as() + << " bytes, when it must be at least " + << configTree_["Core"]["Vector-Length"].as() / 8 << "\n"; + } + // Ensure LSQ-L1-Interface Load/Store Bandwidth is also large enough to + // accomodate a full vector load of the specified Streaming-Vector-Length + // parameter when streaming mode is enabled + if (configTree_["Core"]["Streaming-Vector-Length"].as() / 8 > + configTree_["LSQ-L1-Interface"]["Load-Bandwidth"].as()) { + invalid_ + << "\t- Load-Bandwidth (bytes) must be greater than " + "Streaming-Vector-Length (bits). " + "The current Load-Bandwidth is set to " + << configTree_["LSQ-L1-Interface"]["Load-Bandwidth"].as() + << " bytes, when it must be at least " + << configTree_["Core"]["Streaming-Vector-Length"].as() / 8 + << "\n"; + } + if (configTree_["Core"]["Streaming-Vector-Length"].as() / 8 > + configTree_["LSQ-L1-Interface"]["Store-Bandwidth"].as()) { + invalid_ + << "\t- Store-Bandwidth (bytes) must be greater than " + "Streaming-Vector-Length (bits). " + "The current Store-Bandwidth is set to " + << configTree_["LSQ-L1-Interface"]["Store-Bandwidth"].as() + << " bytes, when it must be at least " + << configTree_["Core"]["Streaming-Vector-Length"].as() / 8 + << "\n"; + } + } +} + +ryml::Tree ModelConfig::getConfig() { return configTree_; } + +void ModelConfig::createGroupMapping() { + if (isa_ == ISA::AArch64) { + groupOptions_ = {"INT", + "INT_SIMPLE", + "INT_SIMPLE_ARTH", + "INT_SIMPLE_ARTH_NOSHIFT", + "INT_SIMPLE_LOGICAL", + "INT_SIMPLE_LOGICAL_NOSHIFT", + "INT_SIMPLE_CMP", + "INT_SIMPLE_CVT", + "INT_MUL", + "INT_DIV_OR_SQRT", + "LOAD_INT", + "STORE_ADDRESS_INT", + "STORE_DATA_INT", + "STORE_INT", + "FP", + "FP_SIMPLE", + "FP_SIMPLE_ARTH", + "FP_SIMPLE_ARTH_NOSHIFT", + "FP_SIMPLE_LOGICAL", + "FP_SIMPLE_LOGICAL_NOSHIFT", + "FP_SIMPLE_CMP", + "FP_SIMPLE_CVT", + "FP_MUL", + "FP_DIV_OR_SQRT", + "SCALAR", + "SCALAR_SIMPLE", + "SCALAR_SIMPLE_ARTH", + "SCALAR_SIMPLE_ARTH_NOSHIFT", + "SCALAR_SIMPLE_LOGICAL", + "SCALAR_SIMPLE_LOGICAL_NOSHIFT", + "SCALAR_SIMPLE_CMP", + "SCALAR_SIMPLE_CVT", + "SCALAR_MUL", + "SCALAR_DIV_OR_SQRT", + "LOAD_SCALAR", + "STORE_ADDRESS_SCALAR", + "STORE_DATA_SCALAR", + "STORE_SCALAR", + "VECTOR", + "VECTOR_SIMPLE", + "VECTOR_SIMPLE_ARTH", + "VECTOR_SIMPLE_ARTH_NOSHIFT", + "VECTOR_SIMPLE_LOGICAL", + "VECTOR_SIMPLE_LOGICAL_NOSHIFT", + "VECTOR_SIMPLE_CMP", + "VECTOR_SIMPLE_CVT", + "VECTOR_MUL", + "VECTOR_DIV_OR_SQRT", + "LOAD_VECTOR", + "STORE_ADDRESS_VECTOR", + "STORE_DATA_VECTOR", + "STORE_VECTOR", + "SVE", + "SVE_SIMPLE", + "SVE_SIMPLE_ARTH", + "SVE_SIMPLE_ARTH_NOSHIFT", + "SVE_SIMPLE_LOGICAL", + "SVE_SIMPLE_LOGICAL_NOSHIFT", + "SVE_SIMPLE_CMP", + "SVE_SIMPLE_CVT", + "SVE_MUL", + "SVE_DIV_OR_SQRT", + "LOAD_SVE", + "STORE_ADDRESS_SVE", + "STORE_DATA_SVE", + "STORE_SVE", + "PREDICATE", + "LOAD", + "STORE_ADDRESS", + "STORE_DATA", + "STORE", + "BRANCH", + "SME", + "SME_SIMPLE", + "SME_SIMPLE_ARTH", + "SME_SIMPLE_ARTH_NOSHIFT", + "SME_SIMPLE_LOGICAL", + "SME_SIMPLE_LOGICAL_NOSHIFT", + "SME_SIMPLE_CMP", + "SME_SIMPLE_CVT", + "SME_MUL", + "SME_DIV_OR_SQRT", + "LOAD_SME", + "STORE_ADDRESS_SME", + "STORE_DATA_SME", + "STORE_SME", + "ALL", + "NONE"}; + } else if (isa_ == ISA::RV64) { + groupOptions_ = {"INT", + "INT_SIMPLE", + "INT_SIMPLE_ARTH", + "INT_SIMPLE_CMP", + "INT_SIMPLE_LOGICAL", + "INT_SIMPLE_SHIFT", + "INT_MUL", + "INT_DIV_OR_SQRT", + "LOAD_INT", + "STORE_INT", + "FLOAT", + "FLOAT_SIMPLE", + "FLOAT_SIMPLE_ARTH", + "FLOAT_SIMPLE_CMP", + 
"FLOAT_SIMPLE_LOGICAL", + "FLOAT_SIMPLE_CVT", + "FLOAT_MUL", + "FLOAT_DIV_OR_SQRT", + "LOAD_FLOAT", + "STORE_FLOAT", + "LOAD", + "STORE", + "BRANCH", + "ALL", + "NONE"}; + } + // ISA instruction group namespaces contain a set of contiguous assigned + // uint16_t starting from 0. Therefore, the index of each groupOptions_ + // entry is also its ::InstructionGroups value (assuming groupOptions_ + // is ordered exactly as ::InstructionGroups is). + for (size_t grp = 0; grp < groupOptions_.size(); grp++) { + groupMapping_[groupOptions_[grp]] = grp; + } +} + +} // namespace config +} // namespace simeng diff --git a/src/lib/config/SimInfo.cc b/src/lib/config/SimInfo.cc new file mode 100644 index 0000000000..a4136a00df --- /dev/null +++ b/src/lib/config/SimInfo.cc @@ -0,0 +1,126 @@ +#include "simeng/config/SimInfo.hh" + +namespace simeng { +namespace config { + +ryml::ConstNodeRef SimInfo::getConfig() { + return getInstance()->validatedConfig_.crootref(); +} + +void SimInfo::setConfig(std::string path) { getInstance()->makeConfig(path); } + +void SimInfo::addToConfig(std::string configAdditions) { + getInstance()->modelConfig_.addConfigOptions(configAdditions); + // Replace the validated config with new instance with the supplied + // additional values + getInstance()->validatedConfig_ = getInstance()->modelConfig_.getConfig(); + // Update previously extracted values from the config file + getInstance()->extractValues(); +} + +void SimInfo::generateDefault(ISA isa, bool force) { + if (isa == ISA::AArch64) + getInstance()->modelConfig_.reGenerateDefault(ISA::AArch64, force); + else if (isa == ISA::RV64) + getInstance()->modelConfig_.reGenerateDefault(ISA::RV64, force); + + // Update config path to be the default string + getInstance()->configFilePath_ = DEFAULT_STR; + + // Replace the validated config with the new default config + getInstance()->validatedConfig_ = getInstance()->modelConfig_.getConfig(); + // Update previously extracted values from the config file + getInstance()->extractValues(); +} + +std::string SimInfo::getConfigPath() { return getInstance()->configFilePath_; } + +SimulationMode SimInfo::getSimMode() { return getInstance()->mode_; } + +std::string SimInfo::getSimModeStr() { return getInstance()->modeStr_; } + +ISA SimInfo::getISA() { return getInstance()->isa_; } + +std::string SimInfo::getISAString() { return getInstance()->isaString_; } + +const std::vector& SimInfo::getArchRegStruct() { + return getInstance()->archInfo_->getArchRegStruct(); +} + +const std::vector& SimInfo::getPhysRegStruct() { + return getInstance()->archInfo_->getPhysRegStruct(); +} + +const std::vector& SimInfo::getPhysRegQuantities() { + return getInstance()->archInfo_->getPhysRegQuantities(); +} + +const std::vector& SimInfo::getSysRegVec() { + return getInstance()->archInfo_->getSysRegEnums(); +} + +bool SimInfo::getGenSpecFiles() { return getInstance()->genSpecialFiles_; } + +void SimInfo::reBuild() { getInstance()->extractValues(); } + +SimInfo::SimInfo() { + // Set the validated config file to be the current default config + // generated by the default constructor of ModelConfig + validatedConfig_ = modelConfig_.getConfig(); + extractValues(); +} + +std::unique_ptr& SimInfo::getInstance() { + static std::unique_ptr SimInfoClass = nullptr; + if (SimInfoClass == nullptr) { + SimInfoClass = std::unique_ptr(new SimInfo()); + } + return SimInfoClass; +} + +void SimInfo::makeConfig(std::string path) { + // Recreate the model config instance from the YAML file path + modelConfig_ = ModelConfig(path); + + 
// Update config path to be the passed path + configFilePath_ = path; + + // Update the validated config file + validatedConfig_ = modelConfig_.getConfig(); + extractValues(); +} + +void SimInfo::extractValues() { + // Get ISA type and set the corresponding ArchInfo class + isaString_ = validatedConfig_["Core"]["ISA"].as(); + if (isaString_ == "AArch64") { + isa_ = ISA::AArch64; + archInfo_ = std::make_unique( + arch::aarch64::ArchInfo(validatedConfig_)); + } else if (isaString_ == "rv64") { + isa_ = ISA::RV64; + archInfo_ = std::make_unique( + arch::riscv::ArchInfo(validatedConfig_)); + } + + // Get Simulation mode + std::string mode = + validatedConfig_["Core"]["Simulation-Mode"].as(); + if (mode == "emulation") { + mode_ = SimulationMode::Emulation; + modeStr_ = "Emulation"; + } else if (mode == "inorderpipelined") { + mode_ = SimulationMode::InOrderPipelined; + modeStr_ = "In-Order Pipelined"; + } else if (mode == "outoforder") { + mode_ = SimulationMode::Outoforder; + modeStr_ = "Out-of-Order"; + } + + // Get if the special files directory should be created + genSpecialFiles_ = + validatedConfig_["CPU-Info"]["Generate-Special-Dir"].as(); +} + +} // namespace config +} // namespace simeng \ No newline at end of file diff --git a/src/lib/kernel/Linux.cc b/src/lib/kernel/Linux.cc index 9f0b594114..780867cdec 100644 --- a/src/lib/kernel/Linux.cc +++ b/src/lib/kernel/Linux.cc @@ -6,9 +6,7 @@ #include #include #include -#include #include -#include #include #include @@ -23,13 +21,11 @@ namespace kernel { void Linux::createProcess(const LinuxProcess& process) { assert(process.isValid() && "Attempted to use an invalid process"); assert(processStates_.size() == 0 && "Multiple processes not yet supported"); - processStates_.push_back({.pid = 0, // TODO: create unique PIDs - .path = process.getPath(), - .startBrk = process.getHeapStart(), - .currentBrk = process.getHeapStart(), - .initialStackPointer = process.getStackPointer(), - .mmapRegion = process.getMmapStart(), - .pageSize = process.getPageSize()}); + processStates_.push_back({0, // TODO: create unique PIDs + process.getPath(), process.getHeapStart(), + process.getHeapStart(), + process.getInitialStackPointer(), + process.getMmapStart(), process.getPageSize()}); processStates_.back().fileDescriptorTable.push_back(STDIN_FILENO); processStates_.back().fileDescriptorTable.push_back(STDOUT_FILENO); processStates_.back().fileDescriptorTable.push_back(STDERR_FILENO); @@ -41,31 +37,34 @@ void Linux::createProcess(const LinuxProcess& process) { "/sys/devices/system/cpu/online", "core_id", "physical_package_id"}); } -uint64_t Linux::getDirFd(int64_t dfd, std::string pathname) { - // Resolve absolute path to target file - char absolutePath[LINUX_PATH_MAX]; - realpath(pathname.c_str(), absolutePath); - - int64_t dfd_temp = AT_FDCWD; - if (dfd != -100) { - dfd_temp = dfd; - // If absolute path used then dfd is dis-regarded. Otherwise need to see if - // fd exists for directory referenced - if (strncmp(pathname.c_str(), absolutePath, strlen(absolutePath)) != 0) { - assert(dfd < processStates_[0].fileDescriptorTable.size()); - dfd_temp = processStates_[0].fileDescriptorTable[dfd]; - if (dfd_temp < 0) { - return -1; - } +int64_t Linux::getHostDirFD(int64_t vdfd) { + // -100 = AT_FDCWD on Linux. Pass back AT_FDCWD for host platform e.g.
-2 for + // macOS + if (vdfd == -100) { + // Early return if requesting current working directory + return AT_FDCWD; + } + + if (vdfd < 0) { + // Invalid virtual file descriptor + return -1; + } else { + uint64_t unsignedVdfd = static_cast(vdfd); + if (unsignedVdfd < processStates_[0].fileDescriptorTable.size()) { + // Within bounds of table. Entry will be -1 if invalid + return processStates_[0].fileDescriptorTable[unsignedVdfd]; + } else { + // Outside bounds of table + assert(false && "vdfd outside bounds of file descriptor table"); + return -1; } } - return dfd_temp; } std::string Linux::getSpecialFile(const std::string filename) { for (auto prefix : {"/dev/", "/proc/", "/sys/"}) { if (strncmp(filename.c_str(), prefix, strlen(prefix)) == 0) { - for (int i = 0; i < supportedSpecialFiles_.size(); i++) { + for (size_t i = 0; i < supportedSpecialFiles_.size(); i++) { if (filename.find(supportedSpecialFiles_[i]) != std::string::npos) { std::cerr << "[SimEng:Linux] Using Special File: " << filename.c_str() << std::endl; @@ -140,30 +139,32 @@ int64_t Linux::faccessat(int64_t dfd, const std::string& filename, int64_t mode, // special file) new_pathname = Linux::getSpecialFile(filename); - // Get correct dirfd - int64_t dirfd = Linux::getDirFd(dfd, filename); - if (dirfd == -1) return EBADF; + // Get host dirfd. May return -1 in case of no mapping, pass through to host + // faccessat to deal with this + int64_t hostDfd = Linux::getHostDirFD(dfd); // Pass call through to host - int64_t retval = ::faccessat(dirfd, new_pathname.c_str(), mode, flag); + int64_t retval = ::faccessat(hostDfd, new_pathname.c_str(), mode, flag); return retval; } -int64_t Linux::close(int64_t fd) { +int64_t Linux::close(int64_t vfd) { // Don't close STDOUT or STDERR otherwise no SimEng output is given // afterwards. This includes final results given at the end of execution - if (fd != STDERR_FILENO && fd != STDOUT_FILENO) { - assert(fd < processStates_[0].fileDescriptorTable.size()); - int64_t hfd = processStates_[0].fileDescriptorTable[fd]; + if (vfd != STDERR_FILENO && vfd != STDOUT_FILENO) { + assert(vfd >= 0 && static_cast(vfd) < + processStates_[0].fileDescriptorTable.size()); + int64_t hfd = processStates_[0].fileDescriptorTable[vfd]; if (hfd < 0) { + // Early return, can't deallocate vfd that isn't in fileDescriptorTable return EBADF; } // Deallocate the virtual file descriptor - assert(processStates_[0].freeFileDescriptors.count(fd) == 0); - processStates_[0].freeFileDescriptors.insert(fd); - processStates_[0].fileDescriptorTable[fd] = -1; + assert(processStates_[0].freeFileDescriptors.count(vfd) == 0); + processStates_[0].freeFileDescriptors.insert(vfd); + processStates_[0].fileDescriptorTable[vfd] = -1; return ::close(hfd); } @@ -181,13 +182,13 @@ int64_t Linux::newfstatat(int64_t dfd, const std::string& filename, stat& out, // special file) new_pathname = Linux::getSpecialFile(filename); - // Get correct dirfd - int64_t dirfd = Linux::getDirFd(dfd, filename); - if (dirfd == -1) return EBADF; + // Get host dirfd. 
May return -1 in case of no mapping, pass through to host + // fstatat to deal with this + int64_t hostDfd = Linux::getHostDirFD(dfd); // Pass call through to host struct ::stat statbuf; - int64_t retval = ::fstatat(dirfd, new_pathname.c_str(), &statbuf, flag); + int64_t retval = ::fstatat(hostDfd, new_pathname.c_str(), &statbuf, flag); // Copy results to output struct out.dev = statbuf.st_dev; @@ -223,7 +224,8 @@ int64_t Linux::newfstatat(int64_t dfd, const std::string& filename, stat& out, } int64_t Linux::fstat(int64_t fd, stat& out) { - assert(fd < processStates_[0].fileDescriptorTable.size()); + assert(fd > 0 && static_cast(fd) < + processStates_[0].fileDescriptorTable.size()); int64_t hfd = processStates_[0].fileDescriptorTable[fd]; if (hfd < 0) { return EBADF; @@ -319,7 +321,8 @@ int64_t Linux::gettimeofday(uint64_t systemTimer, timeval* tv, timeval* tz) { } int64_t Linux::ioctl(int64_t fd, uint64_t request, std::vector& out) { - assert(fd < processStates_[0].fileDescriptorTable.size()); + assert(fd > 0 && static_cast(fd) < + processStates_[0].fileDescriptorTable.size()); int64_t hfd = processStates_[0].fileDescriptorTable[fd]; if (hfd < 0) { return EBADF; @@ -354,7 +357,8 @@ int64_t Linux::ioctl(int64_t fd, uint64_t request, std::vector& out) { } uint64_t Linux::lseek(int64_t fd, uint64_t offset, int64_t whence) { - assert(fd < processStates_[0].fileDescriptorTable.size()); + assert(fd > 0 && static_cast(fd) < + processStates_[0].fileDescriptorTable.size()); int64_t hfd = processStates_[0].fileDescriptorTable[fd]; if (hfd < 0) { return EBADF; @@ -367,15 +371,14 @@ int64_t Linux::munmap(uint64_t addr, size_t length) { if (addr % lps->pageSize != 0) { // addr must be a multiple of the process page size return -1; - } - int i; + }; vm_area_struct alloc; // Find addr in allocations - for (i = 0; i < lps->contiguousAllocations.size(); i++) { + for (size_t i = 0; i < lps->contiguousAllocations.size(); i++) { alloc = lps->contiguousAllocations[i]; if (alloc.vm_start == addr) { if ((alloc.vm_end - alloc.vm_start) < length) { - // length must not be larger than the original allocation + // Length must not be larger than the original allocation return -1; } if (i != 0) { @@ -387,24 +390,25 @@ int64_t Linux::munmap(uint64_t addr, size_t length) { } } - for (int i = 0; i < lps->nonContiguousAllocations.size(); i++) { - alloc = lps->nonContiguousAllocations[i]; + for (size_t j = 0; j < lps->nonContiguousAllocations.size(); j++) { + alloc = lps->nonContiguousAllocations[j]; if (alloc.vm_start == addr) { if ((alloc.vm_end - alloc.vm_start) < length) { - // length must not be larger than the original allocation + // Length must not be larger than the original allocation return -1; } lps->nonContiguousAllocations.erase( - lps->nonContiguousAllocations.begin() + i); + lps->nonContiguousAllocations.begin() + j); return 0; } } - // Not an error if the indicated range does no contain any mapped pages + // Not an error if the indicated range does not contain any mapped pages return 0; } -uint64_t Linux::mmap(uint64_t addr, size_t length, int prot, int flags, int fd, - off_t offset) { +uint64_t Linux::mmap(uint64_t addr, size_t length, [[maybe_unused]] int prot, + [[maybe_unused]] int flags, [[maybe_unused]] int fd, + [[maybe_unused]] off_t offset) { LinuxProcessState* lps = &processStates_[0]; std::shared_ptr newAlloc(new vm_area_struct); if (addr == 0) { // Kernel decides allocation @@ -443,13 +447,11 @@ uint64_t Linux::mmap(uint64_t addr, size_t length, int prot, int flags, int fd, return 
newAlloc->vm_start; } -int64_t Linux::openat(int64_t dfd, const std::string& filename, int64_t flags, +int64_t Linux::openat(int64_t dfd, const std::string& pathname, int64_t flags, uint16_t mode) { - std::string new_pathname; - - // Alter special file path to point to SimEng one (if filename points to + // Alter special file path to point to SimEng one (if pathname points to // special file) - new_pathname = Linux::getSpecialFile(filename); + std::string new_pathname = Linux::getSpecialFile(pathname); // Need to re-create flag input to correct values for host OS int64_t newFlags = 0; @@ -485,18 +487,22 @@ int64_t Linux::openat(int64_t dfd, const std::string& filename, int64_t flags, // If Special File (or Special File Directory) is being opened then need to // set flags to O_RDONLY and O_CLOEXEC only. - if (new_pathname != filename) { + if (new_pathname != pathname) { newFlags = O_RDONLY | O_CLOEXEC; } - // Get correct dirfd - int64_t dirfd = Linux::getDirFd(dfd, filename); - if (dirfd == -1) return EBADF; + // Get host dirfd. May return -1 in case of no mapping, pass through to host + // openat to deal with this + int64_t hDfd = Linux::getHostDirFD(dfd); // Pass call through to host - int64_t hfd = ::openat(dirfd, new_pathname.c_str(), newFlags, mode); - if (hfd < 0) { - return hfd; + int64_t hostFd = ::openat(hDfd, new_pathname.c_str(), newFlags, mode); + if (hostFd < 0) { + // An error occurred, pass this back to userspace don't allocate virtual + // file descriptor + // TODO possibly need to set errno for simulated program so that it can be + // handled correctly?? This may be relevant throughout + return hostFd; } LinuxProcessState& processState = processStates_[0]; @@ -507,11 +513,11 @@ int64_t Linux::openat(int64_t dfd, const std::string& filename, int64_t flags, // Take virtual descriptor from free pool auto first = processState.freeFileDescriptors.begin(); vfd = processState.freeFileDescriptors.extract(first).value(); - processState.fileDescriptorTable[vfd] = hfd; + processState.fileDescriptorTable[vfd] = hostFd; } else { // Extend file descriptor table for a new virtual descriptor vfd = processState.fileDescriptorTable.size(); - processState.fileDescriptorTable.push_back(hfd); + processState.fileDescriptorTable.push_back(hostFd); } return vfd; @@ -521,19 +527,29 @@ int64_t Linux::readlinkat(int64_t dirfd, const std::string& pathname, char* buf, size_t bufsize) const { const auto& processState = processStates_[0]; if (pathname == "/proc/self/exe") { + // Resolve absolute path + char absolutePath[LINUX_PATH_MAX]; + if (!realpath(processState.path.c_str(), absolutePath)) { + // Something went wrong + std::cerr << "[SimEng:readlinkat] realpath failed with errno = " << errno + << std::endl; + exit(EXIT_FAILURE); + } + // Copy executable path to buffer - // TODO: resolve path into canonical path - std::strncpy(buf, processState.path.c_str(), bufsize); + std::strncpy(buf, absolutePath, bufsize); - return std::min(processState.path.length(), bufsize); + return std::min(std::strlen(absolutePath), bufsize); } - // TODO: resolve symbolic link for other paths + // TODO: resolve symbolic link for other paths - get hostfd then pass to real + // readlinkat return -1; } int64_t Linux::getdents64(int64_t fd, void* buf, uint64_t count) { - assert(fd < processStates_[0].fileDescriptorTable.size()); + assert(fd > 0 && static_cast(fd) < + processStates_[0].fileDescriptorTable.size()); int64_t hfd = processStates_[0].fileDescriptorTable[fd]; if (hfd < 0) { return EBADF; @@ -589,7 +605,8 @@ int64_t 
Linux::getdents64(int64_t fd, void* buf, uint64_t count) { } int64_t Linux::read(int64_t fd, void* buf, uint64_t count) { - assert(fd < processStates_[0].fileDescriptorTable.size()); + assert(fd > 0 && static_cast(fd) < + processStates_[0].fileDescriptorTable.size()); int64_t hfd = processStates_[0].fileDescriptorTable[fd]; if (hfd < 0) { return EBADF; @@ -598,7 +615,8 @@ int64_t Linux::read(int64_t fd, void* buf, uint64_t count) { } int64_t Linux::readv(int64_t fd, const void* iovdata, int iovcnt) { - assert(fd < processStates_[0].fileDescriptorTable.size()); + assert(fd > 0 && static_cast(fd) < + processStates_[0].fileDescriptorTable.size()); int64_t hfd = processStates_[0].fileDescriptorTable[fd]; if (hfd < 0) { return EBADF; @@ -629,7 +647,8 @@ int64_t Linux::setTidAddress(uint64_t tidptr) { } int64_t Linux::write(int64_t fd, const void* buf, uint64_t count) { - assert(fd < processStates_[0].fileDescriptorTable.size()); + assert(fd > 0 && static_cast(fd) < + processStates_[0].fileDescriptorTable.size()); int64_t hfd = processStates_[0].fileDescriptorTable[fd]; if (hfd < 0) { return EBADF; @@ -638,7 +657,8 @@ int64_t Linux::write(int64_t fd, const void* buf, uint64_t count) { } int64_t Linux::writev(int64_t fd, const void* iovdata, int iovcnt) { - assert(fd < processStates_[0].fileDescriptorTable.size()); + assert(fd > 0 && static_cast(fd) < + processStates_[0].fileDescriptorTable.size()); int64_t hfd = processStates_[0].fileDescriptorTable[fd]; if (hfd < 0) { return EBADF; diff --git a/src/lib/kernel/LinuxProcess.cc b/src/lib/kernel/LinuxProcess.cc index e3bc57df6f..795a6e5a8a 100644 --- a/src/lib/kernel/LinuxProcess.cc +++ b/src/lib/kernel/LinuxProcess.cc @@ -17,7 +17,7 @@ uint64_t alignToBoundary(uint64_t value, uint64_t boundary) { } LinuxProcess::LinuxProcess(const std::vector& commandLine, - YAML::Node config) + ryml::ConstNodeRef config) : STACK_SIZE(config["Process-Image"]["Stack-Size"].as()), HEAP_SIZE(config["Process-Image"]["Heap-Size"].as()), commandLine_(commandLine) { @@ -62,11 +62,13 @@ LinuxProcess::LinuxProcess(const std::vector& commandLine, processImage_ = std::shared_ptr(unwrappedProcImgPtr, free); } -LinuxProcess::LinuxProcess(span instructions, YAML::Node config) +LinuxProcess::LinuxProcess(span instructions, + ryml::ConstNodeRef config) : STACK_SIZE(config["Process-Image"]["Stack-Size"].as()), HEAP_SIZE(config["Process-Image"]["Heap-Size"].as()) { - // Leave program command string empty - commandLine_.push_back("\0"); + // Set program command string to the full path of the default program even + // though these aren't the instructions being executed + commandLine_.push_back(SIMENG_SOURCE_DIR "/SimEngDefaultProgram\0"); isValid_ = true; @@ -79,7 +81,7 @@ LinuxProcess::LinuxProcess(span instructions, YAML::Node config) alignToBoundary(heapStart_ + (HEAP_SIZE + STACK_SIZE) / 2, pageSize_); size_ = heapStart_ + HEAP_SIZE + STACK_SIZE; - char* unwrappedProcImgPtr = (char*)malloc(size_ * sizeof(char)); + char* unwrappedProcImgPtr = (char*)calloc(size_, sizeof(char)); std::copy(instructions.begin(), instructions.end(), unwrappedProcImgPtr); createStack(&unwrappedProcImgPtr); @@ -108,7 +110,7 @@ uint64_t LinuxProcess::getProcessImageSize() const { return size_; } uint64_t LinuxProcess::getEntryPoint() const { return entryPoint_; } -uint64_t LinuxProcess::getStackPointer() const { return stackPointer_; } +uint64_t LinuxProcess::getInitialStackPointer() const { return stackPointer_; } void LinuxProcess::createStack(char** processImage) { // Decrement the stack pointer and 
populate with initial stack state @@ -127,7 +129,7 @@ void LinuxProcess::createStack(char** processImage) { initialStackFrame.push_back(commandLine_.size()); // argc for (size_t i = 0; i < commandLine_.size(); i++) { char* argvi = commandLine_[i].data(); - for (int j = 0; j < commandLine_[i].size(); j++) { + for (size_t j = 0; j < commandLine_[i].size(); j++) { stringBytes.push_back(argvi[j]); } stringBytes.push_back(0); @@ -135,10 +137,10 @@ void LinuxProcess::createStack(char** processImage) { // Environment strings std::vector envStrings = {"OMP_NUM_THREADS=1"}; for (std::string& env : envStrings) { - for (int i = 0; i < env.size(); i++) { + for (size_t i = 0; i < env.size(); i++) { stringBytes.push_back(env.c_str()[i]); } - // Null entry to seperate strings + // Null entry to separate strings stringBytes.push_back(0); } @@ -147,9 +149,9 @@ void LinuxProcess::createStack(char** processImage) { stackPointer_ -= alignToBoundary(stringBytes.size() + 1, 32); uint16_t ptrCount = 1; initialStackFrame.push_back(stackPointer_); // argv[0] ptr - for (int i = 0; i < stringBytes.size(); i++) { + for (size_t i = 0; i < stringBytes.size(); i++) { if (ptrCount == commandLine_.size()) { - // null terminator to seperate argv and env strings + // null terminator to separate argv and env strings initialStackFrame.push_back(0); ptrCount++; } @@ -162,8 +164,8 @@ void LinuxProcess::createStack(char** processImage) { initialStackFrame.push_back(0); // null terminator - // ELF auxillary vector, keys defined in `uapi/linux/auxvec.h` - // TODO: populate remaining auxillary vector entries + // ELF auxiliary vector, keys defined in `uapi/linux/auxvec.h` + // TODO: populate remaining auxiliary vector entries initialStackFrame.push_back(auxVec::AT_PHDR); // AT_PHDR initialStackFrame.push_back(progHeaderTableAddress_); diff --git a/src/lib/FixedLatencyMemoryInterface.cc b/src/lib/memory/FixedLatencyMemoryInterface.cc similarity index 88% rename from src/lib/FixedLatencyMemoryInterface.cc rename to src/lib/memory/FixedLatencyMemoryInterface.cc index 7ef042675a..8265ea1895 100644 --- a/src/lib/FixedLatencyMemoryInterface.cc +++ b/src/lib/memory/FixedLatencyMemoryInterface.cc @@ -1,9 +1,11 @@ -#include "simeng/FixedLatencyMemoryInterface.hh" +#include "simeng/memory/FixedLatencyMemoryInterface.hh" -#include +#include namespace simeng { +namespace memory { + FixedLatencyMemoryInterface::FixedLatencyMemoryInterface(char* memory, size_t size, uint16_t latency) @@ -29,8 +31,12 @@ void FixedLatencyMemoryInterface::tick() { if (request.write) { // Write: write data directly to memory - assert(target.address + target.size <= size_ && - "Attempted to write beyond memory limit"); + if (target.address + target.size > size_) { + std::cerr << "[SimEng:FixedLatencyMemoryInterface] Attempted to write " + "beyond memory limit." 
+ << std::endl; + exit(1); + } auto ptr = memory_ + target.address; // Copy the data from the RegisterValue to memory @@ -84,4 +90,5 @@ bool FixedLatencyMemoryInterface::hasPendingRequests() const { return !pendingRequests_.empty(); } +} // namespace memory } // namespace simeng diff --git a/src/lib/FlatMemoryInterface.cc b/src/lib/memory/FlatMemoryInterface.cc similarity index 84% rename from src/lib/FlatMemoryInterface.cc rename to src/lib/memory/FlatMemoryInterface.cc index b3e86b9f48..494a378d9b 100644 --- a/src/lib/FlatMemoryInterface.cc +++ b/src/lib/memory/FlatMemoryInterface.cc @@ -1,10 +1,11 @@ -#include "simeng/FlatMemoryInterface.hh" +#include "simeng/memory/FlatMemoryInterface.hh" -#include #include namespace simeng { +namespace memory { + FlatMemoryInterface::FlatMemoryInterface(char* memory, size_t size) : memory_(memory), size_(size) {} @@ -30,8 +31,12 @@ void FlatMemoryInterface::requestRead(const MemoryAccessTarget& target, void FlatMemoryInterface::requestWrite(const MemoryAccessTarget& target, const RegisterValue& data) { - assert(target.address + target.size <= size_ && - "Attempted to write beyond memory limit"); + if (target.address + target.size > size_) { + std::cerr << "[SimEng:FlatMemoryInterface] Attempted to write " "beyond memory limit." + << std::endl; + exit(1); + } auto ptr = memory_ + target.address; // Copy the data from the RegisterValue to memory @@ -49,4 +54,5 @@ bool FlatMemoryInterface::hasPendingRequests() const { return false; } void FlatMemoryInterface::tick() {} +} // namespace memory } // namespace simeng diff --git a/src/lib/models/emulation/Core.cc b/src/lib/models/emulation/Core.cc index 1229b658ec..fb9694b812 100644 --- a/src/lib/models/emulation/Core.cc +++ b/src/lib/models/emulation/Core.cc @@ -6,21 +6,28 @@ namespace simeng { namespace models { namespace emulation { -// TODO: Expose as config option /** The number of bytes fetched each cycle.
*/ const uint8_t FETCH_SIZE = 4; -const unsigned int clockFrequency = 2.5 * 1e9; -Core::Core(MemoryInterface& instructionMemory, MemoryInterface& dataMemory, - uint64_t entryPoint, uint64_t programByteLength, - const arch::Architecture& isa) - : instructionMemory_(instructionMemory), - dataMemory_(dataMemory), - programByteLength_(programByteLength), - isa_(isa), +Core::Core(memory::MemoryInterface& instructionMemory, + memory::MemoryInterface& dataMemory, uint64_t entryPoint, + uint64_t programByteLength, const arch::Architecture& isa) + : simeng::Core(dataMemory, isa, config::SimInfo::getArchRegStruct()), + instructionMemory_(instructionMemory), + architecturalRegisterFileSet_(registerFileSet_), pc_(entryPoint), - registerFileSet_(isa.getRegisterFileStructures()), - architecturalRegisterFileSet_(registerFileSet_) { + programByteLength_(programByteLength) { + // Ensure both interface types are flat + assert( + (config::SimInfo::getConfig()["L1-Data-Memory"]["Interface-Type"] + .as() == "Flat") && + "Emulation core is only compatible with a Flat Data Memory Interface."); + assert( + (config::SimInfo::getConfig()["L1-Instruction-Memory"]["Interface-Type"] + .as() == "Flat") && + "Emulation core is only compatible with a Flat Instruction Memory " + "Interface."); + // Pre-load the first instruction instructionMemory_.requestRead({pc_, FETCH_SIZE}); @@ -30,8 +37,6 @@ Core::Core(MemoryInterface& instructionMemory, MemoryInterface& dataMemory, } void Core::tick() { - ticks_++; - if (hasHalted_) return; if (pc_ >= programByteLength_) { @@ -39,128 +44,113 @@ void Core::tick() { return; } - if (exceptionHandler_ != nullptr) { - processExceptionHandler(); - return; - } - - if (pendingReads_ > 0) { - // Handle pending reads to a uop - auto& uop = microOps_.front(); + ticks_++; + isa_.updateSystemTimerRegisters(&registerFileSet_, ticks_); - const auto& completedReads = dataMemory_.getCompletedReads(); - for (const auto& response : completedReads) { - assert(pendingReads_ > 0); - uop->supplyData(response.target.address, response.data); - pendingReads_--; - } - dataMemory_.clearCompletedReads(); + // Fetch & Decode + assert(macroOp_.empty() && + "Cannot begin emulation tick with un-executed micro-ops."); + // We only fetch one instruction at a time, so only ever one result in + // complete reads + const auto& instructionBytes = instructionMemory_.getCompletedReads()[0].data; + // Predecode fetched data + std::string disasm; + auto bytesRead = isa_.predecode(instructionBytes.getAsVector(), + FETCH_SIZE, pc_, macroOp_, disasm); + // Clear the fetched data + instructionMemory_.clearCompletedReads(); + + pc_ += bytesRead; + + // Loop over all micro-ops and execute one by one + while (!macroOp_.empty()) { + auto& uop = macroOp_.front(); - if (pendingReads_ == 0) { - // Load complete: resume execution - execute(uop); + if (uop->exceptionEncountered()) { + handleException(uop); + // If fatal, return + if (hasHalted_) return; } - // More data pending, end cycle early - return; - } - - // Fetch - - // Determine if new uops are needed to be fetched - if (!microOps_.size()) { - // Find fetched memory that matches the current PC - const auto& fetched = instructionMemory_.getCompletedReads(); - size_t fetchIndex; - for (fetchIndex = 0; fetchIndex < fetched.size(); fetchIndex++) { - if (fetched[fetchIndex].target.address == pc_) { - break; + // Issue + auto registers = uop->getSourceRegisters(); + for (size_t i = 0; i < registers.size(); i++) { + auto reg = registers[i]; + if (!uop->isOperandReady(i)) { +
uop->supplyOperand(i, registerFileSet_.get(reg)); } } - if (fetchIndex == fetched.size()) { - // Need to wait for fetched instructions - return; - } - - const auto& instructionBytes = fetched[fetchIndex].data; - std::string disasm; - auto bytesRead = isa_.predecode(instructionBytes.getAsVector(), - FETCH_SIZE, pc_, macroOp_, disasm); - - // Clear the fetched data - instructionMemory_.clearCompletedReads(); - - pc_ += bytesRead; - - // Decode - for (size_t index = 0; index < macroOp_.size(); index++) { - microOps_.push(std::move(macroOp_[index])); - } - } - - auto& uop = microOps_.front(); - - if (uop->exceptionEncountered()) { - handleException(uop); - return; - } - - // Issue - auto registers = uop->getOperandRegisters(); - for (size_t i = 0; i < registers.size(); i++) { - auto reg = registers[i]; - if (!uop->isOperandReady(i)) { - uop->supplyOperand(i, registerFileSet_.get(reg)); - } - } - // Execute - if (uop->isLoad()) { - auto addresses = uop->generateAddresses(); - previousAddresses_.clear(); - if (uop->exceptionEncountered()) { - handleException(uop); - return; - } - if (addresses.size() > 0) { - // Memory reads are required; request them, set `pendingReads_` - // accordingly, and end the cycle early + // Execute & Write-back + if (uop->isLoad()) { + auto addresses = uop->generateAddresses(); + previousAddresses_.clear(); + if (uop->exceptionEncountered()) { + handleException(uop); + // If fatal, return + if (hasHalted_) return; + } + if (addresses.size() > 0) { + // Memory reads required; request them + for (auto const& target : addresses) { + dataMemory_.requestRead(target); + // Save addresses for use by instructions that perform a LD and STR + // (i.e. single instruction atomics) + previousAddresses_.push_back(target); + } + // Emulation core can only be used with a Flat memory interface, so data + // is ready immediately + const auto& completedReads = dataMemory_.getCompletedReads(); + assert( + completedReads.size() == addresses.size() && + "Number of completed reads does not match the number of requested " + "reads."); + for (const auto& response : completedReads) { + uop->supplyData(response.target.address, response.data); + } + dataMemory_.clearCompletedReads(); + } + } else if (uop->isStoreAddress()) { + auto addresses = uop->generateAddresses(); + previousAddresses_.clear(); + if (uop->exceptionEncountered()) { + handleException(uop); + // If fatal, return + if (hasHalted_) return; + } + // Store addresses for use by next store data operation in `execute()` for (auto const& target : addresses) { - dataMemory_.requestRead(target); - // Store addresses for use by next store data operation previousAddresses_.push_back(target); } - pendingReads_ = addresses.size(); - return; - } else { - // Early execution due to lacking addresses - execute(uop); - return; - } - } else if (uop->isStoreAddress()) { - auto addresses = uop->generateAddresses(); - previousAddresses_.clear(); - if (uop->exceptionEncountered()) { - handleException(uop); - return; - } - // Store addresses for use by next store data operation - for (auto const& target : addresses) { - previousAddresses_.push_back(target); - } - if (uop->isStoreData()) { - execute(uop); - } else { - // Fetch memory for next cycle - instructionMemory_.requestRead({pc_, FETCH_SIZE}); - microOps_.pop(); + if (!uop->isStoreData()) { + // No further action needed, move onto next micro-op + macroOp_.erase(macroOp_.begin()); + continue; + } } - - return; + execute(uop); + macroOp_.erase(macroOp_.begin()); } + instructionsExecuted_++; + // 
Fetch memory for next cycle + instructionMemory_.requestRead({pc_, FETCH_SIZE}); +} - execute(uop); - isa_.updateSystemTimerRegisters(®isterFileSet_, ticks_); +bool Core::hasHalted() const { return hasHalted_; } + +const ArchitecturalRegisterFileSet& Core::getArchitecturalRegisterFileSet() + const { + return architecturalRegisterFileSet_; +} + +uint64_t Core::getInstructionsRetiredCount() const { + return instructionsExecuted_; +} + +std::map Core::getStats() const { + return {{"cycles", std::to_string(ticks_)}, + {"retired", std::to_string(instructionsExecuted_)}, + {"branch.executed", std::to_string(branchesExecuted_)}}; } void Core::execute(std::shared_ptr& uop) { @@ -172,8 +162,6 @@ void Core::execute(std::shared_ptr& uop) { } if (uop->isStoreData()) { - auto results = uop->getResults(); - auto destinations = uop->getDestinationRegisters(); auto data = uop->getData(); for (size_t i = 0; i < previousAddresses_.size(); i++) { dataMemory_.requestWrite(previousAddresses_[i], data[i]); @@ -184,25 +172,12 @@ void Core::execute(std::shared_ptr& uop) { } // Writeback - auto results = uop->getResults(); - auto destinations = uop->getDestinationRegisters(); - if (uop->isStoreData()) { - for (size_t i = 0; i < results.size(); i++) { - auto reg = destinations[i]; - registerFileSet_.set(reg, results[i]); - } - } else { - for (size_t i = 0; i < results.size(); i++) { - auto reg = destinations[i]; - registerFileSet_.set(reg, results[i]); - } + const auto& results = uop->getResults(); + const auto& destinations = uop->getDestinationRegisters(); + for (size_t i = 0; i < results.size(); i++) { + auto reg = destinations[i]; + registerFileSet_.set(reg, results[i]); } - - if (uop->isLastMicroOp()) instructionsExecuted_++; - - // Fetch memory for next cycle - instructionMemory_.requestRead({pc_, FETCH_SIZE}); - microOps_.pop(); } void Core::handleException(const std::shared_ptr& instruction) { @@ -213,16 +188,9 @@ void Core::handleException(const std::shared_ptr& instruction) { void Core::processExceptionHandler() { assert(exceptionHandler_ != nullptr && "Attempted to process an exception handler that wasn't present"); - if (dataMemory_.hasPendingRequests()) { - // Must wait for all memory requests to complete before processing the - // exception - return; - } - bool success = exceptionHandler_->tick(); - if (!success) { - // Handler needs further ticks to complete - return; + // Tick until true is returned, signifying completion + while (exceptionHandler_->tick() == false) { } const auto& result = exceptionHandler_->getResult(); @@ -238,72 +206,8 @@ void Core::processExceptionHandler() { // Clear the handler exceptionHandler_ = nullptr; - - // Fetch memory for next cycle - instructionMemory_.requestRead({pc_, FETCH_SIZE}); - microOps_.pop(); } -void Core::applyStateChange(const arch::ProcessStateChange& change) { - // Update registers in accoradance with the ProcessStateChange type - switch (change.type) { - case arch::ChangeType::INCREMENT: { - for (size_t i = 0; i < change.modifiedRegisters.size(); i++) { - registerFileSet_.set( - change.modifiedRegisters[i], - registerFileSet_.get(change.modifiedRegisters[i]).get() + - change.modifiedRegisterValues[i].get()); - } - break; - } - case arch::ChangeType::DECREMENT: { - for (size_t i = 0; i < change.modifiedRegisters.size(); i++) { - registerFileSet_.set( - change.modifiedRegisters[i], - registerFileSet_.get(change.modifiedRegisters[i]).get() - - change.modifiedRegisterValues[i].get()); - } - break; - } - default: { // arch::ChangeType::REPLACEMENT - // If 
type is ChangeType::REPLACEMENT, set new values - for (size_t i = 0; i < change.modifiedRegisters.size(); i++) { - registerFileSet_.set(change.modifiedRegisters[i], - change.modifiedRegisterValues[i]); - } - break; - } - } - - // Update memory - // TODO: Analyse if ChangeType::INCREMENT or ChangeType::DECREMENT case is - // required for memory changes - for (size_t i = 0; i < change.memoryAddresses.size(); i++) { - dataMemory_.requestWrite(change.memoryAddresses[i], - change.memoryAddressValues[i]); - } -} - -bool Core::hasHalted() const { return hasHalted_; } - -const ArchitecturalRegisterFileSet& Core::getArchitecturalRegisterFileSet() - const { - return architecturalRegisterFileSet_; -} - -uint64_t Core::getInstructionsRetiredCount() const { - return instructionsExecuted_; -} - -uint64_t Core::getSystemTimer() const { - return ticks_ / (clockFrequency / 1e9); -} - -std::map Core::getStats() const { - return {{"instructions", std::to_string(instructionsExecuted_)}, - {"branch.executed", std::to_string(branchesExecuted_)}}; -}; - } // namespace emulation } // namespace models } // namespace simeng diff --git a/src/lib/models/inorder/Core.cc b/src/lib/models/inorder/Core.cc index bbcb9742f3..787dd3fff4 100644 --- a/src/lib/models/inorder/Core.cc +++ b/src/lib/models/inorder/Core.cc @@ -9,22 +9,20 @@ namespace simeng { namespace models { namespace inorder { -// TODO: Replace with config options -const unsigned int blockSize = 16; -const unsigned int clockFrequency = 2.5 * 1e9; - -Core::Core(MemoryInterface& instructionMemory, MemoryInterface& dataMemory, - uint64_t processMemorySize, uint64_t entryPoint, - const arch::Architecture& isa, BranchPredictor& branchPredictor) - : dataMemory_(dataMemory), - isa_(isa), - registerFileSet_(isa.getRegisterFileStructures()), +Core::Core(memory::MemoryInterface& instructionMemory, + memory::MemoryInterface& dataMemory, uint64_t processMemorySize, + uint64_t entryPoint, const arch::Architecture& isa, + BranchPredictor& branchPredictor) + : simeng::Core(dataMemory, isa, config::SimInfo::getArchRegStruct()), architecturalRegisterFileSet_(registerFileSet_), fetchToDecodeBuffer_(1, {}), decodeToExecuteBuffer_(1, nullptr), completionSlots_(1, {1, nullptr}), fetchUnit_(fetchToDecodeBuffer_, instructionMemory, processMemorySize, - entryPoint, blockSize, isa, branchPredictor), + entryPoint, + config::SimInfo::getConfig()["Fetch"]["Fetch-Block-Size"] + .as(), + isa, branchPredictor), decodeUnit_(fetchToDecodeBuffer_, decodeToExecuteBuffer_, branchPredictor), executeUnit_( @@ -32,19 +30,19 @@ Core::Core(MemoryInterface& instructionMemory, MemoryInterface& dataMemory, [this](auto regs, auto values) { forwardOperands(regs, values); }, [this](auto instruction) { handleLoad(instruction); }, [this](auto instruction) { storeData(instruction); }, - [this](auto instruction) { raiseException(instruction); }, - branchPredictor, false), + [this](auto instruction) { raiseException(instruction); }, false), writebackUnit_(completionSlots_, registerFileSet_, [](auto insnId) {}) { // Query and apply initial state auto state = isa.getInitialState(); applyStateChange(state); -}; +} void Core::tick() { - ticks_++; - if (hasHalted_) return; + ticks_++; + isa_.updateSystemTimerRegisters(&registerFileSet_, ticks_); + if (exceptionHandler_ != nullptr) { processExceptionHandler(); return; @@ -112,7 +110,6 @@ void Core::tick() { } fetchUnit_.requestFromPC(); - isa_.updateSystemTimerRegisters(&registerFileSet_, ticks_); } bool Core::hasHalted() const { @@ -150,34 +147,16 @@ uint64_t
Core::getInstructionsRetiredCount() const { return writebackUnit_.getInstructionsWrittenCount(); } -uint64_t Core::getSystemTimer() const { - // TODO: This will need to be changed if we start supporting DVFS. - return ticks_ / (clockFrequency / 1e9); -} - std::map Core::getStats() const { auto retired = writebackUnit_.getInstructionsWrittenCount(); auto ipc = retired / static_cast(ticks_); std::ostringstream ipcStr; ipcStr << std::setprecision(2) << ipc; - // Sum up the branch stats reported across the execution units. - uint64_t totalBranchesExecuted = 0; - uint64_t totalBranchMispredicts = 0; - totalBranchesExecuted += executeUnit_.getBranchExecutedCount(); - totalBranchMispredicts += executeUnit_.getBranchMispredictedCount(); - auto branchMissRate = 100.0f * static_cast(totalBranchMispredicts) / - static_cast(totalBranchesExecuted); - std::ostringstream branchMissRateStr; - branchMissRateStr << std::setprecision(3) << branchMissRate << "%"; - return {{"cycles", std::to_string(ticks_)}, {"retired", std::to_string(retired)}, {"ipc", ipcStr.str()}, - {"flushes", std::to_string(flushes_)}, - {"branch.executed", std::to_string(totalBranchesExecuted)}, - {"branch.mispredict", std::to_string(totalBranchMispredicts)}, - {"branch.missrate", branchMissRateStr.str()}}; + {"flushes", std::to_string(flushes_)}}; } void Core::raiseException(const std::shared_ptr& instruction) { @@ -232,6 +211,19 @@ void Core::processExceptionHandler() { exceptionHandler_ = nullptr; } +void Core::handleLoad(const std::shared_ptr& instruction) { + loadData(instruction); + if (instruction->exceptionEncountered()) { + raiseException(instruction); + return; + } + + forwardOperands(instruction->getDestinationRegisters(), + instruction->getResults()); + // Manually add the instruction to the writeback input buffer + completionSlots_[0].getTailSlots()[0] = instruction; +} + void Core::loadData(const std::shared_ptr& instruction) { const auto& addresses = instruction->getGeneratedAddresses(); for (const auto& target : addresses) { @@ -280,7 +272,7 @@ void Core::forwardOperands(const span& registers, return; } - auto sourceRegisters = uop->getOperandRegisters(); + auto sourceRegisters = uop->getSourceRegisters(); for (size_t i = 0; i < registers.size(); i++) { // Check each forwarded register vs source operands and supply for each // match @@ -309,7 +301,7 @@ void Core::readRegisters() { // Register read // Identify missing registers and supply values - const auto& sourceRegisters = uop->getOperandRegisters(); + const auto& sourceRegisters = uop->getSourceRegisters(); for (size_t i = 0; i < sourceRegisters.size(); i++) { const auto& reg = sourceRegisters[i]; if (!uop->isOperandReady(i)) { @@ -318,71 +310,6 @@ void Core::readRegisters() { } } -void Core::applyStateChange(const arch::ProcessStateChange& change) { - // Update registers in accoradance with the ProcessStateChange type - switch (change.type) { - case arch::ChangeType::INCREMENT: { - for (size_t i = 0; i < change.modifiedRegisters.size(); i++) { - registerFileSet_.set( - change.modifiedRegisters[i], - registerFileSet_.get(change.modifiedRegisters[i]).get() + - change.modifiedRegisterValues[i].get()); - } - break; - } - case arch::ChangeType::DECREMENT: { - for (size_t i = 0; i < change.modifiedRegisters.size(); i++) { - registerFileSet_.set( - change.modifiedRegisters[i], - registerFileSet_.get(change.modifiedRegisters[i]).get() - - change.modifiedRegisterValues[i].get()); - } - break; - } - default: { // arch::ChangeType::REPLACEMENT - // If type is 
ChangeType::REPLACEMENT, set new values - for (size_t i = 0; i < change.modifiedRegisters.size(); i++) { - registerFileSet_.set(change.modifiedRegisters[i], - change.modifiedRegisterValues[i]); - } - break; - } - } - - // Update memory - // TODO: Analyse if ChangeType::INCREMENT or ChangeType::DECREMENT case is - // required for memory changes - for (size_t i = 0; i < change.memoryAddresses.size(); i++) { - dataMemory_.requestWrite(change.memoryAddresses[i], - change.memoryAddressValues[i]); - } -} - -void Core::handleLoad(const std::shared_ptr& instruction) { - loadData(instruction); - if (instruction->exceptionEncountered()) { - raiseException(instruction); - return; - } - - if (instruction->getTraceId() != 0) { - std::map::iterator it = - traceMap.find(instruction->getTraceId()); - if (it != traceMap.end()) { - cycleTrace tr = it->second->getCycleTraces(); - if (tr.finished != 1) { - tr.complete = trace_cycle; - it->second->setCycleTraces(tr); - } - } - } - - forwardOperands(instruction->getDestinationRegisters(), - instruction->getResults()); - // Manually add the instruction to the writeback input buffer - completionSlots_[0].getTailSlots()[0] = instruction; -} - void Core::flushTraces(const bool atDecode) { // Flush traces from instructions in fetch to decode buffer for (size_t slot = 0; slot < fetchToDecodeBuffer_.getWidth(); slot++) { diff --git a/src/lib/models/outoforder/Core.cc b/src/lib/models/outoforder/Core.cc index b09104ccca..8ea02127a2 100644 --- a/src/lib/models/outoforder/Core.cc +++ b/src/lib/models/outoforder/Core.cc @@ -6,46 +6,62 @@ #include #include -// Temporary; until config options are available -#include "simeng/arch/aarch64/Instruction.hh" namespace simeng { namespace models { namespace outoforder { -// TODO: System register count has to match number of supported system registers -Core::Core(MemoryInterface& instructionMemory, MemoryInterface& dataMemory, - uint64_t processMemorySize, uint64_t entryPoint, - const arch::Architecture& isa, BranchPredictor& branchPredictor, - pipeline::PortAllocator& portAllocator, YAML::Node config) - : isa_(isa), - physicalRegisterStructures_( - isa.getConfigPhysicalRegisterStructure(config)), - physicalRegisterQuantities_( - isa.getConfigPhysicalRegisterQuantities(config)), - registerFileSet_(physicalRegisterStructures_), - registerAliasTable_(isa.getRegisterFileStructures(), +Core::Core(memory::MemoryInterface& instructionMemory, + memory::MemoryInterface& dataMemory, uint64_t processMemorySize, + uint64_t entryPoint, const arch::Architecture& isa, + BranchPredictor& branchPredictor, + pipeline::PortAllocator& portAllocator, ryml::ConstNodeRef config) + : simeng::Core(dataMemory, isa, config::SimInfo::getPhysRegStruct()), + physicalRegisterStructures_(config::SimInfo::getPhysRegStruct()), + physicalRegisterQuantities_(config::SimInfo::getPhysRegQuantities()), + registerAliasTable_(config::SimInfo::getArchRegStruct(), physicalRegisterQuantities_), mappedRegisterFileSet_(registerFileSet_, registerAliasTable_), - dataMemory_(dataMemory), - fetchToDecodeBuffer_( - config["Pipeline-Widths"]["FrontEnd"].as(), {}), + fetchToDecodeBuffer_(config["Pipeline-Widths"]["FrontEnd"].as(), + {}), decodeToRenameBuffer_( - config["Pipeline-Widths"]["FrontEnd"].as(), nullptr), + config["Pipeline-Widths"]["FrontEnd"].as(), nullptr), renameToDispatchBuffer_( - config["Pipeline-Widths"]["FrontEnd"].as(), nullptr), - issuePorts_(config["Execution-Units"].size(), {1, nullptr}), + config["Pipeline-Widths"]["FrontEnd"].as(), nullptr), + 
issuePorts_(config["Execution-Units"].num_children(), {1, nullptr}), completionSlots_( - config["Execution-Units"].size() + - config["Pipeline-Widths"]["LSQ-Completion"].as(), + config["Execution-Units"].num_children() + + config["Pipeline-Widths"]["LSQ-Completion"].as(), {1, nullptr}), + fetchUnit_(fetchToDecodeBuffer_, instructionMemory, processMemorySize, + entryPoint, config["Fetch"]["Fetch-Block-Size"].as(), + isa, branchPredictor), + decodeUnit_(fetchToDecodeBuffer_, decodeToRenameBuffer_, branchPredictor), + renameUnit_(decodeToRenameBuffer_, renameToDispatchBuffer_, + reorderBuffer_, registerAliasTable_, loadStoreQueue_, + physicalRegisterStructures_.size()), + dispatchIssueUnit_(renameToDispatchBuffer_, issuePorts_, registerFileSet_, + portAllocator, physicalRegisterQuantities_), + writebackUnit_( + completionSlots_, registerFileSet_, + [this](auto insnId) { reorderBuffer_.commitMicroOps(insnId); }), + reorderBuffer_( + config["Queue-Sizes"]["ROB"].as(), registerAliasTable_, + loadStoreQueue_, + [this](auto instruction) { raiseException(instruction); }, + [this](auto branchAddress) { + fetchUnit_.registerLoopBoundary(branchAddress); + }, + branchPredictor, config["Fetch"]["Loop-Buffer-Size"].as(), + config["Fetch"]["Loop-Detection-Threshold"].as()), loadStoreQueue_( - config["Queue-Sizes"]["Load"].as(), - config["Queue-Sizes"]["Store"].as(), dataMemory, - {completionSlots_.data() + config["Execution-Units"].size(), - config["Pipeline-Widths"]["LSQ-Completion"].as()}, + config["Queue-Sizes"]["Load"].as(), + config["Queue-Sizes"]["Store"].as(), dataMemory, + {completionSlots_.data() + config["Execution-Units"].num_children(), + config["Pipeline-Widths"]["LSQ-Completion"].as()}, [this](auto regs, auto values) { dispatchIssueUnit_.forwardOperands(regs, values); }, + [](auto uop) { uop->setCommitReady(); }, config["LSQ-L1-Interface"]["Exclusive"].as(), config["LSQ-L1-Interface"]["Load-Bandwidth"].as(), config["LSQ-L1-Interface"]["Store-Bandwidth"].as(), @@ -55,37 +71,15 @@ Core::Core(MemoryInterface& instructionMemory, MemoryInterface& dataMemory, .as(), config["LSQ-L1-Interface"]["Permitted-Stores-Per-Cycle"] .as()), - fetchUnit_(fetchToDecodeBuffer_, instructionMemory, processMemorySize, - entryPoint, config["Fetch"]["Fetch-Block-Size"].as(), - isa, branchPredictor), - reorderBuffer_( - config["Queue-Sizes"]["ROB"].as(), registerAliasTable_, - loadStoreQueue_, - [this](auto instruction) { raiseException(instruction); }, - [this](auto branchAddress) { - fetchUnit_.registerLoopBoundary(branchAddress); - }, - branchPredictor, config["Fetch"]["Loop-Buffer-Size"].as(), - config["Fetch"]["Loop-Detection-Threshold"].as()), - decodeUnit_(fetchToDecodeBuffer_, decodeToRenameBuffer_, branchPredictor), - renameUnit_(decodeToRenameBuffer_, renameToDispatchBuffer_, - reorderBuffer_, registerAliasTable_, loadStoreQueue_, - physicalRegisterStructures_.size()), - dispatchIssueUnit_(renameToDispatchBuffer_, issuePorts_, registerFileSet_, - portAllocator, physicalRegisterQuantities_, config), - writebackUnit_( - completionSlots_, registerFileSet_, - [this](auto insnId) { reorderBuffer_.commitMicroOps(insnId); }), portAllocator_(portAllocator), - clockFrequency_(config["Core"]["Clock-Frequency"].as() * 1e9), - commitWidth_(config["Pipeline-Widths"]["Commit"].as()) { - for (size_t i = 0; i < config["Execution-Units"].size(); i++) { + commitWidth_(config["Pipeline-Widths"]["Commit"].as()), + branchPredictor_(branchPredictor) { + for (size_t i = 0; i < config["Execution-Units"].num_children(); i++) { // 
Create vector of blocking groups std::vector blockingGroups = {}; - if (config["Execution-Units"][i]["Blocking-Groups"].IsDefined()) { - for (YAML::Node gp : config["Execution-Units"][i]["Blocking-Groups"]) { - blockingGroups.push_back(gp.as()); - } + for (ryml::ConstNodeRef grp : + config["Execution-Units"][i]["Blocking-Group-Nums"]) { + blockingGroups.push_back(grp.as()); } executionUnits_.emplace_back( issuePorts_[i], completionSlots_[i], @@ -94,24 +88,25 @@ Core::Core(MemoryInterface& instructionMemory, MemoryInterface& dataMemory, }, [this](auto uop) { loadStoreQueue_.startLoad(uop); }, [this](auto uop) { loadStoreQueue_.supplyStoreData(uop); }, - [](auto uop) { uop->setCommitReady(); }, branchPredictor, + [](auto uop) { uop->setCommitReady(); }, config["Execution-Units"][i]["Pipelined"].as(), blockingGroups); } // Provide reservation size getter to A64FX port allocator - portAllocator.setRSSizeGetter([this](std::vector& sizeVec) { + portAllocator.setRSSizeGetter([this](std::vector& sizeVec) { dispatchIssueUnit_.getRSSizes(sizeVec); }); // Query and apply initial state auto state = isa.getInitialState(); applyStateChange(state); -}; +} void Core::tick() { - ticks_++; - if (hasHalted_) return; + ticks_++; + isa_.updateSystemTimerRegisters(&registerFileSet_, ticks_); + if (exceptionHandler_ != nullptr) { processExceptionHandler(); return; @@ -163,73 +158,6 @@ void Core::tick() { flushIfNeeded(); fetchUnit_.requestFromPC(); - isa_.updateSystemTimerRegisters(&registerFileSet_, ticks_); -} - -void Core::flushIfNeeded() { - // Check for flush - bool euFlush = false; - uint64_t targetAddress = 0; - uint64_t lowestSeqId = 0; - for (const auto& eu : executionUnits_) { - if (eu.shouldFlush() && (!euFlush || eu.getFlushSeqId() < lowestSeqId)) { - euFlush = true; - lowestSeqId = eu.getFlushSeqId(); - targetAddress = eu.getFlushAddress(); - } - } - if (euFlush || reorderBuffer_.shouldFlush()) { - // Flush was requested in an out-of-order stage. - // Update PC and wipe in-order buffers (Fetch/Decode, Decode/Rename, - // Rename/Dispatch) - - if (reorderBuffer_.shouldFlush() && - (!euFlush || reorderBuffer_.getFlushSeqId() < lowestSeqId)) { - // If the reorder buffer found an older instruction to flush up to, do - // that instead - lowestSeqId = reorderBuffer_.getFlushSeqId(); - targetAddress = reorderBuffer_.getFlushAddress(); - } - - fetchUnit_.flushLoopBuffer(); - // Set all flushed instructions to a finished state - flushTraces(false); - - fetchUnit_.updatePC(targetAddress); - fetchToDecodeBuffer_.fill({}); - fetchToDecodeBuffer_.stall(false); - - decodeToRenameBuffer_.fill(nullptr); - decodeToRenameBuffer_.stall(false); - - renameToDispatchBuffer_.fill(nullptr); - renameToDispatchBuffer_.stall(false); - - // Flush everything younger than the bad instruction from the ROB - reorderBuffer_.flush(lowestSeqId); - decodeUnit_.purgeFlushed(); - dispatchIssueUnit_.purgeFlushed(); - loadStoreQueue_.purgeFlushed(); - for (auto& eu : executionUnits_) { - eu.purgeFlushed(); - } - - flushes_++; - } else if (decodeUnit_.shouldFlush()) { - // Flush was requested at decode stage - // Update PC and wipe Fetch/Decode buffer.
- targetAddress = decodeUnit_.getFlushAddress(); - - fetchUnit_.flushLoopBuffer(); - // Set all flushed instructions to a finished state - flushTraces(true); - - fetchUnit_.updatePC(targetAddress); - fetchToDecodeBuffer_.fill({}); - fetchToDecodeBuffer_.stall(false); - - flushes_++; - } } void Core::flushTraces(const bool atDecode) { @@ -346,6 +274,66 @@ bool Core::hasHalted() const { return true; } +const ArchitecturalRegisterFileSet& Core::getArchitecturalRegisterFileSet() + const { + return mappedRegisterFileSet_; +} + +uint64_t Core::getInstructionsRetiredCount() const { + return reorderBuffer_.getInstructionsCommittedCount(); +} + +std::map Core::getStats() const { + auto retired = reorderBuffer_.getInstructionsCommittedCount(); + auto ipc = retired / static_cast(ticks_); + std::ostringstream ipcStr; + ipcStr << std::setprecision(2) << ipc; + + auto branchStalls = fetchUnit_.getBranchStalls(); + + auto earlyFlushes = decodeUnit_.getEarlyFlushes(); + + auto allocationStalls = renameUnit_.getAllocationStalls(); + auto robStalls = renameUnit_.getROBStalls(); + auto lqStalls = renameUnit_.getLoadQueueStalls(); + auto sqStalls = renameUnit_.getStoreQueueStalls(); + + auto rsStalls = dispatchIssueUnit_.getRSStalls(); + auto frontendStalls = dispatchIssueUnit_.getFrontendStalls(); + auto backendStalls = dispatchIssueUnit_.getBackendStalls(); + auto portBusyStalls = dispatchIssueUnit_.getPortBusyStalls(); + + uint64_t totalBranchesFetched = fetchUnit_.getBranchFetchedCount(); + uint64_t totalBranchesRetired = reorderBuffer_.getRetiredBranchesCount(); + uint64_t totalBranchMispredicts = reorderBuffer_.getBranchMispredictedCount(); + + auto branchMissRate = 100.0 * static_cast(totalBranchMispredicts) / + static_cast(totalBranchesRetired); + std::ostringstream branchMissRateStr; + branchMissRateStr << std::setprecision(3) << branchMissRate << "%"; + + return {{"cycles", std::to_string(ticks_)}, + {"retired", std::to_string(retired)}, + {"ipc", ipcStr.str()}, + {"flushes", std::to_string(flushes_)}, + {"fetch.branchStalls", std::to_string(branchStalls)}, + {"decode.earlyFlushes", std::to_string(earlyFlushes)}, + {"rename.allocationStalls", std::to_string(allocationStalls)}, + {"rename.robStalls", std::to_string(robStalls)}, + {"rename.lqStalls", std::to_string(lqStalls)}, + {"rename.sqStalls", std::to_string(sqStalls)}, + {"dispatch.rsStalls", std::to_string(rsStalls)}, + {"issue.frontendStalls", std::to_string(frontendStalls)}, + {"issue.backendStalls", std::to_string(backendStalls)}, + {"issue.portBusyStalls", std::to_string(portBusyStalls)}, + {"branch.fetched", std::to_string(totalBranchesFetched)}, + {"branch.retired", std::to_string(totalBranchesRetired)}, + {"branch.mispredicted", std::to_string(totalBranchMispredicts)}, + {"branch.missrate", branchMissRateStr.str()}, + {"lsq.loadViolations", + std::to_string(reorderBuffer_.getViolatingLoadsCount())}}; +} + void Core::raiseException(const std::shared_ptr& instruction) { exceptionGenerated_ = true; exceptionGeneratingInstruction_ = instruction; @@ -355,12 +343,18 @@ void Core::handleException() { // Set all flushed instructions to a finished state flushTraces(false); + // Check for branch instructions in buffer, and flush them from the BP. 
+ // Then empty the buffers + branchPredictor_.flushBranchesInBufferFromSelf(fetchToDecodeBuffer_); fetchToDecodeBuffer_.fill({}); fetchToDecodeBuffer_.stall(false); + branchPredictor_.flushBranchesInBufferFromSelf(decodeToRenameBuffer_); decodeToRenameBuffer_.fill(nullptr); decodeToRenameBuffer_.stall(false); + // Instructions in this buffer are already accounted for in the ROB so no + // need to check for branch instructions in this buffer renameToDispatchBuffer_.fill(nullptr); renameToDispatchBuffer_.stall(false); @@ -410,114 +404,79 @@ void Core::processExceptionHandler() { exceptionHandler_ = nullptr; } -void Core::applyStateChange(const arch::ProcessStateChange& change) { - // Update registers in accoradance with the ProcessStateChange type - switch (change.type) { - case arch::ChangeType::INCREMENT: { - for (size_t i = 0; i < change.modifiedRegisters.size(); i++) { - mappedRegisterFileSet_.set( - change.modifiedRegisters[i], - mappedRegisterFileSet_.get(change.modifiedRegisters[i]) - .get() + - change.modifiedRegisterValues[i].get()); - } - break; - } - case arch::ChangeType::DECREMENT: { - for (size_t i = 0; i < change.modifiedRegisters.size(); i++) { - mappedRegisterFileSet_.set( - change.modifiedRegisters[i], - mappedRegisterFileSet_.get(change.modifiedRegisters[i]) - .get() - - change.modifiedRegisterValues[i].get()); - } - break; - } - default: { // arch::ChangeType::REPLACEMENT - // If type is ChangeType::REPLACEMENT, set new values - for (size_t i = 0; i < change.modifiedRegisters.size(); i++) { - mappedRegisterFileSet_.set(change.modifiedRegisters[i], - change.modifiedRegisterValues[i]); - } - break; +void Core::flushIfNeeded() { + // Check for flush + bool euFlush = false; + uint64_t targetAddress = 0; + uint64_t lowestInsnId = 0; + for (const auto& eu : executionUnits_) { + if (eu.shouldFlush() && (!euFlush || eu.getFlushInsnId() < lowestInsnId)) { + euFlush = true; + lowestInsnId = eu.getFlushInsnId(); + targetAddress = eu.getFlushAddress(); } } + if (euFlush || reorderBuffer_.shouldFlush()) { + // Flush was requested in an out-of-order stage. + // Update PC and wipe in-order buffers (Fetch/Decode, Decode/Rename, + // Rename/Dispatch) - // Update memory - // TODO: Analyse if ChangeType::INCREMENT or ChangeType::DECREMENT case is - // required for memory changes - for (size_t i = 0; i < change.memoryAddresses.size(); i++) { - dataMemory_.requestWrite(change.memoryAddresses[i], - change.memoryAddressValues[i]); - } -} - -const ArchitecturalRegisterFileSet& Core::getArchitecturalRegisterFileSet() - const { - return mappedRegisterFileSet_; -} + if (reorderBuffer_.shouldFlush() && + (!euFlush || reorderBuffer_.getFlushInsnId() < lowestInsnId)) { + // If the reorder buffer found an older instruction to flush up to, do + // that instead + lowestInsnId = reorderBuffer_.getFlushInsnId(); + targetAddress = reorderBuffer_.getFlushAddress(); + } -uint64_t Core::getInstructionsRetiredCount() const { - return reorderBuffer_.getInstructionsCommittedCount(); -} + // Check for branch instructions in buffer, and flush them from the BP. + // Then empty the buffers + fetchUnit_.flushLoopBuffer(); + fetchUnit_.updatePC(targetAddress); + branchPredictor_.flushBranchesInBufferFromSelf(fetchToDecodeBuffer_); + fetchToDecodeBuffer_.fill({}); + fetchToDecodeBuffer_.stall(false); -uint64_t Core::getSystemTimer() const { - // TODO: This will need to be changed if we start supporting DVFS. 
- return ticks_ / (clockFrequency_ / 1e9); -} + // Set all flushed instructions to a finished state + flushTraces(false); -std::map Core::getStats() const { - auto retired = reorderBuffer_.getInstructionsCommittedCount(); - auto ipc = retired / static_cast(ticks_); - std::ostringstream ipcStr; - ipcStr << std::setprecision(2) << ipc; + branchPredictor_.flushBranchesInBufferFromSelf(decodeToRenameBuffer_); + decodeToRenameBuffer_.fill(nullptr); + decodeToRenameBuffer_.stall(false); - auto branchStalls = fetchUnit_.getBranchStalls(); + // Instructions in this buffer are already accounted for in the ROB so no + // need to check for branch instructions in this buffer + renameToDispatchBuffer_.fill(nullptr); + renameToDispatchBuffer_.stall(false); - auto earlyFlushes = decodeUnit_.getEarlyFlushes(); + // Flush everything younger than the bad instruction from the ROB + reorderBuffer_.flush(lowestInsnId); + decodeUnit_.purgeFlushed(); + dispatchIssueUnit_.purgeFlushed(); + loadStoreQueue_.purgeFlushed(); + for (auto& eu : executionUnits_) { + eu.purgeFlushed(); + } - auto allocationStalls = renameUnit_.getAllocationStalls(); - auto robStalls = renameUnit_.getROBStalls(); - auto lqStalls = renameUnit_.getLoadQueueStalls(); - auto sqStalls = renameUnit_.getStoreQueueStalls(); + flushes_++; + } else if (decodeUnit_.shouldFlush()) { + // Flush was requested at decode stage + // Update PC and wipe Fetch/Decode buffer. + targetAddress = decodeUnit_.getFlushAddress(); - auto rsStalls = dispatchIssueUnit_.getRSStalls(); - auto frontendStalls = dispatchIssueUnit_.getFrontendStalls(); - auto backendStalls = dispatchIssueUnit_.getBackendStalls(); - auto portBusyStalls = dispatchIssueUnit_.getPortBusyStalls(); + // Check for branch instructions in buffer, and flush them from the BP. + // Then empty the buffers + fetchUnit_.flushLoopBuffer(); + fetchUnit_.updatePC(targetAddress); + branchPredictor_.flushBranchesInBufferFromSelf(fetchToDecodeBuffer_); + fetchToDecodeBuffer_.fill({}); + fetchToDecodeBuffer_.stall(false); - uint64_t totalBranchesExecuted = 0; - uint64_t totalBranchMispredicts = 0; + // Set all flushed instructions to a finished state + flushTraces(true); - // Sum up the branch stats reported across the execution units. 
- for (auto& eu : executionUnits_) { - totalBranchesExecuted += eu.getBranchExecutedCount(); - totalBranchMispredicts += eu.getBranchMispredictedCount(); + flushes_++; } - auto branchMissRate = 100.0f * static_cast(totalBranchMispredicts) / - static_cast(totalBranchesExecuted); - std::ostringstream branchMissRateStr; - branchMissRateStr << std::setprecision(3) << branchMissRate << "%"; - - return {{"cycles", std::to_string(ticks_)}, - {"retired", std::to_string(retired)}, - {"ipc", ipcStr.str()}, - {"flushes", std::to_string(flushes_)}, - {"fetch.branchStalls", std::to_string(branchStalls)}, - {"decode.earlyFlushes", std::to_string(earlyFlushes)}, - {"rename.allocationStalls", std::to_string(allocationStalls)}, - {"rename.robStalls", std::to_string(robStalls)}, - {"rename.lqStalls", std::to_string(lqStalls)}, - {"rename.sqStalls", std::to_string(sqStalls)}, - {"dispatch.rsStalls", std::to_string(rsStalls)}, - {"issue.frontendStalls", std::to_string(frontendStalls)}, - {"issue.backendStalls", std::to_string(backendStalls)}, - {"issue.portBusyStalls", std::to_string(portBusyStalls)}, - {"branch.executed", std::to_string(totalBranchesExecuted)}, - {"branch.mispredict", std::to_string(totalBranchMispredicts)}, - {"branch.missrate", branchMissRateStr.str()}, - {"lsq.loadViolations", - std::to_string(reorderBuffer_.getViolatingLoadsCount())}}; } } // namespace outoforder diff --git a/src/lib/pipeline/A64FXPortAllocator.cc b/src/lib/pipeline/A64FXPortAllocator.cc index ba1db0471e..d7c1057bb3 100644 --- a/src/lib/pipeline/A64FXPortAllocator.cc +++ b/src/lib/pipeline/A64FXPortAllocator.cc @@ -9,7 +9,7 @@ namespace pipeline { A64FXPortAllocator::A64FXPortAllocator( const std::vector>& portArrangement) - : // Initiliase reservation station to port mapping + : // Initialise reservation station to port mapping rsToPort_({{0, 1, 2}, {3, 4}, {5}, {6}, {7}}) {} uint16_t A64FXPortAllocator::allocate(const std::vector& ports) { @@ -19,22 +19,24 @@ uint16_t A64FXPortAllocator::allocate(const std::vector& ports) { uint16_t rs = 0; uint16_t port = 0; - bool foundRS = false; - bool foundPort = false; + // Both only used in assertions so produces warning in release mode + [[maybe_unused]] bool foundRS = false; + [[maybe_unused]] bool foundPort = false; if (attribute == InstructionAttribute::RSX) { // Get difference between free entries of RSE{0|1} and RSA{0|1} - int diffRSE = (freeEntries_[0] + freeEntries_[1]) - - (freeEntries_[2] + freeEntries_[3]); - int diffRSA = (freeEntries_[2] + freeEntries_[3]) - - (freeEntries_[0] + freeEntries_[1]); + int32_t totalRSE = freeEntries_[0] + freeEntries_[1]; + int32_t totalRSA = freeEntries_[2] + freeEntries_[3]; + int32_t diffRSE = totalRSE - totalRSA; + int32_t diffRSA = totalRSA - totalRSE; + // Set threshold values - int thresholdA = 4; - int thresholdB = 4; - int thresholdC = 4; + int32_t thresholdA = 4; + int32_t thresholdB = 4; + int32_t thresholdC = 4; if (diffRSE >= thresholdA) { - if ((freeEntries_[0] - freeEntries_[1]) >= thresholdB) { + if (((int32_t)freeEntries_[0] - (int32_t)freeEntries_[1]) >= thresholdB) { rs = RSEm_; // Table 1 } else { rs = dispatchSlot_ % 2 == 0 ? 
RSEm_ : RSEf_; // Table 2 @@ -145,12 +147,14 @@ uint16_t A64FXPortAllocator::allocate(const std::vector& ports) { } void A64FXPortAllocator::issued(uint16_t port) {} -void A64FXPortAllocator::deallocate(uint16_t port) { issued(port); }; + +void A64FXPortAllocator::deallocate(uint16_t port) { issued(port); } uint8_t A64FXPortAllocator::attributeMapping( const std::vector& ports) { uint8_t attribute = 0; - bool foundAttribute = false; + // Only used in assertion so produces warning in release mode + [[maybe_unused]] bool foundAttribute = false; if (ports == EXA_EXB_EAGA_EAGB) { // EXA,EXB,EAGA,EAGB attribute = InstructionAttribute::RSX; foundAttribute = true; @@ -176,7 +180,7 @@ uint8_t A64FXPortAllocator::attributeMapping( } void A64FXPortAllocator::setRSSizeGetter( - std::function&)> rsSizes) { + std::function&)> rsSizes) { rsSizes_ = rsSizes; } diff --git a/src/lib/pipeline/BalancedPortAllocator.cc b/src/lib/pipeline/BalancedPortAllocator.cc index a3db3f85e4..971755e3f5 100644 --- a/src/lib/pipeline/BalancedPortAllocator.cc +++ b/src/lib/pipeline/BalancedPortAllocator.cc @@ -35,10 +35,10 @@ void BalancedPortAllocator::issued(uint16_t port) { assert(weights[port] > 0); weights[port]--; } -void BalancedPortAllocator::deallocate(uint16_t port) { issued(port); }; +void BalancedPortAllocator::deallocate(uint16_t port) { issued(port); } void BalancedPortAllocator::setRSSizeGetter( - std::function&)> rsSizes) { + std::function&)> rsSizes) { rsSizes_ = rsSizes; } diff --git a/src/lib/pipeline/DecodeUnit.cc b/src/lib/pipeline/DecodeUnit.cc index 2cea7a547d..8f139fd7b4 100644 --- a/src/lib/pipeline/DecodeUnit.cc +++ b/src/lib/pipeline/DecodeUnit.cc @@ -8,7 +8,7 @@ namespace pipeline { DecodeUnit::DecodeUnit(PipelineBuffer& input, PipelineBuffer>& output, BranchPredictor& predictor) - : input_(input), output_(output), predictor_(predictor){}; + : input_(input), output_(output), predictor_(predictor) {} void DecodeUnit::tick() { // Stall if output buffer is stalled @@ -48,77 +48,14 @@ void DecodeUnit::tick() { if (!microOps_.size()) break; // Move uop to output buffer and remove from internal buffer - auto& uop = (output_.getTailSlots()[slot] = std::move(microOps_.front())); + output_.getTailSlots()[slot] = std::move(microOps_.front()); microOps_.pop_front(); - // Store cycle at which instruction was decoded - if (uop->getTraceId() != 0) { - std::map::iterator it = - traceMap.find(uop->getTraceId()); - if (it != traceMap.end()) { - cycleTrace tr = it->second->getCycleTraces(); - if (tr.finished != 1) { - tr.decode = trace_cycle; - it->second->setCycleTraces(tr); - } - } - } - - // Check preliminary branch prediction results now that the instruction is - // decoded. 
Identifies: - // - Non-branch instructions mistakenly predicted as branches - // - Incorrect targets for immediate branches - auto [misprediction, correctAddress] = uop->checkEarlyBranchMisprediction(); - if (misprediction) { - earlyFlushes_++; - shouldFlush_ = true; - pc_ = correctAddress; - - if (!uop->isBranch()) { - // Non-branch incorrectly predicted as a branch; let the predictor know - predictor_.update(uop->getInstructionAddress(), false, pc_, - uop->getBranchType()); - } - // Remove macro-operations in microOps_ buffer after macro-operation - // decoded in this cycle - auto uopIt = microOps_.begin(); - // Find first microOps_ entry not belonging to same address as flushing - // instruction - while (uopIt != microOps_.end()) { - if ((*uopIt)->getInstructionAddress() != uop->getInstructionAddress()) { - break; - } else { - uopIt++; - } - } - // Remove all entries after first macro-operation in buffer - while (uopIt != microOps_.end()) { - if ((*uopIt)->getTraceId() != 0) { - std::map::iterator it = - traceMap.find((*uopIt)->getTraceId()); - if (it != traceMap.end()) { - cycleTrace tr = it->second->getCycleTraces(); - tr.finished = 1; - it->second->setCycleTraces(tr); - } - } - uopIt = microOps_.erase(uopIt); - } - - // Branch.decode.earlyMisprediction - probeTrace newProbe = {13, trace_cycle, uop->getTraceId()}; - Trace* newTrace = new Trace; - newTrace->setProbeTraces(newProbe); - probeList.push_back(newTrace); - - // Skip processing remaining uops, as they need to be flushed - break; - } } } bool DecodeUnit::shouldFlush() const { return shouldFlush_; } uint64_t DecodeUnit::getFlushAddress() const { return pc_; } -uint64_t DecodeUnit::getEarlyFlushes() const { return earlyFlushes_; }; +uint64_t DecodeUnit::getEarlyFlushes() const { return earlyFlushes_; } void DecodeUnit::purgeFlushed() { auto uopIt = microOps_.begin(); @@ -134,7 +71,12 @@ void DecodeUnit::purgeFlushed() { } uopIt++; } - microOps_.clear(); + + while (!microOps_.empty()) { + if (microOps_.back()->isBranch()) + predictor_.flush(microOps_.back()->getInstructionAddress()); + microOps_.pop_back(); + } } } // namespace pipeline diff --git a/src/lib/pipeline/DispatchIssueUnit.cc b/src/lib/pipeline/DispatchIssueUnit.cc index d710a2d085..a763aeabf8 100644 --- a/src/lib/pipeline/DispatchIssueUnit.cc +++ b/src/lib/pipeline/DispatchIssueUnit.cc @@ -10,7 +10,8 @@ DispatchIssueUnit::DispatchIssueUnit( PipelineBuffer>& fromRename, std::vector>>& issuePorts, const RegisterFileSet& registerFileSet, PortAllocator& portAllocator, - const std::vector& physicalRegisterStructure, YAML::Node config) + const std::vector& physicalRegisterStructure, + ryml::ConstNodeRef config) : input_(fromRename), issuePorts_(issuePorts), registerFileSet_(registerFileSet), @@ -24,23 +25,24 @@ DispatchIssueUnit::DispatchIssueUnit( } // Create set of reservation station structs with correct issue port // mappings - for (size_t i = 0; i < config["Reservation-Stations"].size(); i++) { + for (size_t i = 0; i < config["Reservation-Stations"].num_children(); i++) { // Iterate over each reservation station in config auto reservation_station = config["Reservation-Stations"][i]; // Create ReservationStation struct to be stored ReservationStation rs = { - reservation_station["Size"].as(), + reservation_station["Size"].as(), reservation_station["Dispatch-Rate"].as(), - 0, + 0ul, {}}; // Resize rs port attribute to match what's defined in config file - rs.ports.resize(reservation_station["Ports"].size()); - for (size_t j = 0; j < reservation_station["Ports"].size(); 
j++) { + rs.ports.resize(reservation_station["Port-Nums"].num_children()); + for (size_t j = 0; j < reservation_station["Port-Nums"].num_children(); + j++) { // Iterate over issue ports in config - uint16_t issue_port = reservation_station["Ports"][j].as(); + uint16_t issue_port = reservation_station["Port-Nums"][j].as(); rs.ports[j].issuePort = issue_port; // Add port mapping entry, resizing vector if needed - if ((issue_port + 1) > portMapping_.size()) { + if ((size_t)(issue_port + 1) > portMapping_.size()) { portMapping_.resize((issue_port + 1)); } portMapping_[issue_port] = {i, j}; @@ -65,26 +67,29 @@ void DispatchIssueUnit::tick() { continue; } - const std::vector& supportedPorts = uop->getSupportedPorts(); + std::vector supportedPorts = uop->getSupportedPorts(); if (uop->exceptionEncountered()) { // Exception; mark as ready to commit, and remove from pipeline uop->setCommitReady(); input_.getHeadSlots()[slot] = nullptr; continue; } - // Allocate issue port to uop - uint16_t port = portAllocator_.allocate(supportedPorts); - uint16_t RS_Index = portMapping_[port].first; - uint16_t RS_Port = portMapping_[port].second; - assert(RS_Index < reservationStations_.size() && - "Allocated port inaccessible"); - ReservationStation& rs = reservationStations_[RS_Index]; - // When appropriate, stall uop or input buffer if stall buffer full - if (rs.currentSize == rs.capacity || - dispatches_[RS_Index] == rs.dispatchRate) { - // Deallocate port given - portAllocator_.deallocate(port); + // Loop through all ports and remove any who's RS is at capacity or dispatch + // rate has been met + auto portIt = supportedPorts.begin(); + while (portIt != supportedPorts.end()) { + uint16_t RS_Index = portMapping_[*portIt].first; + ReservationStation* rs = &reservationStations_[RS_Index]; + if (rs->currentSize == rs->capacity || + dispatches_[RS_Index] == rs->dispatchRate) { + portIt = supportedPorts.erase(portIt); + } else { + portIt++; + } + } + // If no ports left, stall and return + if (supportedPorts.size() == 0) { input_.stall(true); // Stalled.dispatch.rsFull probeTrace newProbe = {6, trace_cycle, uop->getTraceId()}; @@ -95,12 +100,20 @@ void DispatchIssueUnit::tick() { return; } + // Find an available RS + uint16_t port = portAllocator_.allocate(supportedPorts); + uint16_t RS_Index = portMapping_[port].first; + uint16_t RS_Port = portMapping_[port].second; + assert(RS_Index < reservationStations_.size() && + "Allocated port inaccessible"); + ReservationStation* rs = &reservationStations_[RS_Index]; + // Assume the uop will be ready bool ready = true; // Register read // Identify remaining missing registers and supply values - auto& sourceRegisters = uop->getOperandRegisters(); + auto& sourceRegisters = uop->getSourceRegisters(); for (uint16_t i = 0; i < sourceRegisters.size(); i++) { const auto& reg = sourceRegisters[i]; @@ -126,7 +139,7 @@ void DispatchIssueUnit::tick() { // Increment dispatches made and RS occupied entries size dispatches_[RS_Index]++; - rs.currentSize++; + rs->currentSize++; // Store cycle at which instruction was dispatched if (uop->getTraceId() != 0) { @@ -142,7 +155,7 @@ void DispatchIssueUnit::tick() { } if (ready) { - rs.ports[RS_Port].ready.push_back(std::move(uop)); + rs->ports[RS_Port].ready.push_back(std::move(uop)); } input_.getHeadSlots()[slot] = nullptr; @@ -241,10 +254,6 @@ void DispatchIssueUnit::forwardOperands(const span& registers, } } -void DispatchIssueUnit::setRegisterReady(Register reg) { - scoreboard_[reg.type][reg.tag] = true; -} - void 
DispatchIssueUnit::purgeFlushed() { for (size_t i = 0; i < reservationStations_.size(); i++) { // Search the ready queues for flushed instructions and remove them @@ -303,7 +312,7 @@ uint64_t DispatchIssueUnit::getPortBusyStalls() const { return portBusyStalls_; } -void DispatchIssueUnit::getRSSizes(std::vector& sizes) const { +void DispatchIssueUnit::getRSSizes(std::vector& sizes) const { for (auto& rs : reservationStations_) { sizes.push_back(rs.capacity - rs.currentSize); } diff --git a/src/lib/pipeline/ExecuteUnit.cc b/src/lib/pipeline/ExecuteUnit.cc index 60eabf418d..bc3df7b197 100644 --- a/src/lib/pipeline/ExecuteUnit.cc +++ b/src/lib/pipeline/ExecuteUnit.cc @@ -13,15 +13,13 @@ ExecuteUnit::ExecuteUnit( std::function&)> handleLoad, std::function&)> handleStore, std::function&)> raiseException, - BranchPredictor& predictor, bool pipelined, - const std::vector& blockingGroups) + bool pipelined, const std::vector& blockingGroups) : input_(input), output_(output), forwardOperands_(forwardOperands), handleLoad_(handleLoad), handleStore_(handleStore), raiseException_(raiseException), - predictor_(predictor), pipelined_(pipelined), blockingGroups_(blockingGroups) {} @@ -185,13 +183,6 @@ void ExecuteUnit::execute(std::shared_ptr& uop) { if (uop->isBranch()) { pc_ = uop->getBranchAddress(); - // Update branch predictor with branch results - predictor_.update(uop->getInstructionAddress(), uop->wasBranchTaken(), pc_, - uop->getBranchType()); - - // Update the branch instruction counter - branchesExecuted_++; - if (uop->wasBranchMispredicted()) { // Branch.execute.misprediction probeTrace newProbe = {14, trace_cycle, uop->getTraceId()}; @@ -202,8 +193,6 @@ void ExecuteUnit::execute(std::shared_ptr& uop) { // Misprediction; flush the pipeline shouldFlush_ = true; flushAfter_ = uop->getInstructionId(); - // Update the branch misprediction counter - branchMispredicts_++; } } @@ -222,7 +211,7 @@ void ExecuteUnit::execute(std::shared_ptr& uop) { bool ExecuteUnit::shouldFlush() const { return shouldFlush_; } uint64_t ExecuteUnit::getFlushAddress() const { return pc_; } -uint64_t ExecuteUnit::getFlushSeqId() const { return flushAfter_; } +uint64_t ExecuteUnit::getFlushInsnId() const { return flushAfter_; } void ExecuteUnit::purgeFlushed() { if (pipeline_.size() == 0) { @@ -272,13 +261,6 @@ void ExecuteUnit::purgeFlushed() { } } -uint64_t ExecuteUnit::getBranchExecutedCount() const { - return branchesExecuted_; -} -uint64_t ExecuteUnit::getBranchMispredictedCount() const { - return branchMispredicts_; -} - uint64_t ExecuteUnit::getCycles() const { return cycles_; } bool ExecuteUnit::isEmpty() const { diff --git a/src/lib/pipeline/FetchUnit.cc b/src/lib/pipeline/FetchUnit.cc index bd6a1d7828..66ae9d01bf 100644 --- a/src/lib/pipeline/FetchUnit.cc +++ b/src/lib/pipeline/FetchUnit.cc @@ -4,9 +4,9 @@ namespace simeng { namespace pipeline { FetchUnit::FetchUnit(PipelineBuffer& output, - MemoryInterface& instructionMemory, + memory::MemoryInterface& instructionMemory, uint64_t programByteLength, uint64_t entryPoint, - uint8_t blockSize, const arch::Architecture& isa, + uint16_t blockSize, const arch::Architecture& isa, BranchPredictor& branchPredictor) : output_(output), pc_(entryPoint), @@ -40,14 +40,30 @@ void FetchUnit::tick() { auto& macroOp = outputSlots[slot]; std::string disasm; auto bytesRead = isa_.predecode( - &(loopBuffer_.front().encoding), loopBuffer_.front().instructionSize, - loopBuffer_.front().address, macroOp, disasm); - - assert(bytesRead != 0 && "predecode failure for loop buffer entry"); + 
reinterpret_cast(&(loopBuffer_.front().encoding)), + loopBuffer_.front().instructionSize, loopBuffer_.front().address, + macroOp, disasm); + + if (bytesRead == 0) { + std::cout << "[SimEng:FetchUnit] Predecode returned 0 bytes while loop " + "buffer supplying" + << std::endl; + exit(1); + } // Set prediction to recorded value during loop buffer filling if (macroOp[0]->isBranch()) { macroOp[0]->setBranchPrediction(loopBuffer_.front().prediction); + // Calling predict() in order to log the branch in the branch + // predictor. The branch needs to be logged in the branch predictor + // so that the branch predictor has the information needed to update + // itself when the branch instruction is retired. However, we are + // reusing the prediction from the loop buffer, thus we do not + // use the return value from predict(). + branchPredictor_.predict(macroOp[0]->getInstructionAddress(), + macroOp[0]->getBranchType(), + macroOp[0]->getKnownOffset()); + branchesFetched_++; } // Create map element for new fetch @@ -72,9 +88,9 @@ void FetchUnit::tick() { return; } - // Pointer to the instruction data to decode from + // Const pointer to the instruction data to decode from const uint8_t* buffer; - uint8_t bufferOffset; + uint16_t bufferOffset; // Check if more instruction data is required if (bufferedBytes_ < isa_.getMaxInstructionSize()) { @@ -100,28 +116,39 @@ void FetchUnit::tick() { break; } } - if (fetchIndex == fetched.size()) { - // Need to wait for fetched instructions + // Decide how to progress based on status of fetched data and buffer. Allow + // progression if minimal data is in the buffer no matter state of fetched + // data + if (fetchIndex == fetched.size() && + bufferedBytes_ < isa_.getMinInstructionSize()) { // Stalled.fetch.instructionFetch probeTrace newProbe = {0, trace_cycle, 0}; Trace* newTrace = new Trace; newTrace->setProbeTraces(newProbe); probeList.push_back(newTrace); + // Relevant data has not been fetched and not enough data already in the + // buffer. Need to wait for fetched instructions return; + } else if (fetchIndex != fetched.size()) { + // Data has been successfully read, move into fetch buffer + // TODO: Handle memory faults + assert(fetched[fetchIndex].data && "Memory read failed"); + const uint8_t* fetchData = + fetched[fetchIndex].data.getAsVector(); + + // Copy fetched data to fetch buffer after existing data + std::memcpy(fetchBuffer_ + bufferedBytes_, fetchData + bufferOffset, + blockSize_ - bufferOffset); + + bufferedBytes_ += blockSize_ - bufferOffset; + buffer = fetchBuffer_; + // Decoding should start from the beginning of the fetchBuffer_. + bufferOffset = 0; + } else { + // There is already enough data in the fetch buffer, so use that + buffer = fetchBuffer_; + bufferOffset = 0; } - - // TODO: Handle memory faults - assert(fetched[fetchIndex].data && "Memory read failed"); - const uint8_t* fetchData = fetched[fetchIndex].data.getAsVector(); - - // Copy fetched data to fetch buffer after existing data - std::memcpy(fetchBuffer_ + bufferedBytes_, fetchData + bufferOffset, - blockSize_ - bufferOffset); - - bufferedBytes_ += blockSize_ - bufferOffset; - buffer = fetchBuffer_; - // Decoding should start from the beginning of the fetchBuffer_. 
- bufferOffset = 0; } else { // There is already enough data in the fetch buffer, so use that buffer = fetchBuffer_; @@ -129,7 +156,7 @@ void FetchUnit::tick() { } // Check we have enough data to begin decoding - if (bufferedBytes_ < isa_.getMaxInstructionSize()) { + if (bufferedBytes_ < isa_.getMinInstructionSize()) { // Stalled.fetch.instructionDecode probeTrace newProbe = {1, trace_cycle, 0}; Trace* newTrace = new Trace; @@ -153,12 +180,13 @@ void FetchUnit::tick() { break; } - // Create branch prediction after identifing instruction type + // Create branch prediction after identifying instruction type // (e.g. RET, BL, etc). BranchPrediction prediction = {false, 0}; if (macroOp[0]->isBranch()) { prediction = branchPredictor_.predict(pc_, macroOp[0]->getBranchType(), macroOp[0]->getKnownOffset()); + branchesFetched_++; macroOp[0]->setBranchPrediction(prediction); } @@ -185,7 +213,7 @@ void FetchUnit::tick() { if (pc_ == loopBoundaryAddress_) { if (macroOp[0]->isBranch() && - !macroOp[0]->getBranchPrediction().taken) { + !macroOp[0]->getBranchPrediction().isTaken) { // loopBoundaryAddress_ has been fetched whilst filling the loop // buffer BUT this is a branch, predicted to branch out of the loop // being buffered. Stop filling the loop buffer and don't supply to @@ -202,8 +230,18 @@ void FetchUnit::tick() { } } else if (loopBufferState_ == LoopBufferState::WAITING && pc_ == loopBoundaryAddress_) { - // Once set loopBoundaryAddress_ is fetched, start to fill loop buffer - loopBufferState_ = LoopBufferState::FILLING; + // loopBoundaryAddress_ has been fetched whilst loop buffer is waiting, + // start filling Loop Buffer if the branch predictor tells us to + // reenter the detected loop + if (macroOp[0]->isBranch() && + !macroOp[0]->getBranchPrediction().isTaken) { + // If branch is not taken then we aren't re-entering the detected + // loop, therefore Loop Buffer stays idle + loopBufferState_ = LoopBufferState::IDLE; + } else { + // Otherwise, start to fill Loop Buffer + loopBufferState_ = LoopBufferState::FILLING; + } } assert(bytesRead <= bufferedBytes_ && @@ -213,12 +251,12 @@ void FetchUnit::tick() { bufferOffset += bytesRead; bufferedBytes_ -= bytesRead; - if (!prediction.taken) { - // Predicted as not taken; increment PC to next instruction - pc_ += bytesRead; - } else { + if (prediction.isTaken) { // Predicted as taken; set PC to predicted target address pc_ = prediction.target; + } else { + // Predicted as not taken; increment PC to next instruction + pc_ += bytesRead; } if (pc_ >= programByteLength_) { @@ -232,7 +270,7 @@ void FetchUnit::tick() { break; } - if (prediction.taken) { + if (prediction.isTaken) { if (slot + 1 < output_.getWidth()) { // Branch.fetch.stalled probeTrace newProbe = {12, trace_cycle, macroOp[0]->getTraceId()}; @@ -278,6 +316,9 @@ void FetchUnit::updatePC(uint64_t address) { } void FetchUnit::requestFromPC() { + // Do nothing if supplying fetch stream from loop buffer + if (loopBufferState_ == LoopBufferState::SUPPLYING) return; + // Do nothing if buffer already contains enough data if (bufferedBytes_ >= isa_.getMaxInstructionSize()) return; @@ -306,5 +347,7 @@ void FetchUnit::flushLoopBuffer() { loopBoundaryAddress_ = 0; } +uint64_t FetchUnit::getBranchFetchedCount() const { return branchesFetched_; } + } // namespace pipeline } // namespace simeng diff --git a/src/lib/pipeline/LoadStoreQueue.cc b/src/lib/pipeline/LoadStoreQueue.cc index 2cc82507a9..ecaa45ef46 100644 --- a/src/lib/pipeline/LoadStoreQueue.cc +++ b/src/lib/pipeline/LoadStoreQueue.cc @@ 
-10,21 +10,24 @@ namespace simeng { namespace pipeline { /** Check whether requests `a` and `b` overlap. */ -bool requestsOverlap(MemoryAccessTarget a, MemoryAccessTarget b) { +bool requestsOverlap(memory::MemoryAccessTarget a, + memory::MemoryAccessTarget b) { // Check whether one region ends before the other begins, implying no overlap, // and negate return !(a.address + a.size <= b.address || b.address + b.size <= a.address); } LoadStoreQueue::LoadStoreQueue( - unsigned int maxCombinedSpace, MemoryInterface& memory, + unsigned int maxCombinedSpace, memory::MemoryInterface& memory, span>> completionSlots, std::function, span)> forwardOperands, + std::function&)> raiseException, bool exclusive, uint16_t loadBandwidth, uint16_t storeBandwidth, uint16_t permittedRequests, uint16_t permittedLoads, uint16_t permittedStores) : completionSlots_(completionSlots), forwardOperands_(forwardOperands), + raiseException_(raiseException), maxCombinedSpace_(maxCombinedSpace), combined_(true), memory_(memory), @@ -33,18 +36,20 @@ LoadStoreQueue::LoadStoreQueue( storeBandwidth_(storeBandwidth), totalLimit_(permittedRequests), // Set per-cycle limits for each request type - reqLimits_{permittedLoads, permittedStores} {}; + reqLimits_{permittedLoads, permittedStores} {} LoadStoreQueue::LoadStoreQueue( unsigned int maxLoadQueueSpace, unsigned int maxStoreQueueSpace, - MemoryInterface& memory, + memory::MemoryInterface& memory, span>> completionSlots, std::function, span)> forwardOperands, + std::function&)> raiseException, bool exclusive, uint16_t loadBandwidth, uint16_t storeBandwidth, uint16_t permittedRequests, uint16_t permittedLoads, uint16_t permittedStores) : completionSlots_(completionSlots), forwardOperands_(forwardOperands), + raiseException_(raiseException), maxLoadQueueSpace_(maxLoadQueueSpace), maxStoreQueueSpace_(maxStoreQueueSpace), combined_(false), @@ -54,7 +59,7 @@ LoadStoreQueue::LoadStoreQueue( storeBandwidth_(storeBandwidth), totalLimit_(permittedRequests), // Set per-cycle limits for each request type - reqLimits_{permittedLoads, permittedStores} {}; + reqLimits_{permittedLoads, permittedStores} {} unsigned int LoadStoreQueue::getLoadQueueSpace() const { if (combined_) { @@ -100,6 +105,13 @@ void LoadStoreQueue::startLoad(const std::shared_ptr& insn) { if (ld_addresses.size() == 0) { // Early execution if not addresses need to be accessed insn->execute(); + + if (insn->exceptionEncountered()) { + // Exception; don't pass insn to completedLoads_ + raiseException_(insn); + return; + } + completedLoads_.push(insn); } else { // Create a speculative entry for the load @@ -110,9 +122,9 @@ void LoadStoreQueue::startLoad(const std::shared_ptr& insn) { .back() .reqAddresses; // Store load addresses temporarily so that conflictions are - // only regsitered once on most recent (program order) store - std::list temp_load_addr(ld_addresses.begin(), - ld_addresses.end()); + // only registered once on most recent (program order) store + std::list temp_load_addr( + ld_addresses.begin(), ld_addresses.end()); // Detect reordering conflicts if (storeQueue_.size() > 0) { @@ -174,7 +186,7 @@ void LoadStoreQueue::supplyStoreData(const std::shared_ptr& insn) { while (itSt != storeQueue_.end()) { auto& entry = itSt->first; // Pair entry and incoming store data operation with macroOp identifier and - // microOp index value pre-detemined in microDecoder + // microOp index value pre-determined in microDecoder if (entry->getInstructionId() == macroOpNum && entry->getMicroOpIndex() == microOpNum) { // Supply data 
to be stored by operations @@ -385,7 +397,7 @@ void LoadStoreQueue::tick() { // requests per cycle // Index 0: loads, index 1: stores std::array reqCounts = {0, 0}; - std::array dataTransfered = {0, 0}; + std::array dataTransferred = {0, 0}; std::array exceededLimits = {false, false}; auto itLoad = requestLoadQueue_.begin(); auto itStore = requestStoreQueue_.begin(); @@ -433,55 +445,57 @@ void LoadStoreQueue::tick() { // Iterate over requests ready this cycle while (itInsn != itReq->second.end()) { - // Speculatively increment count of this request type - reqCounts[isStore]++; - - // Ensure the limit on the number of permitted operations is adhered - // to - if (reqCounts[isStore] + reqCounts[!isStore] > totalLimit_) { - // No more requests can be scheduled this cycle - exceededLimits = {true, true}; - break; - } else if (reqCounts[isStore] > reqLimits_[isStore]) { - // No more requests of this type can be scheduled this cycle - exceededLimits[isStore] = true; - // Remove speculative increment to ensure it doesn't count for - // comparisons aginast the totalLimit_ - reqCounts[isStore]--; - break; - } else { - // Schedule requests from the queue of addresses in - // request[Load|Store]Queue_ entry - auto& addressQueue = itInsn->reqAddresses; - while (addressQueue.size()) { - const simeng::MemoryAccessTarget req = addressQueue.front(); - - // Ensure the limit on the data transfered per cycle is adhered to - assert(req.size <= bandwidth && - "Individual memory request from LoadStoreQueue exceeds L1 " - "bandwidth set and thus will never be submitted"); - dataTransfered[isStore] += req.size; - if (dataTransfered[isStore] > bandwidth) { - // No more requests can be scheduled this cycle - exceededLimits[isStore] = true; - itInsn = itReq->second.end(); - break; - } - - // Request a read from the memory interface if the requestQueue_ - // entry represents a read - if (!isStore) { - memory_.requestRead(req, itInsn->insn->getSequenceId()); - } + // Schedule requests from the queue of addresses in + // request[Load|Store]Queue_ entry + auto& addressQueue = itInsn->reqAddresses; + while (addressQueue.size()) { + const simeng::memory::MemoryAccessTarget req = + addressQueue.front(); // Speculatively increment count of this + // request type + reqCounts[isStore]++; + + // Ensure the limit on the number of permitted operations is adhered + // to + if (reqCounts[isStore] + reqCounts[!isStore] > totalLimit_) { + // No more requests can be scheduled this cycle + exceededLimits = {true, true}; + itInsn = itReq->second.end(); + break; + } else if (reqCounts[isStore] > reqLimits_[isStore]) { + // No more requests of this type can be scheduled this cycle + exceededLimits[isStore] = true; + // Remove speculative increment to ensure it doesn't count for + // comparisons against the totalLimit_ + reqCounts[isStore]--; + itInsn = itReq->second.end(); + break; + } - // Remove processed address from queue - addressQueue.pop(); + // Ensure the limit on the data transferred per cycle is adhered to + assert(req.size <= bandwidth && + "Individual memory request from LoadStoreQueue exceeds L1 " + "bandwidth set and thus will never be submitted"); + dataTransferred[isStore] += req.size; + if (dataTransferred[isStore] > bandwidth) { + // No more requests can be scheduled this cycle + exceededLimits[isStore] = true; + itInsn = itReq->second.end(); + break; } - // Remove entry from vector iff all of its requests have been - // scheduled - if (addressQueue.size() == 0) { - itInsn = itReq->second.erase(itInsn); + + // Request a 
read from the memory interface if the requestQueue_ + // entry represents a read + if (!isStore) { + memory_.requestRead(req, itInsn->insn->getSequenceId()); } + + // Remove processed address from queue + addressQueue.pop(); + } + // Remove entry from vector if all of its requests have been + // scheduled + if (addressQueue.size() == 0) { + itInsn = itReq->second.erase(itInsn); } } @@ -518,6 +532,13 @@ void LoadStoreQueue::tick() { if (load->hasAllData()) { // This load has completed load->execute(); + + if (load->exceptionEncountered()) { + // Exception; don't pass load to completedLoads_ + raiseException_(load); + continue; + } + if (load->isStoreData()) { supplyStoreData(load); } diff --git a/src/lib/pipeline/M1PortAllocator.cc b/src/lib/pipeline/M1PortAllocator.cc index a8705d9ba7..5d26b6d550 100644 --- a/src/lib/pipeline/M1PortAllocator.cc +++ b/src/lib/pipeline/M1PortAllocator.cc @@ -9,7 +9,7 @@ namespace pipeline { M1PortAllocator::M1PortAllocator( const std::vector>& portArrangement, - std::vector> rsArrangement) + std::vector> rsArrangement) : weights(portArrangement.size(), 0), rsArrangement_(rsArrangement) {} uint16_t M1PortAllocator::allocate(const std::vector& ports) { @@ -20,11 +20,12 @@ uint16_t M1PortAllocator::allocate(const std::vector& ports) { uint16_t bestWeight = 0xFFFF; uint16_t bestRSQueueSize = 0xFFFF; - bool foundRS = false; + // Only used in assertions so produces warning in release mode + [[maybe_unused]] bool foundRS = false; - // Update the the reference for number of free spaces in the reservation + // Update the reference for number of free spaces in the reservation // stations - std::vector rsFreeSpaces; + rsFreeSpaces.clear(); rsSizes_(rsFreeSpaces); for (const auto& portIndex : ports) { @@ -58,10 +59,10 @@ void M1PortAllocator::issued(uint16_t port) { weights[port]--; } -void M1PortAllocator::deallocate(uint16_t port) { issued(port); }; +void M1PortAllocator::deallocate(uint16_t port) { issued(port); } void M1PortAllocator::setRSSizeGetter( - std::function&)> rsSizes) { + std::function&)> rsSizes) { rsSizes_ = rsSizes; } diff --git a/src/lib/pipeline/RegisterAliasTable.cc b/src/lib/pipeline/RegisterAliasTable.cc index 935c65292a..2a67585831 100644 --- a/src/lib/pipeline/RegisterAliasTable.cc +++ b/src/lib/pipeline/RegisterAliasTable.cc @@ -39,7 +39,7 @@ RegisterAliasTable::RegisterAliasTable( historyTable_[type].resize(physCount); destinationTable_[type].resize(physCount); } -}; +} Register RegisterAliasTable::getMapping(Register architectural) const { // Asserts to ensure mapping isn't attempted for an out-of-bound index (i.e. @@ -50,7 +50,7 @@ Register RegisterAliasTable::getMapping(Register architectural) const { "Invalid register type. 
Cannot find RAT mapping."); auto tag = mappingTable_[architectural.type][architectural.tag]; - return {architectural.type, tag}; + return {architectural.type, tag, true}; } bool RegisterAliasTable::canAllocate(uint8_t type, @@ -84,7 +84,7 @@ Register RegisterAliasTable::allocate(Register architectural) { mappingTable_[architectural.type][architectural.tag] = tag; destinationTable_[architectural.type][tag] = architectural.tag; - return {architectural.type, tag}; + return {architectural.type, tag, true}; } void RegisterAliasTable::commit(Register physical) { @@ -93,7 +93,11 @@ void RegisterAliasTable::commit(Register physical) { auto oldTag = historyTable_[physical.type][physical.tag]; freeQueues_[physical.type].push(oldTag); } + void RegisterAliasTable::rewind(Register physical) { + assert(physical.renamed && + "Attempted to rewind a physical register which hasn't been subject to " + "the register renaming scheme"); // Find which architectural tag this referred to auto destinationTag = destinationTable_[physical.type][physical.tag]; // Rewind the mapping table to the old physical tag @@ -102,9 +106,6 @@ void RegisterAliasTable::rewind(Register physical) { // Add the rewound physical tag back to the free queue freeQueues_[physical.type].push(physical.tag); } -void RegisterAliasTable::free(Register physical) { - freeQueues_[physical.type].push(physical.tag); -} } // namespace pipeline } // namespace simeng diff --git a/src/lib/pipeline/RenameUnit.cc b/src/lib/pipeline/RenameUnit.cc index b530e095b0..4459bbf18d 100644 --- a/src/lib/pipeline/RenameUnit.cc +++ b/src/lib/pipeline/RenameUnit.cc @@ -137,7 +137,7 @@ void RenameUnit::tick() { } // Allocate source registers - auto& sourceRegisters = uop->getOperandRegisters(); + auto& sourceRegisters = uop->getSourceRegisters(); for (size_t i = 0; i < sourceRegisters.size(); i++) { const auto& reg = sourceRegisters[i]; if (!uop->isOperandReady(i)) { diff --git a/src/lib/pipeline/ReorderBuffer.cc b/src/lib/pipeline/ReorderBuffer.cc index b9f0c181da..cb459a37a3 100644 --- a/src/lib/pipeline/ReorderBuffer.cc +++ b/src/lib/pipeline/ReorderBuffer.cc @@ -8,7 +8,7 @@ namespace simeng { namespace pipeline { ReorderBuffer::ReorderBuffer( - unsigned int maxSize, RegisterAliasTable& rat, LoadStoreQueue& lsq, + uint32_t maxSize, RegisterAliasTable& rat, LoadStoreQueue& lsq, std::function&)> raiseException, std::function sendLoopBoundary, BranchPredictor& predictor, uint16_t loopBufSize, @@ -36,18 +36,20 @@ void ReorderBuffer::reserve(const std::shared_ptr& insn) { void ReorderBuffer::commitMicroOps(uint64_t insnId) { if (buffer_.size()) { size_t index = 0; - int firstOp = -1; + uint64_t firstOp = UINT64_MAX; bool validForCommit = false; + bool foundFirstInstance = false; // Find first instance of uop belonging to macro-op instruction for (; index < buffer_.size(); index++) { if (buffer_[index]->getInstructionId() == insnId) { firstOp = index; + foundFirstInstance = true; break; } } - if (firstOp > -1) { + if (foundFirstInstance) { // If found, see if all uops are committable for (; index < buffer_.size(); index++) { if (buffer_[index]->getInstructionId() != insnId) break; @@ -60,6 +62,7 @@ void ReorderBuffer::commitMicroOps(uint64_t insnId) { } if (!validForCommit) return; + assert(firstOp != UINT64_MAX && "firstOp hasn't been populated"); // No early return thus all uops are committable for (; firstOp < buffer_.size(); firstOp++) { if (buffer_[firstOp]->getInstructionId() != insnId) break; @@ -70,7 +73,7 @@ void ReorderBuffer::commitMicroOps(uint64_t insnId) { 
return; } -unsigned int ReorderBuffer::commit(unsigned int maxCommitSize) { +unsigned int ReorderBuffer::commit(uint64_t maxCommitSize) { shouldFlush_ = false; size_t maxCommits = std::min(static_cast(maxCommitSize), buffer_.size()); @@ -120,7 +123,7 @@ unsigned int ReorderBuffer::commit(unsigned int maxCommitSize) { } const auto& destinations = uop->getDestinationRegisters(); - for (int i = 0; i < destinations.size(); i++) { + for (size_t i = 0; i < destinations.size(); i++) { rat_.commit(destinations[i]); } @@ -188,18 +191,30 @@ unsigned int ReorderBuffer::commit(unsigned int maxCommitSize) { 0}; } } + + // If it is a branch, now update the predictor (here to ensure order of + // updates is correct) + if (uop->isBranch()) { + predictor_.update(uop->getInstructionAddress(), uop->wasBranchTaken(), + uop->getBranchAddress(), uop->getBranchType(), + uop->getInstructionId()); + // Update the branches retired and mispredicted counters + retiredBranches_++; + if (uop->wasBranchMispredicted()) branchMispredicts_++; + } + buffer_.pop_front(); } return n; } -void ReorderBuffer::flush(uint64_t afterSeqId) { +void ReorderBuffer::flush(uint64_t afterInsnId) { // Iterate backwards from the tail of the queue to find and remove ops newer - // than `afterSeqId` + // than `afterInsnId` while (!buffer_.empty()) { auto& uop = buffer_.back(); - if (uop->getInstructionId() <= afterSeqId) { + if (uop->getInstructionId() <= afterInsnId) { break; } @@ -208,7 +223,8 @@ void ReorderBuffer::flush(uint64_t afterSeqId) { auto destinations = uop->getDestinationRegisters(); for (int i = destinations.size() - 1; i >= 0; i--) { const auto& reg = destinations[i]; - rat_.rewind(reg); + // Only rewind the register if it was renamed + if (reg.renamed) rat_.rewind(reg); } uop->setFlushed(); // If the instruction is a branch, supply address to branch flushing logic @@ -242,7 +258,7 @@ unsigned int ReorderBuffer::getFreeSpace() const { bool ReorderBuffer::shouldFlush() const { return shouldFlush_; } uint64_t ReorderBuffer::getFlushAddress() const { return pc_; } -uint64_t ReorderBuffer::getFlushSeqId() const { return flushAfter_; } +uint64_t ReorderBuffer::getFlushInsnId() const { return flushAfter_; } uint64_t ReorderBuffer::getInstructionsCommittedCount() const { return instructionsCommitted_; @@ -252,5 +268,12 @@ uint64_t ReorderBuffer::getViolatingLoadsCount() const { return loadViolations_; } +uint64_t ReorderBuffer::getBranchMispredictedCount() const { + return branchMispredicts_; +} + +uint64_t ReorderBuffer::getRetiredBranchesCount() const { + return retiredBranches_; +} } // namespace pipeline } // namespace simeng diff --git a/src/lib/trace.cc b/src/lib/trace.cc index 8702ec52d1..2930299d1a 100644 --- a/src/lib/trace.cc +++ b/src/lib/trace.cc @@ -12,17 +12,18 @@ int Trace::writeCycleOut(char* str, uint64_t traceId, std::string model) { // char buffer[1000]; // If the model is an o3 pipeline if (model == std::string("outoforder")) { - sprintf(str, - "%" PRId64 ":%" PRId64 ":%" PRId64 ":%" PRId64 ":%" PRId64 - ":%" PRId64 ":%" PRId64 ":0x%02X:%d:%" PRId64 ":%s\n", - fetch.cycle, element.decode, element.rename, element.dispatch, - element.issue, element.complete, element.retire, fetch.address, - fetch.microOpNum, traceId, fetch.disasm.c_str()); + snprintf(str, 4096, + "%" PRId64 ":%" PRId64 ":%" PRId64 ":%" PRId64 ":%" PRId64 + ":%" PRId64 ":%" PRId64 ":0x%02llX:%d:%" PRId64 ":%s\n", + fetch.cycle, element.decode, element.rename, element.dispatch, + element.issue, element.complete, element.retire, fetch.address, + 
fetch.microOpNum, traceId, fetch.disasm.c_str()); } else { - sprintf(str, - "%" PRId64 ":%" PRId64 ":%" PRId64 ":%#010x:%d:%" PRId64 ":%s\n", - fetch.cycle, element.decode, element.complete, fetch.address, - fetch.microOpNum, traceId, fetch.disasm.c_str()); + snprintf(str, 4096, + "%" PRId64 ":%" PRId64 ":%" PRId64 ":%#010llx:%d:%" PRId64 + ":%s\n", + fetch.cycle, element.decode, element.complete, fetch.address, + fetch.microOpNum, traceId, fetch.disasm.c_str()); } // Kept so we can print for gem5 and compare visualisers // if(model == std::string("outoforder")){ @@ -51,11 +52,14 @@ int Trace::writeCycleOut(char* str, uint64_t traceId, std::string model) { int Trace::writeProbeOut(char* str, uint64_t index, int newline, int start) { if (!start) { if (newline) - sprintf(str, "\n%d,%" PRId64 "", probeTrace_.event, probeTrace_.insn_num); + snprintf(str, 4096, "\n%d,%" PRId64 "", probeTrace_.event, + probeTrace_.insn_num); else - sprintf(str, ":%d,%" PRId64 "", probeTrace_.event, probeTrace_.insn_num); + snprintf(str, 4096, ":%d,%" PRId64 "", probeTrace_.event, + probeTrace_.insn_num); } else { - sprintf(str, "%d,%" PRId64 "", probeTrace_.event, probeTrace_.insn_num); + snprintf(str, 4096, "%d,%" PRId64 "", probeTrace_.event, + probeTrace_.insn_num); } int val = 1; return val; diff --git a/src/tools/simeng/CMakeLists.txt b/src/tools/simeng/CMakeLists.txt index a274b18a23..c8de0d34a9 100644 --- a/src/tools/simeng/CMakeLists.txt +++ b/src/tools/simeng/CMakeLists.txt @@ -7,6 +7,6 @@ if( YAML_OUTPUT ) endif() target_include_directories(simeng PUBLIC ${PROJECT_SOURCE_DIR}/src/lib) -target_link_libraries(simeng libsimeng yaml-cpp) +target_link_libraries(simeng libsimeng) -install(TARGETS simeng DESTINATION bin) +install(TARGETS simeng DESTINATION bin) \ No newline at end of file diff --git a/src/tools/simeng/main.cc b/src/tools/simeng/main.cc index 824c724c11..ef642711e8 100644 --- a/src/tools/simeng/main.cc +++ b/src/tools/simeng/main.cc @@ -7,13 +7,15 @@ #include "simeng/Core.hh" #include "simeng/CoreInstance.hh" -#include "simeng/MemoryInterface.hh" +#include "simeng/config/SimInfo.hh" +#include "simeng/memory/MemoryInterface.hh" #include "simeng/version.hh" /** Tick the provided core model until it halts. */ -int simulate(simeng::Core& core, simeng::MemoryInterface& dataMemory, - simeng::MemoryInterface& instructionMemory, - std::ofstream* traceOut, std::ofstream* probeOut) { +uint64_t simulate(simeng::Core& core, + simeng::memory::MemoryInterface& dataMemory, + simeng::memory::MemoryInterface& instructionMemory, + std::ofstream* traceOut, std::ofstream* probeOut) { uint64_t iterations = 0; int probeIndex = 1; @@ -126,7 +128,9 @@ int main(int argc, char** argv) { // Determine if a config file has been supplied. if (argc > 1) { - configFilePath = std::string(argv[1]); + // Set the global config file to one at the file path defined. 
+ simeng::config::SimInfo::setConfig(argv[1]); + // Determine if an executable has been supplied if (argc > 2) { executablePath = std::string(argv[2]); @@ -136,20 +140,24 @@ int main(int argc, char** argv) { int numberofArgs = argc - 3; executableArgs = std::vector(startOfArgs, startOfArgs + numberofArgs); + } else { + // Use the default program if not + configFilePath = DEFAULT_STR; + executablePath = SIMENG_SOURCE_DIR "/SimEngDefaultProgram"; } - coreInstance = std::make_unique( - configFilePath, executablePath, executableArgs); } else { // Without a config file, no executable can be supplied so pass default - // (empty) values for executable information - coreInstance = - std::make_unique(executablePath, executableArgs); - configFilePath = "Default"; + // values for executable information + configFilePath = DEFAULT_STR; + executablePath = SIMENG_SOURCE_DIR "/SimEngDefaultProgram"; } + coreInstance = + std::make_unique(executablePath, executableArgs); + // Replace empty executablePath string with more useful content for // outputting - if (executablePath == "") executablePath = "Default"; + if (executablePath == "") executablePath = DEFAULT_STR; // Initialise trace/probe objects std::ofstream traceOut; @@ -159,24 +167,42 @@ int main(int argc, char** argv) { // Get simulation objects needed to forward simulation std::shared_ptr core = coreInstance->getCore(); - std::shared_ptr dataMemory = + std::shared_ptr dataMemory = coreInstance->getDataMemory(); - std::shared_ptr instructionMemory = + std::shared_ptr instructionMemory = coreInstance->getInstructionMemory(); // Output general simulation details - std::cout << "[SimEng] Running in " << coreInstance->getSimulationModeString() - << " mode" << std::endl; + std::cout << "[SimEng] Running in " + << simeng::config::SimInfo::getSimModeStr() << " mode" << std::endl; std::cout << "[SimEng] Workload: " << executablePath; for (const auto& arg : executableArgs) std::cout << " " << arg; std::cout << std::endl; - std::cout << "[SimEng] Config file: " << configFilePath << std::endl; + std::cout << "[SimEng] Config file: " + << simeng::config::SimInfo::getConfigPath() << std::endl; + std::cout << "[SimEng] ISA: " << simeng::config::SimInfo::getISAString() + << std::endl; + std::cout << "[SimEng] Auto-generated Special File directory: "; + if (simeng::config::SimInfo::getGenSpecFiles()) + std::cout << "True"; + else + std::cout << "False"; + std::cout << std::endl; + std::cout << "[SimEng] Special File directory used: " + << simeng::config::SimInfo::getConfig()["CPU-Info"] + ["Special-File-Dir-Path"] + .as() + << std::endl; + std::cout << "[SimEng] Number of Cores: " + << simeng::config::SimInfo::getConfig()["CPU-Info"]["Core-Count"] + .as() + << std::endl; std::cout << "Tracing enabled\n"; std::cout << "Probing enabled\n"; // Run simulation std::cout << "[SimEng] Starting...\n" << std::endl; - int iterations = 0; + uint64_t iterations = 0; auto startTime = std::chrono::high_resolution_clock::now(); iterations = simulate(*core, *dataMemory, *instructionMemory, &traceOut, &probeOut); @@ -206,27 +232,36 @@ int main(int argc, char** argv) { // of YAML formatted data. 
#ifdef YAML_OUTPUT - YAML::Emitter out; - out << YAML::BeginDoc << YAML::BeginMap; - out << YAML::Key << "build metadata" << YAML::Value; - out << YAML::BeginSeq; - out << "Version: " SIMENG_VERSION; - out << "Compile Time - Date: " __TIME__ " - " __DATE__; - out << "Build type: " SIMENG_BUILD_TYPE; - out << "Compile options: " SIMENG_COMPILE_OPTIONS; - out << "Test suite: " SIMENG_ENABLE_TESTS; - out << YAML::EndSeq; + ryml::Tree out; + ryml::NodeRef ref = out.rootref(); + ref |= ryml::MAP; + ref.append_child() << ryml::key("build metadata"); + ref["build metadata"] |= ryml::SEQ; + ref["build metadata"].append_child(); + ref["build metadata"][0] << "Version: " SIMENG_VERSION; + ref["build metadata"].append_child(); + ref["build metadata"][1] << "Compile Time - Date: " __TIME__ " - " __DATE__; + ref["build metadata"].append_child(); + ref["build metadata"][2] << "Build type: " SIMENG_BUILD_TYPE; + ref["build metadata"].append_child(); + ref["build metadata"][3] << "Compile options: " SIMENG_COMPILE_OPTIONS; + ref["build metadata"].append_child(); + ref["build metadata"][4] << "Test suite: " SIMENG_ENABLE_TESTS; for (const auto& [key, value] : stats) { - out << YAML::Key << key << YAML::Value << value; + ref.append_child() << ryml::key(key); + ref[ryml::to_csubstr(key)] << value; } - out << YAML::Key << "duration" << YAML::Value << duration; - out << YAML::Key << "mips" << YAML::Value << mips; - out << YAML::Key << "cycles_per_sec" << YAML::Value - << std::stod(stats["cycles"]) / (duration / 1000.0); - out << YAML::EndMap << YAML::EndDoc; + ref.append_child() << ryml::key("duration"); + ref["duration"] << duration; + ref.append_child() << ryml::key("mips"); + ref["mips"] << mips; + ref.append_child() << ryml::key("cycles_per_sec"); + ref["cycles_per_sec"] << std::stod(stats["cycles"]) / (duration / 1000.0); std::cout << "YAML-SEQ\n"; - std::cout << out.c_str() << std::endl; + std::cout << "---\n"; + std::cout << ryml::emitrs_yaml(out); + std::cout << "...\n\n"; #endif diff --git a/sst/Assemble.cc b/sst/Assemble.cc index 57371a9548..796e311ddb 100644 --- a/sst/Assemble.cc +++ b/sst/Assemble.cc @@ -195,5 +195,5 @@ void Assembler::assemble(const char* source, const char* triple) { } #endif -char* Assembler::getAssembledSource() { return reinterpret_cast(code_); } +uint8_t* Assembler::getAssembledSource() { return code_; } size_t Assembler::getAssembledSourceSize() { return codeSize_; } diff --git a/sst/CMakeLists.txt b/sst/CMakeLists.txt index a6603dac3b..9c97e1375f 100644 --- a/sst/CMakeLists.txt +++ b/sst/CMakeLists.txt @@ -6,7 +6,7 @@ target_include_directories(sstsimeng PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) target_include_directories(sstsimeng PUBLIC ${SST_INSTALL_DIR}/include) # Including libsimeng.so target_include_directories(sstsimeng PUBLIC ${PROJECT_SOURCE_DIR}/src/lib) -# Including capstone and yaml +# Including capstone target_include_directories(sstsimeng PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_include_directories(sstsimeng PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) @@ -28,7 +28,7 @@ target_compile_definitions(sstsimeng PUBLIC SIMENG_ENABLE_SST __STDC_FORMAT_MACR # The commmand had to be replaced as some extra flags added using target_include_directories target_link_options(sstsimeng PUBLIC -fno-common LINKER:-undefined,dynamic_lookup) -# Linking yaml and libsimeng.so libraries to sstsimeng.so +# Linking libsimeng.so libraries to sstsimeng.so target_link_libraries(sstsimeng PUBLIC libsimeng) if (SIMENG_ENABLE_TESTS) diff --git a/sst/SimEngCoreWrapper.cc 
b/sst/SimEngCoreWrapper.cc index 1c2fe4ff0f..f1848fa803 100644 --- a/sst/SimEngCoreWrapper.cc +++ b/sst/SimEngCoreWrapper.cc @@ -45,6 +45,14 @@ SimEngCoreWrapper::SimEngCoreWrapper(SST::ComponentId_t id, SST::Params& params) iterations_ = 0; + probeIndex = 1; + probeCycle = 0; + start = 1; + traceWriteOut = ""; + traceStr = (char*)malloc(1000 * sizeof(char)); + probeWriteOut = ""; + probeStr = (char*)malloc(5 * sizeof(char)); + // Instantiate the StandardMem Interface defined in config.py sstMem_ = loadUserSubComponent( "memory", ComponentInfo::SHARE_NONE, clock_, @@ -64,6 +72,10 @@ SimEngCoreWrapper::SimEngCoreWrapper(SST::ComponentId_t id, SST::Params& params) SimEngCoreWrapper::~SimEngCoreWrapper() {} void SimEngCoreWrapper::setup() { + // Initialise trace/probe objects + traceOut.open("trace.out", std::ofstream::binary | std::ofstream::trunc); + probeOut.open("probe.out", std::ofstream::binary | std::ofstream::trunc); + sstMem_->setup(); output_.verbose(CALL_INFO, 1, 0, "Memory setup complete\n"); // Run Simulation @@ -98,6 +110,9 @@ void SimEngCoreWrapper::finish() { std::cout << "\n[SimEng] Finished " << iterations_ << " ticks in " << duration << "ms (" << std::round(khz) << " kHz, " << std::setprecision(2) << mips << " MIPS)" << std::endl; + + traceOut.close(); + probeOut.close(); } void SimEngCoreWrapper::init(unsigned int phase) { @@ -120,10 +135,67 @@ bool SimEngCoreWrapper::clockTick(SST::Cycle_t current_cycle) { // Tick the instruction memory. instructionMemory_->tick(); + // Write out trace data + std::map::iterator itM = traceMap.begin(); + // loop through tracing map and write out the finished instructions + while (itM != traceMap.end()) { + int success = + itM->second->writeCycleOut(traceStr, itM->first, "outoforder"); + // If written out remove instruction from map + if (success) { + delete itM->second; + itM = traceMap.erase(itM); + traceWriteOut += traceStr; + if (traceWriteOut.length() > 8196) { + traceOut << traceWriteOut; + traceWriteOut = ""; + } + } else + break; + } + // Write out probe data + std::list::iterator itL = probeList.begin(); + int newline = 0; + while (itL != probeList.end()) { + simeng::probeTrace pt = (*itL)->getProbeTraces(); + if (pt.cycle == probeCycle) + newline = 0; + else { + newline = 1; + for (uint64_t i = 0; i < std::min((pt.cycle - probeCycle - 1), + static_cast(0)); + i++) { + probeWriteOut += "\n-"; + } + probeCycle = pt.cycle; + } + int success = (*itL)->writeProbeOut(probeStr, probeIndex, newline, start); + // Increment probe counter + probeIndex++; + // If written out remove probe from list + if (success) { + start = 0; + delete (*itL); + itL = probeList.erase(itL); + probeWriteOut += probeStr; + if (probeWriteOut.length() > 8196) { + probeOut << probeWriteOut; + probeWriteOut = ""; + } + } else + itL++; + } iterations_++; + trace_cycle = iterations_; return false; } else { + if (traceWriteOut != "") { + traceOut << traceWriteOut; + } + if (probeWriteOut != "") { + probeOut << probeWriteOut; + } // Protected method from SST::Component used to end SST simulation primaryComponentOKToEndSim(); return true; @@ -257,7 +329,7 @@ void SimEngCoreWrapper::initialiseHeapData() { void SimEngCoreWrapper::fabricateSimEngCore() { output_.verbose(CALL_INFO, 1, 0, "Setting up SimEng Core\n"); - char* assembled_source = NULL; + uint8_t* assembled_source = NULL; size_t assembled_source_size = 0; if (assembleWithSource_) { output_.verbose(CALL_INFO, 1, 0, @@ -267,25 +339,28 @@ void SimEngCoreWrapper::fabricateSimEngCore() { assembled_source_size = 
assemble.getAssembledSourceSize(); } if (simengConfigPath_ != "") { - coreInstance_ = - assembleWithSource_ - ? std::make_unique( - assembled_source, assembled_source_size, simengConfigPath_) - : std::make_unique( - simengConfigPath_, executablePath_, executableArgs_); + // Set the global config file to one at the file path defined + simeng::config::SimInfo::setConfig(simengConfigPath_); + + coreInstance_ = assembleWithSource_ + ? std::make_unique( + assembled_source, assembled_source_size) + : std::make_unique( + executablePath_, executableArgs_); } else { output_.verbose(CALL_INFO, 1, 0, "No SimEng configuration provided. Using the default " "a64fx-sst.yaml configuration file.\n"); - coreInstance_ = - assembleWithSource_ - ? std::make_unique( - assembled_source, assembled_source_size, a64fxConfigPath_) - : std::make_unique( - a64fxConfigPath_, executablePath_, executableArgs_); + // Set the global config file to the default a64fx-sst.yaml file + simeng::config::SimInfo::setConfig(a64fxConfigPath_); + + coreInstance_ = assembleWithSource_ + ? std::make_unique( + assembled_source, assembled_source_size) + : std::make_unique( + executablePath_, executableArgs_); } - if (coreInstance_->getSimulationMode() != - simeng::SimulationMode::OutOfOrder) { + if (config::SimInfo::getSimMode() != config::SimulationMode::Outoforder) { output_.verbose(CALL_INFO, 1, 0, "SimEng currently only supports Out-of-Order " "archetypes with SST."); @@ -335,12 +410,31 @@ void SimEngCoreWrapper::fabricateSimEngCore() { std::cout << "[SimEng] \tTest suite: " SIMENG_ENABLE_TESTS << std::endl; std::cout << std::endl; + // Output general simulation details std::cout << "[SimEng] Running in " - << coreInstance_->getSimulationModeString() << " mode" << std::endl; + << simeng::config::SimInfo::getSimModeStr() << " mode" << std::endl; std::cout << "[SimEng] Workload: " << executablePath_; for (const auto& arg : executableArgs_) std::cout << " " << arg; std::cout << std::endl; - std::cout << "[SimEng] Config file: " << simengConfigPath_ << std::endl; + std::cout << "[SimEng] Config file: " + << simeng::config::SimInfo::getConfigPath() << std::endl; + std::cout << "[SimEng] ISA: " << simeng::config::SimInfo::getISAString() + << std::endl; + std::cout << "[SimEng] Auto-generated Special File directory: "; + if (simeng::config::SimInfo::getGenSpecFiles()) + std::cout << "True"; + else + std::cout << "False"; + std::cout << std::endl; + std::cout << "[SimEng] Special File directory used: " + << simeng::config::SimInfo::getConfig()["CPU-Info"] + ["Special-File-Dir-Path"] + .as() + << std::endl; + std::cout << "[SimEng] Number of Cores: " + << simeng::config::SimInfo::getConfig()["CPU-Info"]["Core-Count"] + .as() + << std::endl; } std::vector SimEngCoreWrapper::splitHeapStr() { diff --git a/sst/SimEngMemInterface.cc b/sst/SimEngMemInterface.cc index 4e07801f21..fd3caa0b7d 100644 --- a/sst/SimEngMemInterface.cc +++ b/sst/SimEngMemInterface.cc @@ -11,7 +11,7 @@ using namespace SST::SSTSimEng; SimEngMemInterface::SimEngMemInterface(StandardMem* mem, uint64_t cl, uint64_t max_addr, bool debug) - : simeng::MemoryInterface() { + : simeng::memory::MemoryInterface() { this->sstMem_ = mem; this->cacheLineWidth_ = cl; this->maxAddrMemory_ = max_addr; @@ -149,7 +149,7 @@ std::vector SimEngMemInterface::splitAggregatedRequest( return requests; } -void SimEngMemInterface::requestRead(const MemoryAccessTarget& target, +void SimEngMemInterface::requestRead(const memory::MemoryAccessTarget& target, uint64_t requestId) { uint64_t addrStart = 
target.address; uint64_t size = unsigned(target.size); @@ -183,7 +183,7 @@ void SimEngMemInterface::requestRead(const MemoryAccessTarget& target, } } -void SimEngMemInterface::requestWrite(const MemoryAccessTarget& target, +void SimEngMemInterface::requestWrite(const memory::MemoryAccessTarget& target, const RegisterValue& data) { uint64_t addrStart = target.address; uint64_t size = unsigned(target.size); @@ -196,6 +196,7 @@ void SimEngMemInterface::requestWrite(const MemoryAccessTarget& target, for (StandardMem::Request* req : requests) { sstMem_->send(req); } + delete aggrReq; } void SimEngMemInterface::tick() { tickCounter_++; } @@ -208,8 +209,9 @@ bool SimEngMemInterface::hasPendingRequests() const { return aggregationMap_.size() > 0; }; -const span SimEngMemInterface::getCompletedReads() const { - return {const_cast(completedReadRequests_.data()), +const span SimEngMemInterface::getCompletedReads() + const { + return {const_cast(completedReadRequests_.data()), completedReadRequests_.size()}; }; diff --git a/sst/config/a64fx-config.py b/sst/config/a64fx-config.py index 13f9eb4942..703dd32995 100644 --- a/sst/config/a64fx-config.py +++ b/sst/config/a64fx-config.py @@ -52,7 +52,7 @@ def getMemoryProps(memory_size: int, si: str): A64FX_HL_L1 = 5 # Hit latency of A64FX L2 cache (cycles). A64FX_HL_L2 = 56 -# Cohenrence protocol of A64FX caches. +# Coherence protocol of A64FX caches. A64FX_COHP = "MESI" # L1 & L2 cache type of A64FX. A64FX_CACHE_TYPE = "inclusive" diff --git a/sst/include/Assemble.hh b/sst/include/Assemble.hh index 93b7a999ae..b49e07fe9c 100644 --- a/sst/include/Assemble.hh +++ b/sst/include/Assemble.hh @@ -28,7 +28,7 @@ class Assembler { ~Assembler(); /** Returns the assembled source as a char array. */ - char* getAssembledSource(); + uint8_t* getAssembledSource(); /** Returns the size of the assembled source. */ size_t getAssembledSourceSize(); diff --git a/sst/include/SimEngCoreWrapper.hh b/sst/include/SimEngCoreWrapper.hh index cb53c0f50a..b235cecbb5 100644 --- a/sst/include/SimEngCoreWrapper.hh +++ b/sst/include/SimEngCoreWrapper.hh @@ -19,7 +19,6 @@ #include "SimEngMemInterface.hh" #include "simeng/Core.hh" #include "simeng/CoreInstance.hh" -#include "simeng/MemoryInterface.hh" #include "simeng/SpecialFileDirGen.hh" #include "simeng/version.hh" @@ -204,7 +203,7 @@ class SimEngCoreWrapper : public SST::Component { std::shared_ptr processMemory_; /** Reference to SimEng instruction memory. */ - std::shared_ptr instructionMemory_; + std::shared_ptr instructionMemory_; /** Reference to SimEngMemInterface used for interfacing with SST. */ std::shared_ptr dataMemory_; @@ -212,6 +211,17 @@ class SimEngCoreWrapper : public SST::Component { /** Number of clock iterations. */ int iterations_; + int probeIndex; + uint64_t probeCycle; + int start; + std::string traceWriteOut; + char* traceStr; + std::string probeWriteOut; + char* probeStr; + + std::ofstream traceOut; + std::ofstream probeOut; + /** Start time of simulation. */ std::chrono::high_resolution_clock::time_point startTime_; diff --git a/sst/include/SimEngMemInterface.hh b/sst/include/SimEngMemInterface.hh index 79789a9f39..7668949e0c 100644 --- a/sst/include/SimEngMemInterface.hh +++ b/sst/include/SimEngMemInterface.hh @@ -16,7 +16,7 @@ #include #include -#include "simeng/MemoryInterface.hh" +#include "simeng/memory/MemoryInterface.hh" #include "simeng/span.hh" using namespace simeng; @@ -27,7 +27,7 @@ namespace SST { namespace SSTSimEng { /** A memory interface used by SimEng to communicate with SST's memory model. 
*/ -class SimEngMemInterface : public MemoryInterface { +class SimEngMemInterface : public memory::MemoryInterface { public: SimEngMemInterface(StandardMem* mem, uint64_t cl, uint64_t max_addr, bool debug); @@ -39,17 +39,18 @@ class SimEngMemInterface : public MemoryInterface { * Construct an AggregatedReadRequest and use it to generate * SST::StandardMem::Read request(s). These request(s) are then sent to SST. */ - void requestRead(const MemoryAccessTarget& target, uint64_t requestId = 0); + void requestRead(const memory::MemoryAccessTarget& target, + uint64_t requestId = 0); /** * Construct an AggregatedWriteRequest and use it to generate * SST::StandardMem::Write request(s). These request(s) are then sent to SST. */ - void requestWrite(const MemoryAccessTarget& target, + void requestWrite(const memory::MemoryAccessTarget& target, const RegisterValue& data); /** Retrieve all completed read requests. */ - const span getCompletedReads() const; + const span getCompletedReads() const; /** Clear the completed reads. */ void clearCompletedReads(); @@ -103,11 +104,12 @@ class SimEngMemInterface : public MemoryInterface { * struct for AggregateWriteRequest and AggregateReadRequest. */ struct SimEngMemoryRequest { - /** MemoryAccessTarget from SimEng memory instruction. */ - const MemoryAccessTarget target; + /** memory::MemoryAccessTarget from SimEng memory instruction. */ + const memory::MemoryAccessTarget target; - SimEngMemoryRequest() : target(MemoryAccessTarget()){}; - SimEngMemoryRequest(const MemoryAccessTarget& target) : target(target){}; + SimEngMemoryRequest() : target(memory::MemoryAccessTarget()){}; + SimEngMemoryRequest(const memory::MemoryAccessTarget& target) + : target(target){}; }; /** @@ -122,7 +124,7 @@ class SimEngMemInterface : public MemoryInterface { const RegisterValue data; AggregateWriteRequest() : SimEngMemoryRequest(), data(RegisterValue()){}; - AggregateWriteRequest(const MemoryAccessTarget& target, + AggregateWriteRequest(const memory::MemoryAccessTarget& target, const RegisterValue& data) : SimEngMemoryRequest(target), data(data){}; }; @@ -148,7 +150,8 @@ class SimEngMemInterface : public MemoryInterface { int aggregateCount_ = 0; AggregateReadRequest() : SimEngMemoryRequest(), id_(0){}; - AggregateReadRequest(const MemoryAccessTarget& target, const uint64_t id) + AggregateReadRequest(const memory::MemoryAccessTarget& target, + const uint64_t id) : SimEngMemoryRequest(target), id_(id) {} }; @@ -170,7 +173,7 @@ class SimEngMemInterface : public MemoryInterface { uint64_t maxAddrMemory_; /** A vector containing all completed read requests. */ - std::vector completedReadRequests_; + std::vector completedReadRequests_; /** * This map is used to store unique ids of SST::StandardMem::Read requests and diff --git a/sst/test/include/framework/expression.hh b/sst/test/include/framework/expression.hh index fa0cdae03e..2ecd7875b3 100644 --- a/sst/test/include/framework/expression.hh +++ b/sst/test/include/framework/expression.hh @@ -158,7 +158,7 @@ class LhsExpr : public BaseExpr { /** * ExprBuilder struct exposes a operator which is used to instantiate an - * LhsExpr of type T without having to explictly define the template argument. + * LhsExpr of type T without having to explicitly define the template argument. 
*/ struct ExprBuilder { template > diff --git a/sst/test/include/framework/macros/eval.hh b/sst/test/include/framework/macros/eval.hh index 2c47629c86..69579e894b 100644 --- a/sst/test/include/framework/macros/eval.hh +++ b/sst/test/include/framework/macros/eval.hh @@ -6,11 +6,11 @@ // This MACRO defines the source code each expression expands into. This MACRO // also adds the line number on which this MACRO is defined in the source code. -#define GENERIC_EXPECT_WITH_LINE(A, OP, B, line, SRC) \ - { \ - ExpressionHandler handler = ExpressionHandler(); \ - handler.handleExpression(ExprBuilder() << A OP B, SRC, \ - static_cast(line)); \ +#define GENERIC_EXPECT_WITH_LINE(A, OP, B, line, SRC) \ + { \ + ExpressionHandler handler = ExpressionHandler(); \ + handler.handleExpression((ExprBuilder() << A) OP B, SRC, \ + static_cast(line)); \ } // This MACRO defines the source code each String expression expands into. This diff --git a/sst/test/include/framework/runner.hh b/sst/test/include/framework/runner.hh index 2ec182c4af..75ef7dee4b 100644 --- a/sst/test/include/framework/runner.hh +++ b/sst/test/include/framework/runner.hh @@ -21,14 +21,6 @@ class Runner { Runner() {} /** Method used to run the test(s) inside a runner. */ virtual void run(){}; - /** - * Returns the current executing TestContext. - * The TestContext for a Group changes everytime a new test is run - */ - virtual std::unique_ptr& getCurrContext() { - std::unique_ptr ptr = std::make_unique(); - return ptr; - }; }; /** GroupConfig used to provide configuration options to a Group. */ @@ -178,9 +170,6 @@ class Group : public Runner { return true; }; - /** Returns the TestContext of the current executing test case. */ - std::unique_ptr& getCurrContext() { return ctx_; }; - /** * This method returns a reference of GroupConfig. This method gets overriden * by TEST_GROUP Macro with the config defined in the source code. 
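The eval.hh hunk above reworks GENERIC_EXPECT_WITH_LINE so that the ExprBuilder sub-expression is explicitly parenthesised before the comparison operator is applied. As a rough illustration of why that grouping is spelled out, the standalone C++ sketch below (Builder and Captured are invented stand-ins, not the framework's real types) shows that `builder << A OP B` already parses as `(builder << A) OP B` for comparison operators, so the behaviour is unchanged; writing the parentheses simply documents the intent and sidesteps the "overloaded shift operator used with comparison" style of warning that some compilers raise under -Wall, which matters for warning-as-error builds.

```cpp
// Minimal sketch of an expression-capturing test macro helper.
// Builder/Captured are illustrative names only, not framework types.
#include <iostream>

struct Captured {
  int lhs;
  // Comparing against the right-hand side completes the captured expression.
  bool operator==(int rhs) const { return lhs == rhs; }
};

struct Builder {
  // operator<< captures the left-hand side of the user's expression.
  Captured operator<<(int lhs) const { return Captured{lhs}; }
};

int main() {
  int a = 2, b = 2;
  // '<<' binds tighter than '==', so both lines parse the same way; the
  // un-parenthesised form can trigger shift-vs-comparison precedence
  // warnings (e.g. clang's -Woverloaded-shift-op-parentheses).
  bool viaPrecedence = Builder() << a == b;     // parses as (Builder() << a) == b
  bool viaParentheses = (Builder() << a) == b;  // identical parse, grouping explicit
  std::cout << viaPrecedence << " " << viaParentheses << "\n";  // prints "1 1"
}
```

In other words, the macro change is behaviour-preserving; it only makes the expression grouping explicit for the compiler and the reader.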
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6d0b6479f5..ee2d48491a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(unit) add_subdirectory(regression) +add_subdirectory(integration) diff --git a/test/integration/CMakeLists.txt b/test/integration/CMakeLists.txt new file mode 100644 index 0000000000..1abf3dc3e0 --- /dev/null +++ b/test/integration/CMakeLists.txt @@ -0,0 +1,13 @@ +set(TEST_SOURCES + ConfigTest.cc +) + +add_executable(integrationtests ${TEST_SOURCES}) + +target_include_directories(integrationtests PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) +target_include_directories(integrationtests PUBLIC ${PROJECT_SOURCE_DIR}/src/lib) +target_link_libraries(integrationtests libsimeng) +target_link_libraries(integrationtests gmock_main) +target_compile_options(integrationtests PRIVATE ${SIMENG_COMPILE_OPTIONS}) + +add_test(NAME integration_tests COMMAND integrationtests) diff --git a/test/integration/ConfigTest.cc b/test/integration/ConfigTest.cc new file mode 100644 index 0000000000..48975eeacd --- /dev/null +++ b/test/integration/ConfigTest.cc @@ -0,0 +1,414 @@ +#include +#include + +#include "gtest/gtest.h" +#include "simeng/config/SimInfo.hh" +#include "simeng/version.hh" + +namespace { + +// Test generated default values are correct +TEST(ConfigTest, Default) { + // Test key default values exposed in SimInfo + EXPECT_EQ(simeng::config::SimInfo::getConfigPath(), "Default"); + EXPECT_EQ(simeng::config::SimInfo::getISA(), simeng::config::ISA::AArch64); + EXPECT_EQ(simeng::config::SimInfo::getISAString(), "AArch64"); + EXPECT_EQ(simeng::config::SimInfo::getSimMode(), + simeng::config::SimulationMode::Emulation); + EXPECT_EQ(simeng::config::SimInfo::getSimModeStr(), "Emulation"); + std::vector sysRegisterEnums = { + aarch64_sysreg::AARCH64_SYSREG_DCZID_EL0, + aarch64_sysreg::AARCH64_SYSREG_FPCR, + aarch64_sysreg::AARCH64_SYSREG_FPSR, + aarch64_sysreg::AARCH64_SYSREG_TPIDR_EL0, + aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1, + aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0, + aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0, + aarch64_sysreg::AARCH64_SYSREG_SVCR}; + EXPECT_EQ(simeng::config::SimInfo::getSysRegVec(), sysRegisterEnums); + std::vector archRegStruct = { + {8, 32}, + {256, 32}, + {32, 17}, + {1, 1}, + {8, static_cast(sysRegisterEnums.size())}, + {256, 16}, + {64, 1}}; + EXPECT_EQ(simeng::config::SimInfo::getArchRegStruct(), archRegStruct); + + // Test that default config generated matches for AArch64 ISA + std::string emittedConfig = + ryml::emitrs_yaml(simeng::config::SimInfo::getConfig()); + std::string expectedValues = + "Core:\n ISA: AArch64\n 'Simulation-Mode': emulation\n " + "'Clock-Frequency-GHz': 1\n 'Timer-Frequency-MHz': 100\n " + "'Micro-Operations': 0\n 'Vector-Length': 128\n " + "'Streaming-Vector-Length': 128\nFetch:\n 'Fetch-Block-Size': 32\n " + "'Loop-Buffer-Size': 32\n 'Loop-Detection-Threshold': " + "5\n'Process-Image':\n 'Heap-Size': 100000\n 'Stack-Size': " + "100000\n'Register-Set':\n 'GeneralPurpose-Count': 38\n " + "'FloatingPoint/SVE-Count': 38\n 'Predicate-Count': 17\n " + "'Conditional-Count': 1\n 'SME-Matrix-Count': 1\n " + "'SME-Lookup-Table-Count': " + "1\n'Pipeline-Widths':\n Commit: 1\n FrontEnd: 1\n 'LSQ-Completion': " + "1\n'Queue-Sizes':\n ROB: 32\n Load: 16\n Store: " + "16\n'Port-Allocator':\n Type: Balanced\n'Branch-Predictor':\n Type: " + "Perceptron\n 'BTB-Tag-Bits': 8\n 'Global-History-Length': 8\n " + "'RAS-entries': 8\n'L1-Data-Memory':\n 'Interface-Type': " + "Flat\n'L1-Instruction-Memory':\n 
'Interface-Type': " + "Flat\n'LSQ-L1-Interface':\n 'Access-Latency': 4\n Exclusive: 0\n " + "'Load-Bandwidth': 32\n 'Store-Bandwidth': 32\n " + "'Permitted-Requests-Per-Cycle': 1\n 'Permitted-Loads-Per-Cycle': 1\n " + "'Permitted-Stores-Per-Cycle': 1\nPorts:\n 0:\n Portname: 0\n " + "'Instruction-Group-Support':\n - ALL\n " + "'Instruction-Opcode-Support':\n - 8232\n " + "'Instruction-Group-Support-Nums':\n - " + "86\n'Reservation-Stations':\n 0:\n Size: 32\n 'Dispatch-Rate': " + "4\n Ports:\n - 0\n 'Port-Nums':\n - " + "0\n'Execution-Units':\n 0:\n Pipelined: 1\n 'Blocking-Groups':\n " + " - NONE\n 'Blocking-Group-Nums':\n - 87\nLatencies:\n 0:\n " + " 'Instruction-Groups':\n - NONE\n 'Instruction-Opcodes':\n " + " - 8232\n 'Execution-Latency': 1\n 'Execution-Throughput': 1\n " + " 'Instruction-Group-Nums':\n - 87\n'CPU-Info':\n " + "'Generate-Special-Dir': 1\n 'Special-File-Dir-Path': " SIMENG_BUILD_DIR + "/specialFiles/\n 'Core-Count': 1\n 'Socket-Count': 1\n SMT: 1\n " + "BogoMIPS: 0\n Features: ''\n 'CPU-Implementer': 0x0\n " + "'CPU-Architecture': 0\n 'CPU-Variant': 0x0\n 'CPU-Part': 0x0\n " + "'CPU-Revision': 0\n 'Package-Count': 1\n"; + EXPECT_EQ(emittedConfig, expectedValues); + + // Generate default for rv64 ISA + simeng::config::SimInfo::generateDefault(simeng::config::ISA::RV64); + + // Test SimInfo exposed have correctly changed + EXPECT_EQ(simeng::config::SimInfo::getISA(), simeng::config::ISA::RV64); + EXPECT_EQ(simeng::config::SimInfo::getISAString(), "rv64"); + sysRegisterEnums = {simeng::arch::riscv::riscv_sysreg::RISCV_SYSREG_FFLAGS, + simeng::arch::riscv::riscv_sysreg::RISCV_SYSREG_FRM, + simeng::arch::riscv::riscv_sysreg::RISCV_SYSREG_FCSR, + simeng::arch::riscv::riscv_sysreg::RISCV_SYSREG_CYCLE, + simeng::arch::riscv::riscv_sysreg::RISCV_SYSREG_TIME, + simeng::arch::riscv::riscv_sysreg::RISCV_SYSREG_INSTRET}; + EXPECT_EQ(simeng::config::SimInfo::getSysRegVec(), sysRegisterEnums); + archRegStruct = { + {8, 32}, {8, 32}, {8, static_cast(sysRegisterEnums.size())}}; + EXPECT_EQ(simeng::config::SimInfo::getArchRegStruct(), archRegStruct); + + // Test that default config generated matches for rv64 ISA + emittedConfig = + ryml::emitrs_yaml(simeng::config::SimInfo::getConfig()); + expectedValues = + "Core:\n ISA: rv64\n Compressed: 0\n 'Simulation-Mode': emulation\n " + "'Clock-Frequency-GHz': 1\n 'Timer-Frequency-MHz': 100\n " + "'Micro-Operations': 0\nFetch:\n 'Fetch-Block-Size': 32\n " + "'Loop-Buffer-Size': 32\n 'Loop-Detection-Threshold': " + "5\n'Process-Image':\n 'Heap-Size': 100000\n 'Stack-Size': " + "100000\n'Register-Set':\n 'GeneralPurpose-Count': 38\n " + "'FloatingPoint-Count': 38\n'Pipeline-Widths':\n Commit: 1\n FrontEnd: " + "1\n 'LSQ-Completion': 1\n'Queue-Sizes':\n ROB: 32\n Load: 16\n " + "Store: 16\n'Port-Allocator':\n Type: Balanced\n'Branch-Predictor':\n " + "Type: Perceptron\n 'BTB-Tag-Bits': " + "8\n 'Global-History-Length': 8\n 'RAS-entries': " + "8\n'L1-Data-Memory':\n 'Interface-Type': " + "Flat\n'L1-Instruction-Memory':\n 'Interface-Type': " + "Flat\n'LSQ-L1-Interface':\n 'Access-Latency': 4\n Exclusive: 0\n " + "'Load-Bandwidth': 32\n 'Store-Bandwidth': 32\n " + "'Permitted-Requests-Per-Cycle': 1\n 'Permitted-Loads-Per-Cycle': 1\n " + "'Permitted-Stores-Per-Cycle': 1\nPorts:\n 0:\n Portname: 0\n " + "'Instruction-Group-Support':\n - ALL\n " + "'Instruction-Opcode-Support':\n - 450\n " + "'Instruction-Group-Support-Nums':\n - " + "23\n'Reservation-Stations':\n 0:\n Size: 32\n 'Dispatch-Rate': " + "4\n Ports:\n - 0\n 'Port-Nums':\n - " + 
"0\n'Execution-Units':\n 0:\n Pipelined: 1\n 'Blocking-Groups':\n " + " - NONE\n 'Blocking-Group-Nums':\n - 24\nLatencies:\n 0:\n " + " 'Instruction-Groups':\n - NONE\n 'Instruction-Opcodes':\n " + " - 450\n 'Execution-Latency': 1\n 'Execution-Throughput': 1\n " + "'Instruction-Group-Nums':\n - 24\n'CPU-Info':\n " + "'Generate-Special-Dir': 1\n 'Special-File-Dir-Path': " SIMENG_BUILD_DIR + "/specialFiles/\n 'Core-Count': 1\n 'Socket-Count': 1\n SMT: 1\n " + "BogoMIPS: 0\n Features: ''\n 'CPU-Implementer': 0x0\n " + "'CPU-Architecture': 0\n 'CPU-Variant': 0x0\n 'CPU-Part': 0x0\n " + "'CPU-Revision': 0\n 'Package-Count': 1\n"; + EXPECT_EQ(emittedConfig, expectedValues); +} + +// Test that getting values from the config returns the correct values +TEST(ConfigTest, as) { + simeng::config::SimInfo::generateDefault(simeng::config::ISA::AArch64); + ryml::ConstNodeRef config = simeng::config::SimInfo::getConfig(); + EXPECT_EQ(config["Core"]["ISA"].as(), "AArch64"); + EXPECT_EQ(config["Core"]["Clock-Frequency-GHz"].as(), 1.f); + EXPECT_EQ(config["Core"]["Timer-Frequency-MHz"].as(), 100); + EXPECT_EQ(config["Core"]["Micro-Operations"].as(), false); +} + +// Test that editting existing and adding new values is correct +TEST(ConfigTest, AddConfigValues) { + simeng::config::SimInfo::addToConfig("{Core: {Simulation-Mode: outoforder}}"); + simeng::config::SimInfo::addToConfig("{Core: {Key: Value}}"); + simeng::config::SimInfo::addToConfig("{TestA: {Key: Value}}"); + simeng::config::SimInfo::addToConfig("{Core: {Seq: [0, 1, 2]}}"); + simeng::config::SimInfo::addToConfig("{TestB: {Seq: [0, 1, 2]}}"); + simeng::config::SimInfo::addToConfig( + "{Ports: {1: {Portname: Port 1, Instruction-Group-Support: [BRANCH]}}, " + "Reservation-Stations: {1: {Size: 32, Dispatch-Rate: 1, Ports: [Port " + "1]}}, Execution-Units: {1: {Pipelined: False}}}"); + + ryml::ConstNodeRef config = simeng::config::SimInfo::getConfig(); + EXPECT_EQ(config["Core"]["Simulation-Mode"].as(), "outoforder"); + EXPECT_EQ(config["Core"]["Key"].as(), "Value"); + EXPECT_EQ(config["TestA"]["Key"].as(), "Value"); + + EXPECT_EQ(config["Core"]["Seq"][0].as(), 0); + EXPECT_EQ(config["Core"]["Seq"][1].as(), 1); + EXPECT_EQ(config["Core"]["Seq"][2].as(), 2); + + EXPECT_EQ(config["TestB"]["Seq"][0].as(), 0); + EXPECT_EQ(config["TestB"]["Seq"][1].as(), 1); + EXPECT_EQ(config["TestB"]["Seq"][2].as(), 2); + + EXPECT_EQ(config["Ports"].num_children(), 2); + EXPECT_EQ(config["Reservation-Stations"].num_children(), 2); + EXPECT_EQ(config["Execution-Units"].num_children(), 2); +} + +// Test that adding an invalid entry fails the config validation +TEST(ConfigTest, FailedExpectation) { + simeng::config::SimInfo::generateDefault(simeng::config::ISA::AArch64, true); + ASSERT_DEATH( + { + simeng::config::SimInfo::addToConfig( + "{Core: {Simulation-Mode: wrong}}"); + }, + "- Core:Simulation-Mode wrong not in set"); + ASSERT_DEATH( + { + simeng::config::SimInfo::addToConfig( + "{Reservation-Stations: {1: {Size: " + "32, Dispatch-Rate: 1}}}"); + }, + "- Reservation-Stations:1:Ports has no value"); + ASSERT_DEATH( + { + simeng::config::SimInfo::addToConfig( + "{Reservation-Stations: {1: {Size: " + "32, Dispatch-Rate: 1, Ports: [WRONG]}}}"); + }, + "- Reservation-Stations:1:Ports:0 WRONG not in set"); + + // Test for post validation checks are triggered + ASSERT_DEATH( + { + simeng::config::SimInfo::addToConfig( + "{CPU-Info: {Package-Count: 10, Core-Count: 3}}"); + }, + "- Package-Count must be a Less-than or equal to Core-Count, and " + "Core-Count must be divisible 
by Package-Count"); + ASSERT_DEATH( + { + simeng::config::SimInfo::addToConfig( + "{Ports: {1: {Portname: Port 1}}}"); + }, + "- The number of execution units \\(1\\) must match the number of ports " + "\\(2\\)"); + ASSERT_DEATH( + { + simeng::config::SimInfo::addToConfig( + "{Ports: {1: {Portname: Port 1}}, Execution-Units: {1: {Pipelined " + ": False}}}"); + }, + "- Port 1 has no associated reservation station"); +} + +// Test that ExpectationNode validation checks work as expected +TEST(ConfigTest, validation) { + simeng::config::ExpectationNode expectations = + simeng::config::ExpectationNode(); + expectations.addChild( + simeng::config::ExpectationNode::createExpectation("HEAD")); + expectations["HEAD"].addChild( + simeng::config::ExpectationNode::createExpectation(true, + "CHILD_BOOL")); + expectations["HEAD"]["CHILD_BOOL"].setValueSet({false}); + expectations["HEAD"].addChild( + simeng::config::ExpectationNode::createExpectation(123.456f, + "CHILD_FLOAT")); + expectations["HEAD"]["CHILD_FLOAT"].setValueBounds(456.789f, 789.456f); + expectations["HEAD"].addChild( + simeng::config::ExpectationNode::createExpectation( + "STR", "CHILD_STRING")); + expectations["HEAD"]["CHILD_STRING"].setValueSet( + {"HELLO", "WORLD", "SIMENG"}); + expectations["HEAD"].addChild( + simeng::config::ExpectationNode::createExpectation( + 333, "CHILD_UINT")); + expectations["HEAD"]["CHILD_UINT"].setValueBounds(345, 678); + + ryml::Tree tree; + tree.rootref() |= ryml::MAP; + ryml::NodeRef ref; + size_t id = tree.root_id(); + tree.ref(id).append_child() << ryml::key("noVal"); + ref = tree.ref(id).append_child() << ryml::key("bool"); + ref << true; + ref = tree.ref(id).append_child() << ryml::key("float"); + ref << 123.456f; + ref = tree.ref(id).append_child() << ryml::key("string"); + ref << "STR"; + ref = tree.ref(id).append_child() << ryml::key("uint"); + ref << 333; + + EXPECT_EQ(expectations["HEAD"]["CHILD_BOOL"] + .validateConfigNode(tree.rootref()["bool"]) + .message, + "1 not in set {0}"); + EXPECT_EQ(expectations["HEAD"]["CHILD_FLOAT"] + .validateConfigNode(tree.rootref()["float"]) + .message, + "123.456 not in the bounds {456.789 to 789.456}"); + EXPECT_EQ(expectations["HEAD"]["CHILD_STRING"] + .validateConfigNode(tree.rootref()["string"]) + .message, + "STR not in set {HELLO, WORLD, SIMENG}"); + EXPECT_EQ(expectations["HEAD"]["CHILD_UINT"] + .validateConfigNode(tree.rootref()["uint"]) + .message, + "333 not in the bounds {345 to 678}"); +} + +// Test that calling setValueBounds() with the wrong data type fails +TEST(ConfigTest, invalidTypeOnValueBounds) { + simeng::config::ExpectationNode expectations = + simeng::config::ExpectationNode(); + expectations.addChild( + simeng::config::ExpectationNode::createExpectation("HEAD")); + expectations["HEAD"].addChild( + simeng::config::ExpectationNode::createExpectation("DEFAULT", + "CHILD")); + ASSERT_DEATH( + { expectations["HEAD"]["CHILD"].setValueBounds(0, 10); }, + "The data type of the passed value bounds used in setValueBounds\\() " + "does not match that held within the ExpectationNode with key " + "HEAD:CHILD. 
Passed bounds are of type 32-bit unsigned integer and the " + "expected type of this node is string."); +} + +// Test that calling setValueSet() with the wrong data type fails +TEST(ConfigTest, invalidTypeOnSetBounds) { + simeng::config::ExpectationNode expectations = + simeng::config::ExpectationNode(); + expectations.addChild( + simeng::config::ExpectationNode::createExpectation("HEAD")); + expectations["HEAD"].addChild( + simeng::config::ExpectationNode::createExpectation("DEFAULT", + "CHILD")); + ASSERT_DEATH( + { + expectations["HEAD"]["CHILD"].setValueSet({0, 1, 2}); + }, + "The data type of the passed vector used in setValueSet\\() " + "does not match that held within the ExpectationNode with key " + "HEAD:CHILD. Passed vector elements are of type 32-bit integer and the " + "expected type of this node is string."); +} + +// Test that calling setValueSet() after an expectation value set has already +// been defined fails +TEST(ConfigTest, alreadyDefinedBounds) { + simeng::config::ExpectationNode expectations = + simeng::config::ExpectationNode(); + expectations.addChild( + simeng::config::ExpectationNode::createExpectation("HEAD")); + expectations["HEAD"].addChild( + simeng::config::ExpectationNode::createExpectation(0, "CHILD")); + expectations["HEAD"]["CHILD"].setValueBounds(0, 10); + ASSERT_DEATH( + { + expectations["HEAD"]["CHILD"].setValueSet({1, 2, 3}); + }, + "Invalid call of setValueSet\\() for the ExpectationNode with key " + "HEAD:CHILD as value bounds have already been defined."); +} + +// Test that calling setValueBounds() after expectation value bounds have +// already been defined fails +TEST(ConfigTest, alreadyDefinedSet) { + simeng::config::ExpectationNode expectations = + simeng::config::ExpectationNode(); + expectations.addChild( + simeng::config::ExpectationNode::createExpectation("HEAD")); + expectations["HEAD"].addChild( + simeng::config::ExpectationNode::createExpectation(0, "CHILD")); + expectations["HEAD"]["CHILD"].setValueSet({1, 2, 3}); + ASSERT_DEATH( + { expectations["HEAD"]["CHILD"].setValueBounds(0, 10); }, + "Invalid call of setValueBounds\\() for the ExpectationNode with " + "key HEAD:CHILD as a value set has already been defined."); +} + +// Test that adding multiple wild ExpectationNodes to the same parent fails +TEST(ConfigTest, multipleWildNodes) { + simeng::config::ExpectationNode expectations = + simeng::config::ExpectationNode(); + expectations.addChild( + simeng::config::ExpectationNode::createExpectation("HEAD")); + expectations["HEAD"].addChild( + simeng::config::ExpectationNode::createExpectation( + simeng::config::wildcard)); + ASSERT_DEATH( + { + expectations["HEAD"].addChild( + simeng::config::ExpectationNode::createExpectation( + simeng::config::wildcard)); + }, + "Attempted to add multiple wildcard nodes to the same ExpectationNode " + "instance of key HEAD"); +} + +// Test that, using a file path, a config can be set from a yaml file +TEST(ConfigTest, configFromFile) { + std::string filePath = SIMENG_SOURCE_DIR "/configs/a64fx.yaml"; + simeng::config::SimInfo::setConfig(filePath); + EXPECT_EQ(simeng::config::SimInfo::getConfigPath(), filePath); + EXPECT_EQ(simeng::config::SimInfo::getISA(), simeng::config::ISA::AArch64); + EXPECT_EQ(simeng::config::SimInfo::getISAString(), "AArch64"); + EXPECT_EQ(simeng::config::SimInfo::getSimMode(), + simeng::config::SimulationMode::Outoforder); + EXPECT_EQ(simeng::config::SimInfo::getSimModeStr(), "Out-of-Order"); + std::vector sysRegisterEnums = { + aarch64_sysreg::AARCH64_SYSREG_DCZID_EL0, + 
aarch64_sysreg::AARCH64_SYSREG_FPCR, + aarch64_sysreg::AARCH64_SYSREG_FPSR, + aarch64_sysreg::AARCH64_SYSREG_TPIDR_EL0, + aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1, + aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0, + aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0, + aarch64_sysreg::AARCH64_SYSREG_SVCR}; + EXPECT_EQ(simeng::config::SimInfo::getSysRegVec(), sysRegisterEnums); + std::vector archRegStruct = { + {8, 32}, + {256, 32}, + {32, 17}, + {1, 1}, + {8, static_cast(sysRegisterEnums.size())}, + {256, 16}, + {64, 1}}; + EXPECT_EQ(simeng::config::SimInfo::getArchRegStruct(), archRegStruct); + std::vector physRegStruct = { + {8, 96}, + {256, 128}, + {32, 48}, + {1, 128}, + {8, static_cast(sysRegisterEnums.size())}, + {256, 16}, + {64, 1}}; + EXPECT_EQ(simeng::config::SimInfo::getPhysRegStruct(), physRegStruct); + std::vector physRegQuants = { + 96, 128, 48, 128, static_cast(sysRegisterEnums.size()), 16, 1}; + EXPECT_EQ(simeng::config::SimInfo::getPhysRegQuantities(), physRegQuants); +} +// getPhysRegStruct() +// getPhysRegQuantities() + +} // namespace diff --git a/test/regression/CMakeLists.txt b/test/regression/CMakeLists.txt index 1231a8f39f..3fa281752c 100644 --- a/test/regression/CMakeLists.txt +++ b/test/regression/CMakeLists.txt @@ -20,6 +20,3 @@ target_link_libraries(regression-test-base ${LLVM_LIBS}) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) add_subdirectory(aarch64) add_subdirectory(riscv) - -# Link to yaml-cpp libraries -target_link_libraries(regression-test-base yaml-cpp) \ No newline at end of file diff --git a/test/regression/RegressionTest.cc b/test/regression/RegressionTest.cc index 156dae7d6e..5317a1857d 100644 --- a/test/regression/RegressionTest.cc +++ b/test/regression/RegressionTest.cc @@ -2,12 +2,14 @@ #include -#include "simeng/FixedLatencyMemoryInterface.hh" -#include "simeng/FlatMemoryInterface.hh" -#include "simeng/GenericPredictor.hh" +#include "simeng/branchpredictors/GenericPredictor.hh" +#include "simeng/branchpredictors/PerceptronPredictor.hh" +#include "simeng/config/SimInfo.hh" #include "simeng/control.hh" #include "simeng/kernel/Linux.hh" #include "simeng/kernel/LinuxProcess.hh" +#include "simeng/memory/FixedLatencyMemoryInterface.hh" +#include "simeng/memory/FlatMemoryInterface.hh" #include "simeng/models/emulation/Core.hh" #include "simeng/models/inorder/Core.hh" #include "simeng/models/outoforder/Core.hh" @@ -32,16 +34,22 @@ void RegressionTest::TearDown() { } } -void RegressionTest::run(const char* source, const char* triple, - const char* extensions) { - testing::internal::CaptureStdout(); +void RegressionTest::createArchitecture(const char* source, const char* triple, + const char* extensions) { + // Zero-out process memory from any prior runs + if (processMemory_ != nullptr) + std::memset(processMemory_, '\0', processMemorySize_); // Assemble the source to a flat binary assemble(source, triple, extensions); if (HasFatalFailure()) return; - // Get pre-defined config file for OoO model - YAML::Node config = generateConfig(); + // Generate the predefined model config + generateConfig(); + + // Due to SimInfo being static, we need to ensure the config values/options + // stored are up-to-date with the latest generated config file + simeng::config::SimInfo::reBuild(); // Create a linux process from the assembled code block. 
// Memory allocation for process images also takes place @@ -50,80 +58,108 @@ void RegressionTest::run(const char* source, const char* triple, // The process image is finalised by the createStack method // which creates and populates the initial process stack. // The created process image can be accessed via a shared_ptr - // returned by the getProcessImage method. + // returned by the getProcessImage method process_ = std::make_unique( - simeng::span(reinterpret_cast(code_), codeSize_), config); + simeng::span(reinterpret_cast(code_), codeSize_)); + ASSERT_TRUE(process_->isValid()); - uint64_t entryPoint = process_->getEntryPoint(); + entryPoint_ = process_->getEntryPoint(); processMemorySize_ = process_->getProcessImageSize(); + // This instance of procImgPtr pointer needs to be shared because // getMemoryValue in RegressionTest.hh uses reference to the class - // member processMemory_. + // member processMemory_ std::shared_ptr procImgPtr = process_->getProcessImage(); processMemory_ = procImgPtr.get(); - // Create memory interfaces for instruction and data access. - // For each memory interface, a dereferenced shared_ptr to the - // processImage is passed as argument. - simeng::FlatMemoryInterface instructionMemory(processMemory_, - processMemorySize_); - - std::unique_ptr flatDataMemory = - std::make_unique(processMemory_, - processMemorySize_); - - std::unique_ptr fixedLatencyDataMemory = - std::make_unique( - processMemory_, processMemorySize_, 4); - std::unique_ptr dataMemory; - - // Create the OS kernel and the process - simeng::kernel::Linux kernel; - kernel.createProcess(*process_); - - // Populate the heap with initial data (specified by the test being run). + // Populate the heap with initial data (specified by the test being run) ASSERT_LT(process_->getHeapStart() + initialHeapData_.size(), - process_->getStackPointer()); + process_->getInitialStackPointer()); std::copy(initialHeapData_.begin(), initialHeapData_.end(), processMemory_ + process_->getHeapStart()); + ASSERT_TRUE(process_ != nullptr); + + // Create the OS kernel and the process + kernel_ = std::make_unique( + simeng::config::SimInfo::getConfig()["CPU-Info"]["Special-File-Dir-Path"] + .as()); + kernel_->createProcess(*process_); + // Create the architecture - architecture_ = createArchitecture(kernel, config); + architecture_ = instantiateArchitecture(*kernel_); +} - // Create a port allocator for an out-of-order core - std::unique_ptr portAllocator = - createPortAllocator(); +void RegressionTest::createCore(const char* source, const char* triple, + const char* extensions) { + // Create the architecture, kernel and process + createArchitecture(source, triple, extensions); // Create a branch predictor for a pipelined core - simeng::GenericPredictor predictor(config); + std::string predictorType = + simeng::config::SimInfo::getConfig()["Branch-Predictor"]["Type"] + .as(); + if (predictorType == "Generic") { + predictor_ = std::make_unique(); + } else if (predictorType == "Perceptron") { + predictor_ = std::make_unique(); + } + + // Create memory interfaces for instruction and data access. 
+ // For each memory interface, a dereferenced shared_ptr to the + // processImage is passed as an argument + + ASSERT_TRUE(processMemory_ != nullptr); + + instructionMemory_ = std::make_unique( + processMemory_, processMemorySize_); + + flatDataMemory_ = std::make_unique( + processMemory_, processMemorySize_); + + fixedLatencyDataMemory_ = + std::make_unique( + processMemory_, processMemorySize_, 4); + // Create the core model switch (std::get<0>(GetParam())) { case EMULATION: core_ = std::make_unique( - instructionMemory, *flatDataMemory, entryPoint, processMemorySize_, - *architecture_); - dataMemory = std::move(flatDataMemory); + *instructionMemory_, *flatDataMemory_, entryPoint_, + processMemorySize_, *architecture_); + dataMemory_ = std::move(flatDataMemory_); break; case INORDER: core_ = std::make_unique( - instructionMemory, *flatDataMemory, processMemorySize_, entryPoint, - *architecture_, predictor); - dataMemory = std::move(flatDataMemory); + *instructionMemory_, *flatDataMemory_, processMemorySize_, + entryPoint_, *architecture_, *predictor_); + dataMemory_ = std::move(flatDataMemory_); break; case OUTOFORDER: + // Create a port allocator for an out-of-order core + portAllocator_ = createPortAllocator(); + core_ = std::make_unique( - instructionMemory, *fixedLatencyDataMemory, processMemorySize_, - entryPoint, *architecture_, predictor, *portAllocator, config); - dataMemory = std::move(fixedLatencyDataMemory); + *instructionMemory_, *fixedLatencyDataMemory_, processMemorySize_, + entryPoint_, *architecture_, *predictor_, *portAllocator_); + dataMemory_ = std::move(fixedLatencyDataMemory_); break; } +} + +void RegressionTest::run(const char* source, const char* triple, + const char* extensions) { + testing::internal::CaptureStdout(); + + // Create the core, memory interfaces, kernel and process + createCore(source, triple, extensions); // Run the core model until the program is complete - while (!core_->hasHalted() || dataMemory->hasPendingRequests()) { + while (!core_->hasHalted() || dataMemory_->hasPendingRequests()) { ASSERT_LT(numTicks_, maxTicks_) << "Maximum tick count exceeded."; core_->tick(); - instructionMemory.tick(); - dataMemory->tick(); + instructionMemory_->tick(); + dataMemory_->tick(); numTicks_++; } @@ -133,6 +169,24 @@ void RegressionTest::run(const char* source, const char* triple, programFinished_ = true; } +void RegressionTest::checkGroup(const char* source, const char* triple, + const char* extensions, + const std::vector& expectedGroups) { + createArchitecture(source, triple, extensions); + + std::vector> macroOp; + architecture_->predecode(code_, 4, 0, macroOp); + + // Check that there is one expectation group per micro-op + EXPECT_EQ(macroOp.size(), expectedGroups.size()); + + // Check the assigned and expected group for each micro-op match + for (size_t i = 0; i < macroOp.size(); i++) { + auto group = macroOp[i]->getGroup(); + EXPECT_EQ(group, expectedGroups[i]); + } +} + void RegressionTest::assemble(const char* source, const char* triple, const char* extensions) { // Get LLVM target @@ -198,8 +252,13 @@ void RegressionTest::assemble(const char* source, const char* triple, ASSERT_NE(asmBackend, nullptr) << "Failed to create LLVM asm backend"; // Create MC code emitter +#if SIMENG_LLVM_VERSION < 15 std::unique_ptr codeEmitter( target->createMCCodeEmitter(*instrInfo, *regInfo, context)); +#else + std::unique_ptr codeEmitter( + target->createMCCodeEmitter(*instrInfo, context)); +#endif ASSERT_NE(codeEmitter, nullptr) << "Failed to create LLVM code 
emitter"; // Create MC object writer @@ -234,8 +293,14 @@ void RegressionTest::assemble(const char* source, const char* triple, // Create ELF object from output llvm::StringRef objectData = objectStream.str(); +#if SIMENG_LLVM_VERSION < 15 auto elfOrErr = llvm::object::ELFFile< llvm::object::ELFType>::create(objectData); +#else + auto elfOrErr = + llvm::object::ELFFile>::create(objectData); +#endif ASSERT_FALSE(elfOrErr.takeError()) << "Failed to load ELF object"; auto& elf = *elfOrErr; diff --git a/test/regression/RegressionTest.hh b/test/regression/RegressionTest.hh index 8921ec8aa6..661584cd43 100644 --- a/test/regression/RegressionTest.hh +++ b/test/regression/RegressionTest.hh @@ -8,7 +8,20 @@ #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCCodeEmitter.h" + +#if defined(__clang__) +// Prevent errors due to warnings in included file when using clang +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wall" +#endif + #include "llvm/MC/MCContext.h" + +#if defined(__clang__) +// Allow errors again +#pragma clang diagnostic pop +#endif + #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCObjectWriter.h" @@ -31,6 +44,7 @@ #if SIMENG_LLVM_VERSION < 14 #include "llvm/Support/TargetRegistry.h" #else +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" #endif @@ -49,26 +63,38 @@ enum CoreType { EMULATION, INORDER, OUTOFORDER }; * execution has completed. */ class RegressionTest - : public ::testing::TestWithParam> { + : public ::testing::TestWithParam> { protected: virtual ~RegressionTest(); virtual void TearDown() override; /** Generate a default YAML-formatted configuration. */ - virtual YAML::Node generateConfig() const = 0; + virtual void generateConfig() const = 0; + + /** Instantiate an ISA specific architecture from a kernel. */ + virtual std::unique_ptr instantiateArchitecture( + simeng::kernel::Linux& kernel) const = 0; + + /** Create a port allocator for an out-of-order core model. */ + virtual std::unique_ptr createPortAllocator( + ryml::ConstNodeRef config = + simeng::config::SimInfo::getConfig()) const = 0; + + /** Create the kernel then instantiate an ISA specific architecture. Populates + * the architecture_ member variable. */ + void createArchitecture(const char* source, const char* triple, + const char* extensions); /** Run the assembly in `source`, building it for the target `triple` and ISA * extensions. */ void run(const char* source, const char* triple, const char* extensions); - /** Create an ISA instance from a kernel. */ - virtual std::unique_ptr createArchitecture( - simeng::kernel::Linux& kernel, YAML::Node config) const = 0; - - /** Create a port allocator for an out-of-order core model. */ - virtual std::unique_ptr createPortAllocator() - const = 0; + /** Predecode the first instruction in source and check the assigned group + * matches the expectation. */ + void checkGroup(const char* source, const char* triple, + const char* extensions, + const std::vector& expectedGroups); /** Get the value of an architectural register. */ template @@ -94,41 +120,69 @@ class RegressionTest /** The initial data to populate the heap with. */ std::vector initialHeapData_; - /** The maximum number of ticks to run before aborting the test. */ - uint64_t maxTicks_ = UINT64_MAX; - - /** The number of ticks that were run before the test program completed. */ - uint64_t numTicks_ = 0; - - /** The architecture instance. 
*/ - std::unique_ptr architecture_; + /** The process to be executed. */ + std::unique_ptr process_; /** The process memory. */ char* processMemory_ = nullptr; - /** The size of the process memory in bytes. */ - size_t processMemorySize_ = 0; + /** The output written to stdout during the test. */ + std::string stdout_; - /** The process that was executed. */ - std::unique_ptr process_; + /** The flat binary produced by assembling the test source. */ + uint8_t* code_ = nullptr; - /** The core that was used. */ - std::unique_ptr core_ = nullptr; + /** The number of ticks that were run before the test program completed. */ + uint64_t numTicks_ = 0; - /** The output written to stdout during the test. */ - std::string stdout_; + /** The maximum number of ticks to run before aborting the test. */ + uint64_t maxTicks_ = UINT64_MAX; - /** True if the test program finished running. */ - bool programFinished_ = false; + /** Pointer to be instantiated for the architecture. */ + std::unique_ptr architecture_ = nullptr; private: /** Assemble test source to a flat binary for the given triple and ISA * extensions. */ void assemble(const char* source, const char* triple, const char* extensions); - /** The flat binary produced by assembling the test source. */ - uint8_t* code_ = nullptr; + /** Instantiate the core according to the config. */ + void createCore(const char* source, const char* triple, + const char* extensions); + + /* Pointer to be instantiated for the kernel. */ + std::unique_ptr kernel_ = nullptr; + + /* Pointer to be instantiated for the port allocator. */ + std::unique_ptr portAllocator_ = nullptr; + + /* Pointer to be instantiated for the branch predictor. */ + std::unique_ptr predictor_ = nullptr; + + /** All possible data memory interfaces. dataMemory_ set to one of these + * depending on core type. */ + std::unique_ptr flatDataMemory_ = nullptr; + std::unique_ptr fixedLatencyDataMemory_ = + nullptr; + + /** Pointer to be instantiated for the data memory interface. */ + std::unique_ptr dataMemory_ = nullptr; + + /** Pointer to be instantiated for the instruction memory interface. */ + std::unique_ptr instructionMemory_ = nullptr; + + /** Pointer to be instantiated for the core. */ + std::unique_ptr core_ = nullptr; + + /** The size of the process memory in bytes. */ + size_t processMemorySize_ = 0; + + /** True if the test program finished running. */ + bool programFinished_ = false; /** The size of the assembled flat binary in bytes. */ size_t codeSize_ = 0; + + /** The entry point of the process. 
*/ + uint64_t entryPoint_ = 0; }; diff --git a/test/regression/aarch64/AArch64RegressionTest.cc b/test/regression/aarch64/AArch64RegressionTest.cc index 8df4b3d1c2..90a1386c23 100644 --- a/test/regression/aarch64/AArch64RegressionTest.cc +++ b/test/regression/aarch64/AArch64RegressionTest.cc @@ -6,82 +6,64 @@ using namespace simeng::arch::aarch64; void AArch64RegressionTest::run(const char* source) { - // Initialise LLVM - LLVMInitializeAArch64TargetInfo(); - LLVMInitializeAArch64TargetMC(); - LLVMInitializeAArch64AsmParser(); - - const char* subtargetFeatures; -#if SIMENG_LLVM_VERSION < 14 - subtargetFeatures = "+sve,+lse"; -#else - subtargetFeatures = "+sve,+lse,+sve2,+sme"; -#endif - - RegressionTest::run(source, "aarch64", subtargetFeatures); + initialiseLLVM(); + std::string subtargetFeatures = getSubtargetFeaturesString(); + + RegressionTest::run(source, "aarch64", subtargetFeatures.c_str()); } -YAML::Node AArch64RegressionTest::generateConfig() const { - YAML::Node config = YAML::Load(AARCH64_CONFIG); +void AArch64RegressionTest::checkGroup( + const char* source, const std::vector& expectedGroups) { + initialiseLLVM(); + std::string subtargetFeatures = getSubtargetFeaturesString(); + + RegressionTest::checkGroup(source, "aarch64", subtargetFeatures.c_str(), + expectedGroups); +} + +void AArch64RegressionTest::generateConfig() const { + // Re-generate the default config for the AArch64 ISA + simeng::config::SimInfo::generateDefault(simeng::config::ISA::AArch64, true); + + // Add the base additional AArch64 test suite config options + simeng::config::SimInfo::addToConfig(AARCH64_ADDITIONAL_CONFIG); + std::string mode; switch (std::get<0>(GetParam())) { case EMULATION: - config["Core"]["Simulation-Mode"] = "emulation"; + mode = "emulation"; break; case INORDER: - config["Core"]["Simulation-Mode"] = "inorderpipeline"; + mode = "inorderpipelined"; break; case OUTOFORDER: - config["Core"]["Simulation-Mode"] = "outoforder"; + mode = "outoforder"; break; } + simeng::config::SimInfo::addToConfig("{Core: {Simulation-Mode: " + mode + + "}}"); - YAML::Node additionalConfig = std::get<1>(GetParam()); - // Merge specific aarch64 config options - if (additionalConfig["Vector-Length"].IsDefined() && - !(additionalConfig["Vector-Length"].IsNull())) { - config["Core"]["Vector-Length"] = - additionalConfig["Vector-Length"].as(); - } else { - config["Core"]["Vector-Length"] = 512; - } - if (additionalConfig["Streaming-Vector-Length"].IsDefined() && - !(additionalConfig["Streaming-Vector-Length"].IsNull())) { - config["Core"]["Streaming-Vector-Length"] = - additionalConfig["Streaming-Vector-Length"].as(); - } else { - config["Core"]["Streaming-Vector-Length"] = 512; - } - if (additionalConfig["Micro-Operations"].IsDefined() && - !(additionalConfig["Micro-Operations"].IsNull())) { - config["Core"]["Micro-Operations"] = - additionalConfig["Micro-Operations"].as(); - } else { - config["Core"]["Micro-Operations"] = false; - } - return config; + // Add the test specific config options + simeng::config::SimInfo::addToConfig(std::get<1>(GetParam())); } std::unique_ptr -AArch64RegressionTest::createArchitecture(simeng::kernel::Linux& kernel, - YAML::Node config) const { - return std::make_unique(kernel, config); +AArch64RegressionTest::instantiateArchitecture( + simeng::kernel::Linux& kernel) const { + return std::make_unique(kernel); } std::unique_ptr -AArch64RegressionTest::createPortAllocator() const { - // TODO: this is currently tightly coupled to the number of execution units, - // which is specified in 
the out-of-order core model - const std::vector> portArrangement = { - {simeng::arch::aarch64::InstructionGroups::INT, - simeng::arch::aarch64::InstructionGroups::FP, - simeng::arch::aarch64::InstructionGroups::SVE, - simeng::arch::aarch64::InstructionGroups::PREDICATE, - simeng::arch::aarch64::InstructionGroups::LOAD, - simeng::arch::aarch64::InstructionGroups::STORE_ADDRESS, - simeng::arch::aarch64::InstructionGroups::STORE_DATA, - simeng::arch::aarch64::InstructionGroups::BRANCH, - simeng::arch::aarch64::InstructionGroups::SME}}; - +AArch64RegressionTest::createPortAllocator(ryml::ConstNodeRef config) const { + // Extract the port arrangement from the config file + std::vector> portArrangement( + config["Ports"].num_children()); + for (size_t i = 0; i < config["Ports"].num_children(); i++) { + auto config_groups = config["Ports"][i]["Instruction-Group-Support-Nums"]; + // Read groups in associated port + for (size_t j = 0; j < config_groups.num_children(); j++) { + portArrangement[i].push_back(config_groups[j].as()); + } + } return std::make_unique( portArrangement); } diff --git a/test/regression/aarch64/AArch64RegressionTest.hh b/test/regression/aarch64/AArch64RegressionTest.hh index c816ae91a2..32d975b09d 100644 --- a/test/regression/aarch64/AArch64RegressionTest.hh +++ b/test/regression/aarch64/AArch64RegressionTest.hh @@ -4,31 +4,40 @@ #include "simeng/arch/aarch64/Architecture.hh" #include "simeng/arch/aarch64/Instruction.hh" -#define AARCH64_CONFIG \ - ("{Core: {ISA: AArch64, Simulation-Mode: emulation, Clock-Frequency: 2.5, " \ - "Timer-Frequency: 100, Micro-Operations: False}, Fetch: " \ - "{Fetch-Block-Size: 32, Loop-Buffer-Size: 64, Loop-Detection-Threshold: " \ - "4}, Process-Image: {Heap-Size: 100000, Stack-Size: 100000}, " \ - "Register-Set: {GeneralPurpose-Count: 154, FloatingPoint/SVE-Count: 90, " \ - "Predicate-Count: 17, Conditional-Count: 128, Matrix-Count: 2}, " \ - "Pipeline-Widths: { Commit: 4, FrontEnd: 4, LSQ-Completion: 2}, " \ - "Queue-Sizes: {ROB: 180, Load: 64, Store: 36}, Branch-Predictor: " \ - "{BTB-Tag-Bits: 11, Saturating-Count-Bits: 2, Global-History-Length: 10, " \ - "RAS-entries: 5, Fallback-Static-Predictor: 2}, Data-Memory: " \ - "{Interface-Type: Flat}, Instruction-Memory: {Interface-Type: Flat}, " \ - "LSQ-L1-Interface: {Access-Latency: 4, Exclusive: False, Load-Bandwidth: " \ - "32, Store-Bandwidth: 16, Permitted-Requests-Per-Cycle: 2, " \ - "Permitted-Loads-Per-Cycle: 2, Permitted-Stores-Per-Cycle: 1}, Ports: " \ - "{'0': {Portname: Port 0, Instruction-Group-Support: [0, 14, 52, 66, 67, " \ - "70, 71, 72]}}, Reservation-Stations: {'0': {Size: 60, Dispatch-Rate: 4, " \ - "Ports: [0]}}, Execution-Units: {'0': {Pipelined: true}}}") +[[maybe_unused]] static const char* AARCH64_ADDITIONAL_CONFIG = R"YAML( +{ + Core: + { + Clock-Frequency-GHz: 2.5, + }, + Register-Set: + { + GeneralPurpose-Count: 154, + FloatingPoint/SVE-Count: 90, + Predicate-Count: 17, + Conditional-Count: 128, + SME-Matrix-Count: 2, + SME-Lookup-Table-Count: 8, + }, + L1-Data-Memory: + { + Interface-Type: Flat, + }, + L1-Instruction-Memory: + { + Interface-Type: Flat, + }, + Ports: + { + '0': { Portname: 0, Instruction-Group-Support: [INT, FP, SVE, PREDICATE, LOAD, STORE, BRANCH, SME] }, + }, +} +)YAML"; /** A helper function to convert the supplied parameters of * INSTANTIATE_TEST_SUITE_P into test name. 
*/ inline std::string paramToString( - const testing::TestParamInfo> val) { - YAML::Node config = YAML::Load(AARCH64_CONFIG); - + const testing::TestParamInfo> val) { // Get core type as string std::string coreString = ""; switch (std::get<0>(val.param)) { @@ -47,39 +56,47 @@ inline std::string paramToString( } // Get vector length as string std::string vectorLengthString = ""; - if (std::get<1>(val.param)["Vector-Length"].IsDefined() && - !(std::get<1>(val.param)["Vector-Length"].IsNull())) { - vectorLengthString = - "WithVL" + std::get<1>(val.param)["Vector-Length"].as(); - } else if (std::get<1>(val.param)["Streaming-Vector-Length"].IsDefined() && - !(std::get<1>(val.param)["Streaming-Vector-Length"].IsNull())) { - vectorLengthString = - "WithSVL" + - std::get<1>(val.param)["Streaming-Vector-Length"].as(); + // Temporarily construct a ryml::Tree to extract config options as strings + ryml::Tree tempTree = + ryml::parse_in_arena(ryml::to_csubstr(std::get<1>(val.param))); + if (tempTree.rootref().has_child("Core")) { + if (tempTree.rootref()["Core"].has_child("Vector-Length")) { + vectorLengthString += + "WithVL" + tempTree["Core"]["Vector-Length"].as(); + } + if (tempTree.rootref()["Core"].has_child("Streaming-Vector-Length")) { + vectorLengthString += + "WithSVL" + + tempTree["Core"]["Streaming-Vector-Length"].as(); + } } return coreString + vectorLengthString; } /** A helper function to generate all coreType vector-length pairs. */ -inline std::vector> genCoreTypeVLPairs( +inline std::vector> genCoreTypeVLPairs( CoreType type) { - std::vector> coreVLPairs; + std::vector> coreVLPairs; for (uint64_t i = 128; i <= 2048; i += 128) { - YAML::Node vlNode; - vlNode["Vector-Length"] = i; - coreVLPairs.push_back(std::make_tuple(type, vlNode)); + coreVLPairs.push_back(std::make_tuple( + type, + "{Core: {Vector-Length: " + std::to_string(i) + + "}, LSQ-L1-Interface: {Load-Bandwidth: " + std::to_string(i / 8) + + ", Store-Bandwidth: " + std::to_string(i / 8) + "}}")); } return coreVLPairs; } /** A helper function to generate all coreType streaming-vector-length pairs. */ -inline std::vector> genCoreTypeSVLPairs( +inline std::vector> genCoreTypeSVLPairs( CoreType type) { - std::vector> coreSVLPairs; - for (uint64_t i = 128; i <= 2048; i += 128) { - YAML::Node svlNode; - svlNode["Streaming-Vector-Length"] = i; - coreSVLPairs.push_back(std::make_tuple(type, svlNode)); + std::vector> coreSVLPairs; + for (uint64_t i = 128; i <= 2048; i *= 2) { + coreSVLPairs.push_back(std::make_tuple( + type, + "{Core: {Streaming-Vector-Length: " + std::to_string(i) + + "}, LSQ-L1-Interface: {Load-Bandwidth: " + std::to_string(i / 8) + + ", Store-Bandwidth: " + std::to_string(i / 8) + "}}")); } return coreSVLPairs; } @@ -87,7 +104,8 @@ inline std::vector> genCoreTypeSVLPairs( /** A helper macro to run a snippet of Armv9.2-a assembly code, returning from * the calling function if a fatal error occurs. Four bytes containing zeros are * appended to the source to ensure that the program will terminate with an - * illegal instruction exception instead of running into the heap. */ + * unallocated instruction encoding exception instead of running into the heap. + */ #define RUN_AARCH64(source) \ { \ std::string sourceWithTerminator = source; \ @@ -145,7 +163,7 @@ inline std::vector> genCoreTypeSVLPairs( * For example: * * // Compare za1h.s[0] to some expected 32-bit floating point values. 
- * CHECK_MAT_ROW(ARM64_REG_ZAS1, 0, float, {123.456f, 0.f, 42.f, -1.f}); + * CHECK_MAT_ROW(AARCH64_REG_ZAS1, 0, float, {123.456f, 0.f, 42.f, -1.f}); */ #define CHECK_MAT_ROW(tag, index, type, ...) \ { \ @@ -164,7 +182,7 @@ inline std::vector> genCoreTypeSVLPairs( * For example: * * // Compare za1v.s[0] to some expected 32-bit floating point values. - * CHECK_MAT_COL(ARM64_REG_ZAS1, 0, float, {123.456f, 0.f, 42.f, -1.f}); + * CHECK_MAT_COL(AARCH64_REG_ZAS1, 0, float, {123.456f, 0.f, 42.f, -1.f}); */ #define CHECK_MAT_COL(tag, index, type, ...) \ { \ @@ -172,6 +190,21 @@ inline std::vector> genCoreTypeSVLPairs( checkMatrixRegisterCol(tag, index, __VA_ARGS__); \ } +/** A helper macro to predecode the first instruction in a snippet of Armv9.2-a + * assembly code and check the assigned group(s) for each micro-op matches the + * expected group(s). Returns from the calling function if a fatal error occurs. + * Four bytes containing zeros are appended to the source to ensure that the + * program will terminate with an unallocated instruction encoding exception + * instead of running into the heap. + */ +#define EXPECT_GROUP(source, ...) \ + { \ + std::string sourceWithTerminator = source; \ + sourceWithTerminator += "\n.word 0"; \ + checkGroup(sourceWithTerminator.c_str(), {__VA_ARGS__}); \ + } \ + if (HasFatalFailure()) return + /** The test fixture for all AArch64 regression tests. */ class AArch64RegressionTest : public RegressionTest { protected: @@ -180,16 +213,40 @@ class AArch64RegressionTest : public RegressionTest { /** Run the assembly code in `source`. */ void run(const char* source); + /** Run the first instruction in source through predecode and check the + * groups. */ + void checkGroup(const char* source, + const std::vector& expectedGroups); + /** Generate a default YAML-formatted configuration. */ - YAML::Node generateConfig() const override; + void generateConfig() const override; - /** Create an ISA instance from a kernel. */ - virtual std::unique_ptr createArchitecture( - simeng::kernel::Linux& kernel, YAML::Node config) const override; + /** Instantiate an ISA specific architecture from a kernel. */ + virtual std::unique_ptr instantiateArchitecture( + simeng::kernel::Linux& kernel) const override; /** Create a port allocator for an out-of-order core model. */ - virtual std::unique_ptr createPortAllocator() - const override; + virtual std::unique_ptr createPortAllocator( + ryml::ConstNodeRef config = + simeng::config::SimInfo::getConfig()) const override; + + /** Initialise LLVM */ + void initialiseLLVM() { + LLVMInitializeAArch64TargetInfo(); + LLVMInitializeAArch64TargetMC(); + LLVMInitializeAArch64AsmParser(); + } + + /** Get the subtarget feature string based on LLVM version being used */ + std::string getSubtargetFeaturesString() { +#if SIMENG_LLVM_VERSION < 14 + return "+sve,+lse"; +#elif SIMENG_LLVM_VERSION < 18 + return "+sve,+lse,+sve2,+sme,+sme-f64"; +#else + return "+sve,+lse,+sve2,+sme,+sme-f64f64,+sme-i16i64,+sme2"; +#endif + } /** Check the elements of a Neon register. 
* @@ -235,22 +292,22 @@ class AArch64RegressionTest : public RegressionTest { // Get matrix row register tag uint8_t base = 0; uint8_t tileTypeCount = 0; - if (tag == ARM64_REG_ZA || tag == ARM64_REG_ZAB0) { + if (tag == AARCH64_REG_ZA || tag == AARCH64_REG_ZAB0) { // Treat ZA as byte tile : ZAB0 represents whole matrix, only 1 tile // Add all rows for this SVL // Don't need to set base as will always be 0 tileTypeCount = 1; - } else if (tag >= ARM64_REG_ZAH0 && tag <= ARM64_REG_ZAH1) { - base = tag - ARM64_REG_ZAH0; + } else if (tag >= AARCH64_REG_ZAH0 && tag <= AARCH64_REG_ZAH1) { + base = tag - AARCH64_REG_ZAH0; tileTypeCount = 2; - } else if (tag >= ARM64_REG_ZAS0 && tag <= ARM64_REG_ZAS3) { - base = tag - ARM64_REG_ZAS0; + } else if (tag >= AARCH64_REG_ZAS0 && tag <= AARCH64_REG_ZAS3) { + base = tag - AARCH64_REG_ZAS0; tileTypeCount = 4; - } else if (tag >= ARM64_REG_ZAD0 && tag <= ARM64_REG_ZAD7) { - base = tag - ARM64_REG_ZAD0; + } else if (tag >= AARCH64_REG_ZAD0 && tag <= AARCH64_REG_ZAD7) { + base = tag - AARCH64_REG_ZAD0; tileTypeCount = 8; - } else if (tag >= ARM64_REG_ZAQ0 && tag <= ARM64_REG_ZAQ15) { - base = tag - ARM64_REG_ZAQ0; + } else if (tag >= AARCH64_REG_ZAQ0 && tag <= AARCH64_REG_ZAQ15) { + base = tag - AARCH64_REG_ZAQ0; tileTypeCount = 16; } uint16_t reg_tag = base + (index * tileTypeCount); @@ -274,22 +331,22 @@ class AArch64RegressionTest : public RegressionTest { // Get matrix row register tag uint8_t base = 0; uint8_t tileTypeCount = 0; - if (tag == ARM64_REG_ZA || tag == ARM64_REG_ZAB0) { + if (tag == AARCH64_REG_ZA || tag == AARCH64_REG_ZAB0) { // Treat ZA as byte tile : ZAB0 represents whole matrix, only 1 tile // Add all rows for this SVL // Don't need to set base as will always be 0 tileTypeCount = 1; - } else if (tag >= ARM64_REG_ZAH0 && tag <= ARM64_REG_ZAH1) { - base = tag - ARM64_REG_ZAH0; + } else if (tag >= AARCH64_REG_ZAH0 && tag <= AARCH64_REG_ZAH1) { + base = tag - AARCH64_REG_ZAH0; tileTypeCount = 2; - } else if (tag >= ARM64_REG_ZAS0 && tag <= ARM64_REG_ZAS3) { - base = tag - ARM64_REG_ZAS0; + } else if (tag >= AARCH64_REG_ZAS0 && tag <= AARCH64_REG_ZAS3) { + base = tag - AARCH64_REG_ZAS0; tileTypeCount = 4; - } else if (tag >= ARM64_REG_ZAD0 && tag <= ARM64_REG_ZAD7) { - base = tag - ARM64_REG_ZAD0; + } else if (tag >= AARCH64_REG_ZAD0 && tag <= AARCH64_REG_ZAD7) { + base = tag - AARCH64_REG_ZAD0; tileTypeCount = 8; - } else if (tag >= ARM64_REG_ZAQ0 && tag <= ARM64_REG_ZAQ15) { - base = tag - ARM64_REG_ZAQ0; + } else if (tag >= AARCH64_REG_ZAQ0 && tag <= AARCH64_REG_ZAQ15) { + base = tag - AARCH64_REG_ZAQ0; tileTypeCount = 16; } @@ -349,13 +406,13 @@ class AArch64RegressionTest : public RegressionTest { /** Generate an array representing a NEON register from a source vector and a * number of elements defined by a number of bytes used. 
*/ template - std::array fillNeon(std::vector src, - int num_bytes) const { + std::array fillNeon(const std::vector& src, + uint32_t num_bytes) const { // Create array to be returned and fill with a default value of 0 std::array generatedArray; generatedArray.fill(0); // Fill array by cycling through source elements - for (int i = 0; i < (num_bytes / sizeof(T)); i++) { + for (size_t i = 0; i < (num_bytes / sizeof(T)); i++) { generatedArray[i] = src[i % src.size()]; } return generatedArray; @@ -388,7 +445,7 @@ class AArch64RegressionTest : public RegressionTest { std::array generatedArray; generatedArray.fill(0); // Fill array by adding an increasing offset value to the base value - for (int i = 0; i < (num_bytes / sizeof(T)); i++) { + for (size_t i = 0; i < (num_bytes / sizeof(T)); i++) { generatedArray[i] = base + (i * offset); } return generatedArray; @@ -471,17 +528,37 @@ class AArch64RegressionTest : public RegressionTest { return generatedArray; } + /** A function to get the current vector length from the test config string if + * present (defaults to 0). */ + uint64_t getVL() { + uint64_t VL = 0; + // Temporarily construct a ryml::Tree to extract the VL + ryml::Tree tempTree = + ryml::parse_in_arena(ryml::to_csubstr(std::get<1>(GetParam()))); + if (tempTree.rootref().has_child("Core") && + tempTree.rootref()["Core"].has_child("Vector-Length")) { + VL = tempTree["Core"]["Vector-Length"].as(); + } + return VL; + } + + /** A function to get the current streaming vector length from the test config + * string if present (defaults to 0). */ + uint64_t getSVL() { + uint64_t SVL = 0; + // Temporarily construct a ryml::Tree to extract the SVL + ryml::Tree tempTree = + ryml::parse_in_arena(ryml::to_csubstr(std::get<1>(GetParam()))); + if (tempTree.rootref().has_child("Core") && + tempTree.rootref()["Core"].has_child("Streaming-Vector-Length")) { + SVL = tempTree["Core"]["Streaming-Vector-Length"].as(); + } + return SVL; + } + /** The current vector-length being used by the test suite. */ - const uint64_t VL = - (std::get<1>(GetParam())["Vector-Length"].IsDefined() && - !(std::get<1>(GetParam())["Vector-Length"].IsNull())) - ? std::get<1>(GetParam())["Vector-Length"].as() - : 0; + const uint64_t VL = getVL(); /** The current streaming-vector-length being used by the test suite. */ - const uint64_t SVL = - (std::get<1>(GetParam())["Streaming-Vector-Length"].IsDefined() && - !(std::get<1>(GetParam())["Streaming-Vector-Length"].IsNull())) - ? 
std::get<1>(GetParam())["Streaming-Vector-Length"].as() - : 0; -}; \ No newline at end of file + const uint64_t SVL = getSVL(); +}; diff --git a/test/regression/aarch64/CMakeLists.txt b/test/regression/aarch64/CMakeLists.txt index fb5e499ce3..5746e78495 100644 --- a/test/regression/aarch64/CMakeLists.txt +++ b/test/regression/aarch64/CMakeLists.txt @@ -22,12 +22,19 @@ add_executable(regression-aarch64 instructions/store.cc instructions/sve.cc ) + +configure_file(${capstone_SOURCE_DIR}/arch/AArch64/AArch64GenInstrInfo.inc AArch64GenInstrInfo.inc COPYONLY) + target_include_directories(regression-aarch64 PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}) + ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(regression-aarch64 PRIVATE + ${CMAKE_CURRENT_BINARY_DIR}) + target_link_libraries(regression-aarch64 regression-test-base) +target_compile_options(regression-aarch64 PRIVATE ${SIMENG_COMPILE_OPTIONS}) # Define a macro so that tests can find data files target_compile_definitions(regression-aarch64 PRIVATE "SIMENG_AARCH64_TEST_ROOT=\"${CMAKE_CURRENT_SOURCE_DIR}\"") -add_test(NAME regression-aarch64-test COMMAND regression-aarch64) +add_test(NAME regression-aarch64-test COMMAND regression-aarch64) \ No newline at end of file diff --git a/test/regression/aarch64/Exception.cc b/test/regression/aarch64/Exception.cc index 8bac46bdd1..b987ae4429 100644 --- a/test/regression/aarch64/Exception.cc +++ b/test/regression/aarch64/Exception.cc @@ -7,6 +7,63 @@ namespace { using Exception = AArch64RegressionTest; +/** AArch64 opcodes. Each opcode represents a unique AArch64 operation. */ +namespace Opcode { +#define GET_INSTRINFO_ENUM +#include "AArch64GenInstrInfo.inc" +} // namespace Opcode + +// Test that an invalid capstone instruction id raises an unallocated +// instruction encoding exception +TEST_P(Exception, encoding_unallocated) { + // Initialise heap with an unallocated instruction encoding + initialHeapData_.resize(4); + uint32_t* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 0xDEADBEEF; + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + mov x20, x0 + + br x20 + )"); + const char err[] = + "\n[SimEng:ExceptionHandler] Encountered unallocated instruction " + "encoding exception"; + EXPECT_EQ(stdout_.substr(0, strlen(err)), err); +} + +// Test that an instruction with no implemented execution logic raises a +// not-yet-implemented exception +TEST_P(Exception, not_yet_implemented) { + // Initialise heap with an instruction with no execution logic, namely with + // capstone undefined AArch64 opcode AArch64_UDF + initialHeapData_.resize(4); + uint32_t* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 0x0; + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + mov x20, x0 + + br x20 + )"); + const char err[] = + "\n[SimEng:ExceptionHandler] Encountered execution not-yet-implemented " + "exception"; + EXPECT_EQ(stdout_.substr(0, strlen(err)), err); +} + +// Test for InstructionException::AliasNotYetImplemented omitted. Obtaining an +// instruction encoding that will consistently trigger an AliasNotYetImplemented +// exception is not feasible due to the continual updates to our alias reversion +// support and the difficulty of generating the bytes for an instruction alias +// not yet supported. + // Test that branching to an address that is misaligned raises an exception. 
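+// (AArch64 instructions are 4 bytes wide and the program counter must stay 4-byte aligned, so branching to an address that is not a multiple of 4 should trigger this exception.)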
TEST_P(Exception, misaligned_pc) { RUN_AARCH64(R"( @@ -16,10 +73,92 @@ TEST_P(Exception, misaligned_pc) { const char err[] = "\n[SimEng:ExceptionHandler] Encountered misaligned program counter " "exception"; - EXPECT_EQ(stdout_.substr(0, sizeof(err) - 1), err); + EXPECT_EQ(stdout_.substr(0, strlen(err)), err); +} + +// Test that trying to load data from an address outside the bounds of the +// process image raises a data abort exception +TEST_P(Exception, data_abort) { + RUN_AARCH64(R"( + mov x0, #10000 + mul x0, x0, x0 + ldr x1, [x0] + )"); + const char err[] = + "\n[SimEng:ExceptionHandler] Encountered data abort exception"; + EXPECT_EQ(stdout_.substr(0, strlen(err)), err); +} + +// Test that an unsupported SVC call raises an exception +TEST_P(Exception, unsupported_svc) { + RUN_AARCH64(R"( + mov x8, #3 + svc #0 + )"); + + // EQ comparison on the full exception output to ensure the correct system + // call ID of 3 is printed + int svcOpcodeId = Opcode::AArch64_SVC; + std::string err = + std::string( + "\n[SimEng:ExceptionHandler] Encountered supervisor call " + "exception\n[SimEng:ExceptionHandler] Generated by " + "instruction:\n[SimEng:ExceptionHandler] 0x0000000000000004: 01 " + "00 00 d4 svc #0\n[SimEng:ExceptionHandler] opcode ID: ") + + std::to_string(svcOpcodeId) + + std::string("\n\n[SimEng:ExceptionHandler] Unrecognised syscall: 3"); + EXPECT_EQ(stdout_.substr(0, err.size()), err.c_str()); +} + +// TODO: Write test for InstructionException::HypervisorCall once it has a +// trigger case +// TODO: Write test for InstructionException::SecureMonitorCall once it has a +// trigger case + +// Test that trying to process an instruction with no supporting issue port +// raises a no available port exception +TEST_P(Exception, no_available_port) { + RUN_AARCH64(R"( + fmov d0, #3 + )"); + std::string err; + // Exception raised on outoforder core archetype only + if (std::get<0>(GetParam()) == OUTOFORDER) { + err = + "\n[SimEng:ExceptionHandler] Encountered unsupported execution " + "port exception"; + } else { + // Placeholder string for non-outoforder core to be replaced when + // appropriate. Ensures changes to this test case won't be forgotten if + // updates to other core archetypes are carried out such that they can now + // raise an InstructionException::NoAvailablePort exception + err = + "\n[SimEng:ExceptionHandler] Encountered execution not-yet-implemented " + "exception"; + } + EXPECT_EQ(stdout_.substr(0, err.size()), err.c_str()); +} + +// Test that utilising an unsupported system register raises a unmapped system +// register exception +TEST_P(Exception, unmapped_sys_reg) { + RUN_AARCH64(R"( + mrs x0, CPTR_EL2 + )"); + const char err[] = + "\n[SimEng:ExceptionHandler] Encountered unmapped system register " + "exception"; + EXPECT_EQ(stdout_.substr(0, strlen(err)), err); } #if SIMENG_LLVM_VERSION >= 14 +// TODO: Write test for InstructionException::StreamingModeUpdate once it has a +// trigger case +// TODO: Write test for InstructionException::ZAregisterStatusUpdate once it has +// a trigger case +// TODO: Write test for InstructionException::SMZAUpdate once it has a trigger +// case + // Test that performing an SME instruction in the wrong context mode raises an // exception. 
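+// (Most SME operations are only valid once streaming SVE mode and/or the ZA array have been enabled via smstart/SVCR, so issuing them outside that context should be reported as an exception.)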
TEST_P(Exception, SME_context_modes) { @@ -30,7 +169,7 @@ TEST_P(Exception, SME_context_modes) { const char err0[] = "\n[SimEng:ExceptionHandler] Encountered SME execution attempt when " "streaming mode disabled"; - EXPECT_EQ(stdout_.substr(0, sizeof(err0) - 1), err0); + EXPECT_EQ(stdout_.substr(0, strlen(err0)), err0); RUN_AARCH64(R"( smstart sm @@ -39,7 +178,7 @@ TEST_P(Exception, SME_context_modes) { const char err1[] = "\n[SimEng:ExceptionHandler] Encountered ZA register access attempt when " "disabled"; - EXPECT_EQ(stdout_.substr(0, sizeof(err1) - 1), err1); + EXPECT_EQ(stdout_.substr(0, strlen(err1)), err1); RUN_AARCH64(R"( smstart sm @@ -48,7 +187,50 @@ TEST_P(Exception, SME_context_modes) { const char err2[] = "\n[SimEng:ExceptionHandler] Encountered ZA register access attempt when " "disabled"; - EXPECT_EQ(stdout_.substr(0, sizeof(err2) - 1), err2); + EXPECT_EQ(stdout_.substr(0, strlen(err2)), err2); +} + +// Ensure that calling smstart/smstop such that the values in SVCR.SMZA do not +// change doesn't cause a flush of the associated register files +TEST_P(Exception, Null_Smstart_smstop_calls) { + RUN_AARCH64(R"( + smstart + dup z0.d, #3 + smstart + )"); + CHECK_NEON(0, uint64_t, fillNeon({3}, SVL / 8)); + + RUN_AARCH64(R"( + smstart + dup z0.d, #4 + smstart sm + )"); + CHECK_NEON(0, uint64_t, fillNeon({4}, SVL / 8)); + + RUN_AARCH64(R"( + smstart + dup z0.d, #5 + smstart za + )"); + CHECK_NEON(0, uint64_t, fillNeon({5}, SVL / 8)); + + RUN_AARCH64(R"( + dup z0.d, #6 + smstop + )"); + CHECK_NEON(0, uint64_t, fillNeon({6}, VL / 8)); + + RUN_AARCH64(R"( + dup z0.d, #7 + smstop sm + )"); + CHECK_NEON(0, uint64_t, fillNeon({7}, VL / 8)); + + RUN_AARCH64(R"( + dup z0.d, #8 + smstop za + )"); + CHECK_NEON(0, uint64_t, fillNeon({8}, VL / 8)); } TEST_P(Exception, svcr) { @@ -76,6 +258,13 @@ TEST_P(Exception, svcr) { )"); CHECK_NEON(0, uint64_t, fillNeon({0}, VL / 8)); + RUN_AARCH64(R"( + # Ensure z regs get enabled when SM enabled + smstart sm + dup z0.d, #3 + )"); + CHECK_NEON(0, uint64_t, fillNeon({3}, SVL / 8)); + RUN_AARCH64(R"( # Ensure z regs get zeroed out when SM disabled smstart @@ -84,6 +273,14 @@ TEST_P(Exception, svcr) { )"); CHECK_NEON(0, uint64_t, fillNeon({0}, VL / 8)); + RUN_AARCH64(R"( + # Ensure z regs do not get zeroed out when ZA is disabled + smstart + dup z0.d, #3 + smstop za + )"); + CHECK_NEON(0, uint64_t, fillNeon({3}, SVL / 8)); + RUN_AARCH64(R"( # Ensure za reg gets zeroed out when ZA enabled smstart @@ -95,8 +292,83 @@ TEST_P(Exception, svcr) { smstop smstart )"); - for (int i = 0; i < (SVL / 8); i++) { - CHECK_MAT_ROW(ARM64_REG_ZA, i, uint32_t, fillNeon({0}, SVL / 8)); + for (uint64_t i = 0; i < (SVL / 8); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({0}, SVL / 8)); + } + + // Check that changes to SVCR using msr svcr, xn work correctly + RUN_AARCH64(R"( + mov x4, #3 + mov x5, #0 + # Ensure vector length changes from SVE's to SME's + cntb x0 + msr svcr, x4 + cntb x1 + msr svcr, x5 + cntb x2 + )"); + EXPECT_EQ(getGeneralRegister(0), VL / 8); + EXPECT_EQ(getGeneralRegister(1), SVL / 8); + EXPECT_EQ(getGeneralRegister(2), VL / 8); + EXPECT_EQ(getGeneralRegister(0), getGeneralRegister(2)); + EXPECT_GT(getGeneralRegister(1), getGeneralRegister(0)); + EXPECT_GT(SVL, VL); + + RUN_AARCH64(R"( + mov x4, #3 + # Ensure z regs get zeroed out when SM enabled + dup z0.d, #3 + msr svcr, x4 + )"); + CHECK_NEON(0, uint64_t, fillNeon({0}, VL / 8)); + + RUN_AARCH64(R"( + mov x4, #1 + # Ensure z regs get enabled when SM enabled + msr svcr, x4 + dup z0.d, #3 + 
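+    # Note: SVCR bit 0 is SM (streaming mode) and bit 1 is ZA, so #1 enables streaming mode only while #3 enables both SM and ZA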
)"); + CHECK_NEON(0, uint64_t, fillNeon({3}, SVL / 8)); + + RUN_AARCH64(R"( + mov x4, #3 + mov x5, #0 + # Ensure z regs get zeroed out when SM disabled + msr svcr, x4 + dup z0.d, #3 + msr svcr, x5 + )"); + CHECK_NEON(0, uint64_t, fillNeon({0}, VL / 8)); + + RUN_AARCH64(R"( + # enable SM and ZA + mov x4, #3 + # just disable ZA + mov x5, #1 + # Ensure z regs do not get zeroed out when ZA is disabled + msr svcr, x4 + dup z0.d, #3 + msr svcr, x5 + )"); + CHECK_NEON(0, uint64_t, fillNeon({3}, SVL / 8)); + + RUN_AARCH64(R"( + mov x4, #3 + mov x5, #0 + # Ensure za reg gets zeroed out when ZA enabled + msr svcr, x4 + dup z0.s, #2 + dup z1.s, #3 + ptrue p0.s + ptrue p1.s + fmopa za0.s, p0/m, p1/m, z0.s, z1.s + msr svcr, x5 + msr svcr, x4 + )"); + for (uint64_t i = 0; i < (SVL / 8); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({0}, SVL / 8)); } } #endif @@ -106,13 +378,18 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( std::make_tuple( EMULATION, - YAML::Load("{Vector-Length: 512, Streaming-Vector-Length: 1024}")), + "{Core: {Vector-Length: 512, Streaming-Vector-Length: 1024}, " + "LSQ-L1-Interface: {Load-Bandwidth: 256, Store-Bandwidth: 256}}"), std::make_tuple( INORDER, - YAML::Load("{Vector-Length: 512, Streaming-Vector-Length: 1024}")), + "{Core: {Vector-Length: 512, Streaming-Vector-Length: 1024}, " + "LSQ-L1-Interface: {Load-Bandwidth: 256, Store-Bandwidth: 256}}"), std::make_tuple( OUTOFORDER, - YAML::Load("{Vector-Length: 512, Streaming-Vector-Length: 1024}"))), + "{Core: {Vector-Length: 512, Streaming-Vector-Length: 1024}, " + "LSQ-L1-Interface: {Load-Bandwidth: 256, Store-Bandwidth: 256}, " + "Ports: {'0': {Portname: 0, Instruction-Group-Support: [INT, SVE, " + "PREDICATE, LOAD, STORE, BRANCH, SME]}}}")), paramToString); } // namespace diff --git a/test/regression/aarch64/LoadStoreQueue.cc b/test/regression/aarch64/LoadStoreQueue.cc index 1d7e317dfc..0250b48e24 100644 --- a/test/regression/aarch64/LoadStoreQueue.cc +++ b/test/regression/aarch64/LoadStoreQueue.cc @@ -23,7 +23,7 @@ TEST_P(LoadStoreQueue, RAW) { EXPECT_EQ(getGeneralRegister(2), 42u); } -// Test multiple simulteneous RAW violations are flushed correctly. +// Test multiple simultaneous RAW violations are flushed correctly. 
TEST_P(LoadStoreQueue, RAWx2) { initialHeapData_.resize(8); reinterpret_cast(initialHeapData_.data())[0] = -1; @@ -91,8 +91,7 @@ TEST_P(LoadStoreQueue, SpeculativeInvalidLoad) { } INSTANTIATE_TEST_SUITE_P(AArch64, LoadStoreQueue, - ::testing::Values(std::make_tuple(OUTOFORDER, - YAML::Load("{}"))), + ::testing::Values(std::make_tuple(OUTOFORDER, "{}")), paramToString); } // namespace diff --git a/test/regression/aarch64/MicroOperation.cc b/test/regression/aarch64/MicroOperation.cc index 86b576e2da..ceca4744e0 100644 --- a/test/regression/aarch64/MicroOperation.cc +++ b/test/regression/aarch64/MicroOperation.cc @@ -10,6 +10,260 @@ namespace { using MicroOp = AArch64RegressionTest; +using namespace simeng::arch::aarch64; + +TEST_P(MicroOp, ld1Two) { + initialHeapData_.resize(32); + uint64_t* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 0x66554433221100FF; + heap[1] = 0xEEDDCCBBAA998877; + heap[2] = 0x66554433221100FF; + heap[3] = 0xEEDDCCBBAA998877; + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ld1 {v0.16b, v1.16b}, [x0] + ld1 {v2.8b, v3.8b}, [x0] + ld1 {v4.8h, v5.8h}, [x0] + ld1 {v6.4h, v7.4h}, [x0] + ld1 {v8.4s, v9.4s}, [x0] + ld1 {v10.2s, v11.2s}, [x0] + ld1 {v12.2d, v13.2d}, [x0] + ld1 {v14.1d, v15.1d}, [x0] + )"); + CHECK_NEON(0, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(1, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(2, uint8_t, {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66}); + CHECK_NEON(3, uint8_t, {0x77, 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + + CHECK_NEON(4, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(5, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(6, uint16_t, {0x00FF, 0x2211, 0x4433, 0x6655}); + CHECK_NEON(7, uint16_t, {0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + + CHECK_NEON(8, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(9, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(10, uint32_t, {0x221100FF, 0x66554433}); + CHECK_NEON(11, uint32_t, {0xAA998877, 0xEEDDCCBB}); + + CHECK_NEON(12, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(13, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(14, uint64_t, {0x66554433221100FF}); + CHECK_NEON(15, uint64_t, {0xEEDDCCBBAA998877}); +} + +TEST_P(MicroOp, ld1TwoPost) { + initialHeapData_.resize(192); + uint64_t* heap = reinterpret_cast(initialHeapData_.data()); + bool aORb = true; + uint64_t valueA = 0x66554433221100FF; + uint64_t valueB = 0xEEDDCCBBAA998877; + for (int i = 0; i < 24; i++) { + heap[i] = aORb ? 
valueA : valueB; + aORb = !aORb; + } + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #32 + mov x2, #16 + + ld1 {v0.16b, v1.16b}, [x0], #32 + ld1 {v2.8b, v3.8b}, [x0], #16 + ld1 {v4.8h, v5.8h}, [x0], x1 + ld1 {v6.4h, v7.4h}, [x0], x2 + ld1 {v8.4s, v9.4s}, [x0], #32 + ld1 {v10.2s, v11.2s}, [x0], #16 + ld1 {v12.2d, v13.2d}, [x0], x1 + ld1 {v14.1d, v15.1d}, [x0], x2 + )"); + CHECK_NEON(0, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(1, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(2, uint8_t, {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66}); + CHECK_NEON(3, uint8_t, {0x77, 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + + CHECK_NEON(4, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(5, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(6, uint16_t, {0x00FF, 0x2211, 0x4433, 0x6655}); + CHECK_NEON(7, uint16_t, {0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + + CHECK_NEON(8, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(9, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(10, uint32_t, {0x221100FF, 0x66554433}); + CHECK_NEON(11, uint32_t, {0xAA998877, 0xEEDDCCBB}); + + CHECK_NEON(12, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(13, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(14, uint64_t, {0x66554433221100FF}); + CHECK_NEON(15, uint64_t, {0xEEDDCCBBAA998877}); +} + +TEST_P(MicroOp, ld1Four) { + initialHeapData_.resize(64); + uint64_t* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 0x66554433221100FF; + heap[1] = 0xEEDDCCBBAA998877; + heap[2] = 0x66554433221100FF; + heap[3] = 0xEEDDCCBBAA998877; + heap[4] = 0x66554433221100FF; + heap[5] = 0xEEDDCCBBAA998877; + heap[6] = 0x66554433221100FF; + heap[7] = 0xEEDDCCBBAA998877; + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] + ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [x0] + ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0] + ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [x0] + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] + ld1 {v20.2s, v21.2s, v22.2s, v23.2s}, [x0] + ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x0] + ld1 {v28.1d, v29.1d, v30.1d, v31.1d}, [x0] + )"); + CHECK_NEON(0, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(1, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(2, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(3, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(4, uint8_t, {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66}); + CHECK_NEON(5, uint8_t, {0x77, 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(6, uint8_t, {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66}); + CHECK_NEON(7, uint8_t, {0x77, 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + + CHECK_NEON(8, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(9, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(10, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 
0xEEDD}); + CHECK_NEON(11, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(12, uint16_t, {0x00FF, 0x2211, 0x4433, 0x6655}); + CHECK_NEON(13, uint16_t, {0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(14, uint16_t, {0x00FF, 0x2211, 0x4433, 0x6655}); + CHECK_NEON(15, uint16_t, {0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + + CHECK_NEON(16, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(17, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(18, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(19, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(20, uint32_t, {0x221100FF, 0x66554433}); + CHECK_NEON(21, uint32_t, {0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(22, uint32_t, {0x221100FF, 0x66554433}); + CHECK_NEON(23, uint32_t, {0xAA998877, 0xEEDDCCBB}); + + CHECK_NEON(24, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(25, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(26, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(27, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(28, uint64_t, {0x66554433221100FF}); + CHECK_NEON(29, uint64_t, {0xEEDDCCBBAA998877}); + CHECK_NEON(30, uint64_t, {0x66554433221100FF}); + CHECK_NEON(31, uint64_t, {0xEEDDCCBBAA998877}); +} + +TEST_P(MicroOp, ld1FourPost) { + initialHeapData_.resize(384); + uint64_t* heap = reinterpret_cast(initialHeapData_.data()); + bool aORb = true; + uint64_t valueA = 0x66554433221100FF; + uint64_t valueB = 0xEEDDCCBBAA998877; + for (int i = 0; i < 48; i++) { + heap[i] = aORb ? valueA : valueB; + aORb = !aORb; + } + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] + ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [x0] + ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0] + ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [x0] + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] + ld1 {v20.2s, v21.2s, v22.2s, v23.2s}, [x0] + ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x0] + ld1 {v28.1d, v29.1d, v30.1d, v31.1d}, [x0] + )"); + CHECK_NEON(0, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(1, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(2, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(3, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(4, uint8_t, {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66}); + CHECK_NEON(5, uint8_t, {0x77, 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(6, uint8_t, {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66}); + CHECK_NEON(7, uint8_t, {0x77, 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + + CHECK_NEON(8, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(9, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(10, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(11, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(12, uint16_t, {0x00FF, 0x2211, 0x4433, 0x6655}); + CHECK_NEON(13, uint16_t, {0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(14, uint16_t, {0x00FF, 0x2211, 0x4433, 0x6655}); + CHECK_NEON(15, uint16_t, {0x8877, 0xAA99, 
0xCCBB, 0xEEDD}); + + CHECK_NEON(16, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(17, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(18, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(19, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(20, uint32_t, {0x221100FF, 0x66554433}); + CHECK_NEON(21, uint32_t, {0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(22, uint32_t, {0x221100FF, 0x66554433}); + CHECK_NEON(23, uint32_t, {0xAA998877, 0xEEDDCCBB}); + + CHECK_NEON(24, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(25, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(26, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(27, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(28, uint64_t, {0x66554433221100FF}); + CHECK_NEON(29, uint64_t, {0xEEDDCCBBAA998877}); + CHECK_NEON(30, uint64_t, {0x66554433221100FF}); + CHECK_NEON(31, uint64_t, {0xEEDDCCBBAA998877}); +} TEST_P(MicroOp, loadPairD) { initialHeapData_.resize(48); @@ -39,6 +293,10 @@ TEST_P(MicroOp, loadPairD) { CHECK_NEON(6, double, {-3.0}); CHECK_NEON(7, double, {1.0}); CHECK_NEON(8, double, {-1.0}); + + EXPECT_GROUP(R"(ldp d1, d2, [x0], #16)", InstructionGroups::LOAD_SCALAR, + InstructionGroups::LOAD_SCALAR, + InstructionGroups::INT_SIMPLE_ARTH_NOSHIFT); } TEST_P(MicroOp, loadPairQ) { @@ -394,14 +652,22 @@ TEST_P(MicroOp, storePairD) { stp d4, d5, [sp, #16] stp d6, d7, [sp, #-16]! )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1024), -5.0); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1016), -3.5); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1008), 3.5); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1000), 5.0); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 992), -1.5); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 984), -0.5); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 976), 0.5); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 968), 1.5); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1024), + -5.0); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1016), + -3.5); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1008), + 3.5); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1000), + 5.0); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 992), + -1.5); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 984), + -0.5); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 976), + 0.5); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 968), + 1.5); } TEST_P(MicroOp, storePairQ) { @@ -441,37 +707,37 @@ TEST_P(MicroOp, storePairQ) { stp q4, q5, [sp, #32] stp q6, q7, [sp, #-32]! 
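+    # The trailing '!' denotes pre-indexed addressing: sp is decremented by the immediate before the store and the updated address is written back to sp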
)"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1024), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1024), 0xABBACAFEABBACAFE); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1016), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1016), 0x1234567898765432); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1008), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1008), 0xABCDEFABCDEFABCD); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1000), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1000), 0xCAFEABBACAFEABBA); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 992), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 992), 0x9876543212345678); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 984), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 984), 0xFEDCBAFEDCBAFEDC); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 976), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 976), 0xABBACAFEABBACAFE); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 968), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 968), 0x1234567898765432); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 960), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 960), 0x9876543212345678); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 952), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 952), 0xFEDCBAFEDCBAFEDC); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 944), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 944), 0xABBACAFEABBACAFE); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 936), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 936), 0x1234567898765432); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 928), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 928), 0xABBACAFEABBACAFE); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 920), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 920), 0x1234567898765432); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 912), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 912), 0xABCDEFABCDEFABCD); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 904), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 904), 0xCAFEABBACAFEABBA); } @@ -493,14 +759,22 @@ TEST_P(MicroOp, storePairS) { stp s4, s5, [sp, #8] stp s6, s7, [sp, #-8]! 
)"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1024), -5.0f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1020), -3.5f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1016), 3.5f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1012), 5.0f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1008), -1.5f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1004), -0.5f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1000), 0.5f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 996), 1.5f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1024), + -5.0f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1020), + -3.5f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1016), + 3.5f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1012), + 5.0f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1008), + -1.5f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1004), + -0.5f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1000), + 0.5f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 996), + 1.5f); } TEST_P(MicroOp, storePairW) { @@ -521,14 +795,22 @@ TEST_P(MicroOp, storePairW) { stp w4, w5, [sp, #8] stp w6, w7, [sp, #-8]! )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1024), 12); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1020), 24); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1016), 84); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1012), 96); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1008), 36); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1004), 48); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1000), 60); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 996), 72); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1024), + 12); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1020), + 24); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1016), + 84); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1012), + 96); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1008), + 36); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1004), + 48); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1000), + 60); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 996), + 72); } TEST_P(MicroOp, storePairX) { @@ -549,14 +831,22 @@ TEST_P(MicroOp, storePairX) { stp x4, x5, [sp, #16] stp x6, x7, [sp, #-16]! 
)"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1024), 12); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1016), 24); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1008), 84); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1000), 96); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 992), 36); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 984), 48); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 976), 60); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 968), 72); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1024), + 12); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1016), + 24); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1008), + 84); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1000), + 96); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 992), + 36); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 984), + 48); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 976), + 60); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 968), + 72); } TEST_P(MicroOp, storeB) { @@ -584,10 +874,14 @@ TEST_P(MicroOp, storeB) { str b2, [sp, #1] str b3, [sp, #-1]! )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1024), 0xAB); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1023), 0xFE); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1022), 0xBA); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1021), 0xCA); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1024), + 0xAB); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1023), + 0xFE); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1022), + 0xBA); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1021), + 0xCA); } TEST_P(MicroOp, storeD) { @@ -604,10 +898,14 @@ TEST_P(MicroOp, storeD) { str d2, [sp, #8] str d3, [sp, #-8]! )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1024), -3.0); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1016), 3.0); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1008), -1.5); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1000), 1.5); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1024), + -3.0); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1016), + 3.0); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1008), + -1.5); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1000), + 1.5); } TEST_P(MicroOp, storeH) { @@ -635,13 +933,13 @@ TEST_P(MicroOp, storeH) { str h2, [sp, #2] str h3, [sp, #-2]! )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1024), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1024), 0xABBA); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1022), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1022), 0x5678); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1020), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1020), 0xCAFE); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1018), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1018), 0x1234); } @@ -674,21 +972,21 @@ TEST_P(MicroOp, storeQ) { str q2, [sp, #16] str q3, [sp, #-16]! 
)"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1024), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1024), 0xABBACAFEABBACAFE); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1016), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1016), 0x1234567898765432); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1008), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1008), 0xABBACAFEABBACAFE); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1000), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1000), 0x1234567898765432); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 992), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 992), 0xABCDEFABCDEFABCD); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 984), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 984), 0xCAFEABBACAFEABBA); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 976), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 976), 0x9876543212345678); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 968), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 968), 0xFEDCBAFEDCBAFEDC); } @@ -706,10 +1004,14 @@ TEST_P(MicroOp, storeS) { str s2, [sp, #4] str s3, [sp, #-4]! )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1024), -3.0f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1020), 3.0f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1016), -1.5f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1012), 1.5f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1024), + -3.0f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1020), + 3.0f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1016), + -1.5f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1012), + 1.5f); } TEST_P(MicroOp, storeW) { @@ -726,10 +1028,14 @@ TEST_P(MicroOp, storeW) { str w2, [sp, #4] str w3, [sp, #-4]! )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1024), 12); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1020), 48); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1016), 24); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1012), 36); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1024), + 12); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1020), + 48); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1016), + 24); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1012), + 36); } TEST_P(MicroOp, storeX) { @@ -746,10 +1052,14 @@ TEST_P(MicroOp, storeX) { str x2, [sp, #8] str x3, [sp, #-8]! )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1024), 12); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1016), 48); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1008), 24); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1000), 36); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1024), + 12); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1016), + 48); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1008), + 24); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1000), + 36); } TEST_P(MicroOp, storeThenLoad) { @@ -773,10 +1083,14 @@ TEST_P(MicroOp, storeThenLoad) { ldr x7, [sp, #8] ldr x8, [sp, #-8]! 
)"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1024), 12); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1016), 48); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1008), 24); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1000), 36); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1024), + 12); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1016), + 48); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1008), + 24); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1000), + 36); EXPECT_EQ(getGeneralRegister(5), 12); EXPECT_EQ(getGeneralRegister(6), 24); EXPECT_EQ(getGeneralRegister(7), 36); @@ -808,14 +1122,22 @@ TEST_P(MicroOp, storeThenLoadPair) { ldp x12, x13, [sp, #16] ldp x14, x15, [sp, #-16]! )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1024), 12); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1016), 24); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1008), 84); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1000), 96); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 992), 36); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 984), 48); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 976), 60); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 968), 72); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1024), + 12); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1016), + 24); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1008), + 84); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1000), + 96); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 992), + 36); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 984), + 48); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 976), + 60); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 968), + 72); EXPECT_EQ(getGeneralRegister(8), 12); EXPECT_EQ(getGeneralRegister(9), 24); EXPECT_EQ(getGeneralRegister(10), 36); @@ -829,9 +1151,11 @@ TEST_P(MicroOp, storeThenLoadPair) { INSTANTIATE_TEST_SUITE_P( AArch64, MicroOp, ::testing::Values( - std::make_tuple(EMULATION, YAML::Load("{Micro-Operations: True}")), - std::make_tuple(INORDER, YAML::Load("{Micro-Operations: True}")), - std::make_tuple(OUTOFORDER, YAML::Load("{Micro-Operations: True}"))), + std::make_tuple(EMULATION, "{Core: {Micro-Operations: True}}"), + std::make_tuple(INORDER, "{Core: {Micro-Operations: True}}"), + std::make_tuple(OUTOFORDER, + "{Core: {Micro-Operations: True}, L1-Data-Memory: " + "{Interface-Type: Fixed}}")), paramToString); } // namespace diff --git a/test/regression/aarch64/SmokeTest.cc b/test/regression/aarch64/SmokeTest.cc index fad544d04c..da214bfae8 100644 --- a/test/regression/aarch64/SmokeTest.cc +++ b/test/regression/aarch64/SmokeTest.cc @@ -3,6 +3,7 @@ namespace { using SmokeTest = AArch64RegressionTest; +using namespace simeng::arch::aarch64; // Test that a trivial instruction will execute TEST_P(SmokeTest, instruction) { @@ -10,6 +11,8 @@ TEST_P(SmokeTest, instruction) { orr x0, xzr, #7 )"); EXPECT_EQ(getGeneralRegister(0), 7u); + EXPECT_GROUP(R"(orr x0, xzr, #7)", + InstructionGroups::INT_SIMPLE_LOGICAL_NOSHIFT); } // Test a loop executing 1024 times, adding 3 to w1 each time @@ -34,8 +37,10 @@ TEST_P(SmokeTest, stack) { str w0, [sp, -4] str w1, [sp, -8] )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 4), 7u); - 
EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 8), 42u); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), + 7u); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), + 42u); } // Test that we can store values to the heap @@ -59,9 +64,11 @@ TEST_P(SmokeTest, heap) { INSTANTIATE_TEST_SUITE_P( AArch64, SmokeTest, - ::testing::Values(std::make_tuple(EMULATION, YAML::Load("{}")), - std::make_tuple(INORDER, YAML::Load("{}")), - std::make_tuple(OUTOFORDER, YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}"), + std::make_tuple(INORDER, "{}"), + std::make_tuple(OUTOFORDER, + "{L1-Data-Memory: " + "{Interface-Type: Fixed}}")), paramToString); } // namespace diff --git a/test/regression/aarch64/Syscall.cc b/test/regression/aarch64/Syscall.cc index eed2208d9c..0866c278e2 100644 --- a/test/regression/aarch64/Syscall.cc +++ b/test/regression/aarch64/Syscall.cc @@ -1,5 +1,8 @@ +#include #include +#include #include +#include #include #include @@ -14,68 +17,6 @@ using Syscall = AArch64RegressionTest; /** The maximum size of a filesystem path. */ static const size_t LINUX_PATH_MAX = 4096; -TEST_P(Syscall, getrandom) { - initialHeapData_.resize(24); - memset(initialHeapData_.data(), -1, 16); - - RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 - - # store inital heap address - mov x10, x0 - - # Save 8 random bytes to the heap - # getrandom(buf * = [a], buflen = 8, no flags) - mov x1, #8 - mov x8, #278 - svc #0 - - # Save another 8 random bytes to the heap - # getrandom(buf * = [a], buflen = 8, no flags) - add x0, x10, #8 - mov x1, #8 - mov x8, #278 - svc #0 - - )"); - - // Check getrandom returned 8 (8 bytes were requested) - EXPECT_EQ(getGeneralRegister(0), 8); - - int heapStart = getGeneralRegister(10); - for (size_t i = 0; i < 8; i++) { - printf("compare %x == %x\n", getMemoryValue(heapStart + i), - getMemoryValue(heapStart + 8 + i)); - } - - // check that the retuned bytes arent all equal to -1. - // heap was initialised to -1 so check bytes have changed - bool allUnchanged = true; - for (size_t i = 0; i < 16; i++) { - if (getMemoryValue(heapStart + i) != 0xFF) { - allUnchanged = false; - break; - } - } - EXPECT_EQ(allUnchanged, false); - - // Check that the returned bytes from the two syscalls dont all match. 
- // If they do then the returned bytes surely werent random - bool allMatch = true; - for (char i = 0; i < 8; i++) { - if (getMemoryValue(heapStart + i) != - getMemoryValue(heapStart + 8 + i)) { - allMatch = false; - break; - } - } - - EXPECT_EQ(allMatch, false); -} - TEST_P(Syscall, ioctl) { // TIOCGWINSZ: test it returns zero and sets the output to anything initialHeapData_.resize(8); @@ -86,7 +27,7 @@ TEST_P(Syscall, ioctl) { mov x8, 214 svc #0 - # ioctl(fd=1, request=0x5413, argp=x0) + # ioctl(fd=1, request=TIOCGWINSZ, argp=x0) mov x2, x0 mov x1, 0x5413 mov x0, #1 @@ -94,12 +35,72 @@ TEST_P(Syscall, ioctl) { svc #0 )"); EXPECT_EQ(getGeneralRegister(0), 0); + // Winsize changes between inside and outside of RUN_AARCH64 statement hence + // we cannot reliably test against a known value EXPECT_NE(getMemoryValue(process_->getHeapStart() + 0), -1); EXPECT_NE(getMemoryValue(process_->getHeapStart() + 2), -1); EXPECT_NE(getMemoryValue(process_->getHeapStart() + 4), -1); EXPECT_NE(getMemoryValue(process_->getHeapStart() + 6), -1); } +TEST_P(Syscall, ftruncate) { + const char filepath[] = SIMENG_AARCH64_TEST_ROOT "/data/truncate-test.txt"; + + // Copy filepath to heap + initialHeapData_.resize(strlen(filepath) + 1); + memcpy(initialHeapData_.data(), filepath, strlen(filepath) + 1); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + mov x20, x0 + + # = openat(AT_FDCWD, filepath, O_WRONLY, S_IRUSR) + mov x0, -100 + mov x1, x20 + mov x2, 0x0001 + mov x3, 400 + mov x8, #56 + svc #0 + mov x21, x0 + + # ftruncate(fd, length) - increase length of file + mov x0, x21 + mov x1, #100 + mov x8, #46 + svc #0 + mov x22, x0 + + # ftruncate(fd, length) - decrease length of file + mov x0, x21 + mov x1, #10 + mov x8, #46 + svc #0 + mov x23, x0 + + # close(fd) + mov x0, x21 + mov x8, #57 + svc #0 + )"); + // Check returned 0 + EXPECT_EQ(getGeneralRegister(0), 0); + EXPECT_EQ(getGeneralRegister(22), 0); + EXPECT_EQ(getGeneralRegister(23), 0); + // Check file has been truncated + std::ifstream truncatedFileI(filepath); + std::string fileContents; + getline(truncatedFileI, fileContents); + truncatedFileI.close(); + EXPECT_EQ(fileContents, "This is a "); + // Reset file + std::ofstream truncatedFileO(filepath); + truncatedFileO << "This is a test file for the ftruncate syscall"; + truncatedFileO.close(); +} + TEST_P(Syscall, faccessat) { const char filepath[] = "./tempFile.txt"; initialHeapData_.resize(strlen(filepath) + 1); @@ -186,7 +187,13 @@ TEST_P(Syscall, faccessat) { unlink(filepath); char abs_filepath[LINUX_PATH_MAX]; - realpath(SIMENG_AARCH64_TEST_ROOT "/data/input.txt", abs_filepath); + if (!realpath(SIMENG_AARCH64_TEST_ROOT "/data/input.txt", abs_filepath)) { + // Something went wrong + std::cerr << "[SimEng:syscall] realpath failed with errno = " << errno + << std::endl; + exit(EXIT_FAILURE); + } + initialHeapData_.resize(strlen(abs_filepath) + 1); // Copy abs_filepath to heap memcpy(initialHeapData_.data(), abs_filepath, strlen(abs_filepath) + 1); @@ -212,7 +219,12 @@ TEST_P(Syscall, faccessat) { // Check syscall works using dirfd instead of AT_FDCWD const char file[] = "input.txt\0"; char dirPath[LINUX_PATH_MAX]; - realpath(SIMENG_AARCH64_TEST_ROOT "/data/\0", dirPath); + if (!realpath(SIMENG_AARCH64_TEST_ROOT "/data/\0", dirPath)) { + // Something went wrong + std::cerr << "[SimEng:syscall] realpath failed with errno = " << errno + << std::endl; + exit(EXIT_FAILURE); + } initialHeapData_.resize(strlen(dirPath) + strlen(file) + 2); // Copy dirPath to heap @@ -286,7 +298,66 @@ 
TEST_P(Syscall, getdents64) { EXPECT_EQ(getGeneralRegister(22), 120); } -// Test reading from and seeking through a file +TEST_P(Syscall, lseek) { + const char filepath[] = SIMENG_AARCH64_TEST_ROOT "/data/input.txt"; + + // Copy filepath to heap + initialHeapData_.resize(strlen(filepath) + 1); + memcpy(initialHeapData_.data(), filepath, strlen(filepath) + 1); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + mov x20, x0 + + # = openat(AT_FDCWD, filepath, O_WRONLY, S_IRUSR) + mov x0, -100 + mov x1, x20 + mov x2, 0x0001 + mov x3, 400 + mov x8, #56 + svc #0 + mov x21, x0 + + # lseek(fd=, offset=8, whence=SEEK_SET) - seek to offset + mov x0, x21 + mov x1, #8 + mov x2, #0 + mov x8, #62 + svc #0 + mov x22, x0 + + # lseek(fd=, offset=8, whence=SEEK_CUR) - seek to current location plus offset + mov x0, x21 + mov x1, #8 + mov x2, #1 + mov x8, #62 + svc #0 + mov x23, x0 + + # lseek(fd=, offset=8, whence=SEEK_END) - seek to the size of the file plus offset + mov x0, x21 + mov x1, #8 + mov x2, #2 + mov x8, #62 + svc #0 + mov x24, x0 + + # close(fd) + mov x0, x21 + mov x8, #57 + svc #0 + )"); + + EXPECT_EQ(getGeneralRegister(22), 8); + EXPECT_EQ(getGeneralRegister(23), 16); + EXPECT_EQ(getGeneralRegister(24), 35); +} + +// Test reading from and seeking through a file (tests openat, readv, read, and +// lseek syscalls) TEST_P(Syscall, file_read) { const char filepath[] = SIMENG_AARCH64_TEST_ROOT "/data/input.txt"; @@ -353,20 +424,43 @@ TEST_P(Syscall, file_read) { mov x8, #65 svc #0 + # lseek(fd=, offset=0, whence=SEEK_SET) + mov x0, x21 + mov x1, 0 + mov x2, 0 + mov x8, #62 + svc #0 + + # read(fd=, buf=sp, count=26) + mov x0, x21 + sub x1, sp, 64 + mov x2, #26 + mov x8, #63 + svc #0 + # close(fd=) mov x0, x21 mov x8, #57 svc #0 )"); - // Check result of read operations - const char reference[] = "ABCD\0UV\0EFGH\0\0\0\0MNOPQRST"; - char* data = processMemory_ + process_->getHeapStart(); - for (int i = 0; i < sizeof(reference); i++) { - EXPECT_EQ(data[i], reference[i]) << "at index i=" << i << '\n'; + // Check result of readv operations + const char refReadv[] = "ABCD\0UV\0EFGH\0\0\0\0MNOPQRST"; + char* dataReadv = processMemory_ + process_->getHeapStart(); + for (size_t i = 0; i < strlen(refReadv); i++) { + EXPECT_EQ(dataReadv[i], refReadv[i]) << "at index i=" << i << '\n'; + } + + // Check result of read operation + const char refRead[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + char* dataRead = processMemory_ + process_->getInitialStackPointer() - 64; + for (size_t i = 0; i < strlen(refRead); i++) { + EXPECT_EQ(dataRead[i], refRead[i]) << "at index i=" << i << '\n'; } } +// Test reading from and seeking through a file (tests openat, writev, and write +// syscalls) TEST_P(Syscall, file_write) { const char str[] = "Hello, World!\n"; const char filepath[] = "./simeng-fileio-test.txt"; @@ -414,6 +508,13 @@ TEST_P(Syscall, file_write) { mov x8, #66 svc #0 + # write(fd=, buf=x1, count=14) + mov x0, x21 + mov x1, x20 + mov x2, #14 + mov x8, #64 + svc #0 + # close(fd=) mov x0, x21 mov x8, #57 @@ -424,11 +525,47 @@ TEST_P(Syscall, file_write) { char outdata[15]; std::ifstream outfile(filepath); ASSERT_TRUE(outfile.good()); + outfile.read(outdata, 14); + EXPECT_FALSE(outfile.eof()); + EXPECT_EQ(strncmp(str, outdata, 14), 0); outfile.read(outdata, 15); EXPECT_TRUE(outfile.eof()); EXPECT_EQ(strncmp(str, outdata, 14), 0); } +// Tests that writing to the standard out file descriptor functions correctly +TEST_P(Syscall, stdout) { + const char str[] = "Hello, World!\n"; + for (char c : str) { + 
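+    // Copy the test string (including its null terminator) into the simulated process's initial heap data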
initialHeapData_.push_back(c); + } + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + # iovec = {{x0, 10}, {x0+10, 4}} + str x0, [sp, #-32] + mov x1, 10 + str x1, [sp, #-24] + add x0, x0, 10 + str x0, [sp, #-16] + mov x1, 4 + str x1, [sp, #-8] + + # writev(fd=1, iov=iovec, iovcnt=2) + mov x0, #1 + sub x1, sp, 32 + mov x2, #2 + mov x8, #66 + svc #0 + )"); + EXPECT_EQ(stdout_.substr(0, strlen(str)), str); + EXPECT_EQ(getGeneralRegister(0), strlen(str)); +} + +// Tests that an openat syscall on a non-existent file returns an error value TEST_P(Syscall, filenotfound) { // Copy filepath to heap const char filepath[] = "./nonexistent-file"; @@ -451,245 +588,42 @@ TEST_P(Syscall, filenotfound) { )"); // Check return value is -1 - EXPECT_EQ(getGeneralRegister(0), -1); + EXPECT_EQ(getGeneralRegister(0), -1); } -TEST_P(Syscall, mmap) { - // Test for 3 consecutive allocations - RUN_AARCH64(R"( - # mmap(addr=NULL, length=65536, prot=3, flags=34, fd=-1, offset=0) - mov x0, #0 - mov x1, #65536 - mov x2, #3 - mov x3, #34 - mov x4, #-1 - mov x5, #0 - mov x8, #222 - svc #0 - mov x9, x0 +// Test that readlinkat works for supported cases +TEST_P(Syscall, readlinkat) { + const char path[] = "/proc/self/exe"; - # mmap(addr=NULL, length=1024, prot=3, flags=34, fd=-1, offset=0) - mov x0, #0 - mov x1, #1024 - mov x2, #3 - mov x3, #34 - mov x4, #-1 - mov x5, #0 - mov x8, #222 + std::string reference = + SIMENG_SOURCE_DIR + std::string("/SimEngDefaultProgram"); + + // Copy path to heap + initialHeapData_.resize(strlen(path) + reference.size() + 1); + memcpy(initialHeapData_.data(), path, strlen(path) + 1); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 svc #0 - mov x10, x0 + mov x20, x0 - # mmap(addr=NULL, length=16384, prot=3, flags=34, fd=-1, offset=0) + # readlinkat(dirfd=0, pathname=x20, buf=x20+15, bufsize=1024) mov x0, #0 - mov x1, #16384 - mov x2, #3 - mov x3, #34 - mov x4, #-1 - mov x5, #0 - mov x8, #222 + mov x1, x20 + add x2, x20, #15 + mov x3, #1024 + mov x8, #78 svc #0 - mov x11, x0 )"); - EXPECT_EQ(getGeneralRegister(9), process_->getMmapStart()); - EXPECT_EQ(getGeneralRegister(10), process_->getMmapStart() + 65536); - EXPECT_EQ(getGeneralRegister(11), process_->getMmapStart() + 69632); - // Test for mmap allocation between two previous allocations - RUN_AARCH64(R"( - # Setup 3 contiguous allocations - # mmap(addr=NULL, length=1024, prot=3, flags=34, fd=-1, offset=0) - mov x0, #0 - mov x1, #1024 - mov x2, #3 - mov x3, #34 - mov x4, #-1 - mov x5, #0 - mov x8, #222 - svc #0 - mov x9, x0 - - # mmap(addr=NULL, length=12288, prot=3, flags=34, fd=-1, offset=0) - mov x0, #0 - mov x1, #12288 - mov x2, #3 - mov x3, #34 - mov x4, #-1 - mov x5, #0 - mov x8, #222 - svc #0 - mov x10, x0 - - # mmap(addr=NULL, length=1024, prot=3, flags=34, fd=-1, offset=0) - mov x0, #0 - mov x1, #1024 - mov x2, #3 - mov x3, #34 - mov x4, #-1 - mov x5, #0 - mov x8, #222 - svc #0 - mov x11, x0 - - # unmap second allocation to create an empty space between allocations - # munmap(addr=x10, length=12288, prot=3, flags=34, fd=-1, offset=0) - mov x0, x10 - mov x1, #12288 - mov x8, #215 - svc #0 - mov x12, x0 - - # Allocate a region larger than the new empty space - # mmap(addr=NULL, length=16384, prot=3, flags=34, fd=-1, offset=0) - mov x0, #0 - mov x1, #16384 - mov x2, #3 - mov x3, #34 - mov x4, #-1 - mov x5, #0 - mov x8, #222 - svc #0 - mov x13, x0 - - # Two allocations whose combined length equals the new empty space - # mmap(addr=NULL, length=4096, prot=3, flags=34, fd=-1, offset=0) - mov 
x0, #0 - mov x1, #4096 - mov x2, #3 - mov x3, #34 - mov x4, #-1 - mov x5, #0 - mov x8, #222 - svc #0 - mov x14, x0 - - # mmap(addr=NULL, length=8192, prot=3, flags=34, fd=-1, offset=0) - mov x0, #0 - mov x1, #8192 - mov x2, #3 - mov x3, #34 - mov x4, #-1 - mov x5, #0 - mov x8, #222 - svc #0 - mov x15, x0 - )"); - EXPECT_EQ(getGeneralRegister(9), process_->getMmapStart()); - EXPECT_EQ(getGeneralRegister(10), process_->getMmapStart() + 4096); - EXPECT_EQ(getGeneralRegister(11), process_->getMmapStart() + 16384); - EXPECT_EQ(getGeneralRegister(12), 0); - EXPECT_EQ(getGeneralRegister(13), process_->getMmapStart() + 20480); - EXPECT_EQ(getGeneralRegister(14), process_->getMmapStart() + 4096); - EXPECT_EQ(getGeneralRegister(15), process_->getMmapStart() + 8192); -} - -TEST_P(Syscall, munmap) { - // Test that no errors are given during expected usage - RUN_AARCH64(R"( - # mmap(addr=NULL, length=65536, prot=3, flags=34, fd=-1, offset=0) - mov x0, #0 - mov x1, #65536 - mov x2, #3 - mov x3, #34 - mov x4, #-1 - mov x5, #0 - mov x8, #222 - svc #0 - mov x9, x0 - - # munmap(addr=mmapStart_, length=65536, prot=3, flags=34, fd=-1, offset=0) - mov x0, x9 - mov x1, #65536 - mov x8, #215 - svc #0 - mov x10, x0 - - # munmap(addr=mmapStart_, length=65536, prot=3, flags=34, fd=-1, offset=0) - mov x0, x9 - mov x1, #65536 - mov x8, #215 - svc #0 - mov x11, x0 - )"); - EXPECT_EQ(getGeneralRegister(9), process_->getMmapStart()); - EXPECT_EQ(getGeneralRegister(10), 0); - EXPECT_EQ(getGeneralRegister(11), 0); - - // Test that EINVAL error types trigger - RUN_AARCH64(R"( - # mmap(addr=NULL, length=1024, prot=3, flags=34, fd=-1, offset=0) - mov x0, #0 - mov x1, #1024 - mov x2, #3 - mov x3, #34 - mov x4, #-1 - mov x5, #0 - mov x8, #222 - svc #0 - mov x9, x0 - - # munmap(addr=mmapStart_, length=65536, prot=3, flags=34, fd=-1, offset=0) - mov x0, x9 - mov x1, #65536 - mov x8, #215 - svc #0 - mov x10, x0 - - # munmap(addr=mmapStart_, length=65536, prot=3, flags=34, fd=-1, offset=0) - add x9, x9, #1024 - mov x0, x9 - mov x1, #65536 - mov x8, #215 - svc #0 - mov x11, x0 - )"); - EXPECT_EQ(getGeneralRegister(9), process_->getMmapStart() + 1024); - EXPECT_EQ(getGeneralRegister(10), -1); - EXPECT_EQ(getGeneralRegister(11), -1); -} - -TEST_P(Syscall, stdout) { - const char str[] = "Hello, World!\n"; - for (char c : str) { - initialHeapData_.push_back(c); - } - RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 - - # iovec = {{x0, 10}, {x0+10, 4}} - str x0, [sp, #-32] - mov x1, 10 - str x1, [sp, #-24] - add x0, x0, 10 - str x0, [sp, #-16] - mov x1, 4 - str x1, [sp, #-8] - - # writev(fd=1, iov=iovec, iovcnt=2) - mov x0, #1 - sub x1, sp, 32 - mov x2, #2 - mov x8, #66 - svc #0 - )"); - EXPECT_EQ(stdout_.substr(0, sizeof(str) - 1), str); - EXPECT_EQ(getGeneralRegister(0), sizeof(str) - 1); -} - -TEST_P(Syscall, mprotect) { - // Check mprotect returns placeholder value as currently not implemented - RUN_AARCH64(R"( - # mprotect(addr=47472, len=4096, prot=1) = 0 - mov x0, #47472 - mov x1, #4096 - mov x2, #1 - mov x8, #226 - svc #0 - )"); - EXPECT_EQ(getGeneralRegister(0), 0); -} + EXPECT_EQ(getGeneralRegister(0), reference.size()); + char* data = processMemory_ + process_->getHeapStart() + 15; + for (size_t i = 0; i < reference.size(); i++) { + EXPECT_EQ(data[i], reference.c_str()[i]) << "at index i=" << i << '\n'; + } +} TEST_P(Syscall, newfstatat) { const char filepath[] = SIMENG_AARCH64_TEST_ROOT "/data/input.txt"; @@ -714,8 +648,64 @@ TEST_P(Syscall, newfstatat) { svc #0 mov x21, x0 )"); + // Run fstatat 
syscall to define a reference + struct ::stat statbufRef; + ::fstatat(AT_FDCWD, filepath, &statbufRef, 0); + // Check fstatat returned 0 - EXPECT_EQ(getGeneralRegister(21), 0); + EXPECT_EQ(getGeneralRegister(21), 0); + // Check fstatat buf matches reference + EXPECT_EQ(getMemoryValue(process_->getHeapStart()), + statbufRef.st_dev); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 8), + statbufRef.st_ino); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 16), + statbufRef.st_mode); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 20), + statbufRef.st_nlink); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 24), + statbufRef.st_uid); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 28), + statbufRef.st_gid); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 32), + statbufRef.st_rdev); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 40), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 48), + statbufRef.st_size); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 56), + statbufRef.st_blksize); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 60), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 64), + statbufRef.st_blocks); +#ifdef __MACH__ + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 72), + statbufRef.st_atimespec.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 80), + statbufRef.st_atimespec.tv_nsec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 88), + statbufRef.st_mtimespec.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 96), + statbufRef.st_mtimespec.tv_nsec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 104), + statbufRef.st_ctimespec.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 112), + statbufRef.st_ctimespec.tv_nsec); +#else + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 72), + statbufRef.st_atim.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 80), + statbufRef.st_atim.tv_nsec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 88), + statbufRef.st_mtim.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 96), + statbufRef.st_mtim.tv_nsec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 104), + statbufRef.st_ctim.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 112), + statbufRef.st_ctim.tv_nsec); +#endif + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 116), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 124), 0ull); RUN_AARCH64(R"( # Get heap address @@ -734,12 +724,17 @@ TEST_P(Syscall, newfstatat) { mov x21, x0 )"); // Check fstatat returned -1 (file not found) - EXPECT_EQ(getGeneralRegister(21), -1); + EXPECT_EQ(getGeneralRegister(21), -1); // Check syscall works using dirfd instead of AT_FDCWD const char file[] = "input.txt\0"; char dirPath[LINUX_PATH_MAX]; - realpath(SIMENG_AARCH64_TEST_ROOT "/data/\0", dirPath); + if (!realpath(SIMENG_AARCH64_TEST_ROOT "/data/\0", dirPath)) { + // Something went wrong + std::cerr << "[SimEng:syscall] realpath failed with errno = " << errno + << std::endl; + exit(EXIT_FAILURE); + } initialHeapData_.resize(128 + strlen(dirPath) + strlen(file) + 2); // Copy dirPath to heap @@ -772,12 +767,75 @@ TEST_P(Syscall, newfstatat) { svc #0 mov x21, x0 )"); + // Run fstatat syscall to define a reference + ::fstatat(AT_FDCWD, filepath, &statbufRef, 0); + + // Check fstatat returned 0 EXPECT_EQ(getGeneralRegister(27), 0); + + // Check fstatat buf matches reference + EXPECT_EQ(getMemoryValue(process_->getHeapStart()), + statbufRef.st_dev); + 
EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 8), + statbufRef.st_ino); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 16), + statbufRef.st_mode); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 20), + statbufRef.st_nlink); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 24), + statbufRef.st_uid); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 28), + statbufRef.st_gid); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 32), + statbufRef.st_rdev); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 40), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 48), + statbufRef.st_size); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 56), + statbufRef.st_blksize); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 60), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 64), + statbufRef.st_blocks); +#ifdef __MACH__ + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 72), + statbufRef.st_atimespec.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 80), + statbufRef.st_atimespec.tv_nsec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 88), + statbufRef.st_mtimespec.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 96), + statbufRef.st_mtimespec.tv_nsec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 104), + statbufRef.st_ctimespec.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 112), + statbufRef.st_ctimespec.tv_nsec); +#else + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 72), + statbufRef.st_atim.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 80), + statbufRef.st_atim.tv_nsec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 88), + statbufRef.st_mtim.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 96), + statbufRef.st_mtim.tv_nsec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 104), + statbufRef.st_ctim.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 112), + statbufRef.st_ctim.tv_nsec); +#endif + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 116), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 124), 0ull); } -TEST_P(Syscall, getrusage) { - // Reserve 128 bytes for usage - initialHeapData_.resize(128); +TEST_P(Syscall, fstat) { + const char filepath[] = SIMENG_AARCH64_TEST_ROOT "/data/input.txt"; + + // Reserve 256 bytes for fstat struct + initialHeapData_.resize(256 + strlen(filepath) + 1); + + // Copy filepath to heap + memcpy(initialHeapData_.data() + 256, filepath, strlen(filepath) + 1); + RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -785,51 +843,110 @@ TEST_P(Syscall, getrusage) { svc #0 mov x20, x0 - # getrusage(who = RUSAGE_SELF, usage) - mov x0, #0 - mov x1, x20 - mov x8, #165 + # = openat(AT_FDCWD, filepath, O_RDONLY, S_IRUSR) + mov x0, -100 + add x1, x20, 256 + mov x2, 0x0000 + mov x3, 400 + mov x8, #56 svc #0 mov x21, x0 - # getrusage(who = RUSAGE_CHILDREN, usage) - mov x0, #-1 + # fstat(fd=, buf=x20) + mov x0, x21 mov x1, x20 - mov x8, #165 + mov x8, #80 + svc #0 + mov x23, x0 + + # close(fd=) + mov x0, x21 + mov x8, #57 svc #0 - mov x22, x0 )"); - EXPECT_EQ(getGeneralRegister(21), 0); - EXPECT_EQ(getGeneralRegister(22), 0); + // Run fstat syscall to define a reference + int64_t fd = ::openat(AT_FDCWD, filepath, O_RDONLY, S_IRUSR); + struct ::stat statbufRef; + ::fstat(fd, &statbufRef); + ::close(fd); - // MacOS doesn't support the final enum RUSAGE_THREAD -#ifndef __MACH__ - // Reserve 128 bytes for usage - initialHeapData_.resize(128); + // Check fstat returned 0 + 
EXPECT_EQ(getGeneralRegister(23), 0); + // Check fstat buf matches reference + EXPECT_EQ(getMemoryValue(process_->getHeapStart()), + statbufRef.st_dev); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 8), + statbufRef.st_ino); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 16), + statbufRef.st_mode); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 20), + statbufRef.st_nlink); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 24), + statbufRef.st_uid); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 28), + statbufRef.st_gid); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 32), + statbufRef.st_rdev); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 40), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 48), + statbufRef.st_size); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 56), + statbufRef.st_blksize); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 60), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 64), + statbufRef.st_blocks); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 72), + statbufRef.st_atime); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 80), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 88), + statbufRef.st_mtime); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 96), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 104), + statbufRef.st_ctime); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 112), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 116), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 124), 0ull); +} + +TEST_P(Syscall, exit_group) { RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 - mov x20, x0 + # exit_group(1) + mov x0, #1 + mov x8, #94 + svc #0 + )"); + // Set reference for stdout + std::string str = + "\n[SimEng:ExceptionHandler] Received exit_group syscall: terminating " + "with exit code 1"; + EXPECT_EQ(stdout_.substr(0, str.size()), str); +} - # getrusage(who = RUSAGE_THREAD, usage) - mov x0, #1 - mov x1, x20 - mov x8, #165 - svc #0 - mov x21, x0 - )"); +TEST_P(Syscall, set_tid_address) { + // Reserve 8 bytes for tid + initialHeapData_.resize(8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + mov x20, x0 + + # set_tid_address(tidptr=x20) + mov x0, x20 + mov x8, #96 + svc #0 + mov x21, x0 + )"); EXPECT_EQ(getGeneralRegister(21), 0); -#endif } -TEST_P(Syscall, ftruncate) { - const char filepath[] = SIMENG_AARCH64_TEST_ROOT "/data/truncate-test.txt"; +// TODO: write futex test +// TODO: write set_robust_list test - // Copy filepath to heap - initialHeapData_.resize(strlen(filepath) + 1); - memcpy(initialHeapData_.data(), filepath, strlen(filepath) + 1); +TEST_P(Syscall, clock_gettime) { + // Reserve 32 bytes for time data + initialHeapData_.resize(32); RUN_AARCH64(R"( # Get heap address @@ -838,44 +955,647 @@ TEST_P(Syscall, ftruncate) { svc #0 mov x20, x0 - # = openat(AT_FDCWD, filepath, O_WRONLY, S_IRUSR) - mov x0, -100 + # Execute loop to elapse time in core + mov x10, #10000 + subs x10, x10, #1 + b.ne #-4 + + # clock_gettime(clk_id=CLOCK_REALTIME, tp=x20) + mov x0, #0 mov x1, x20 - mov x2, 0x0001 - mov x3, 400 - mov x8, #56 + mov x8, #113 svc #0 mov x21, x0 - # ftruncate(fd, length) - increase length of file - mov x0, x21 - mov x1, #100 - mov x8, #46 + # Execute loop to elapse time in core + mov x10, #10000 + subs x10, x10, #1 + b.ne #-4 + + # clock_gettime(clk_id=CLOCK_MONOTONIC, tp=x20+16) + mov x0, #1 + add x1, x20, #16 + mov x8, #113 svc #0 
mov x22, x0 + )"); + EXPECT_EQ(getGeneralRegister(21), 0); + EXPECT_EQ(getGeneralRegister(22), 0); + // Set time values based on core model in use + uint64_t secondsReal = 0; + uint64_t nanosecondsReal = 0; + uint64_t secondsMono = 0; + uint64_t nanosecondsMono = 0; + // Seconds will be 0 as too much host time would have to elapse in the test + // suite for 1 simulated second to elapse + if (std::get<0>(GetParam()) == EMULATION) { + nanosecondsReal = 8003; + nanosecondsMono = 16006; + } else if (std::get<0>(GetParam()) == INORDER) { + nanosecondsReal = 8006; + nanosecondsMono = 16010; + } else if (std::get<0>(GetParam()) == OUTOFORDER) { + nanosecondsReal = 8009; + nanosecondsMono = 16015; + } - # ftruncate(fd, length) - decrease length of file - mov x0, x21 - mov x1, #46 - mov x8, #46 + EXPECT_EQ(getMemoryValue(process_->getHeapStart()), secondsReal); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 8), + nanosecondsReal); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 16), + secondsMono); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 24), + nanosecondsMono); +} + +// TODO: tests only test errored instances of using sched_setaffinity due to +// omitted functionality. Redo test once functionality is implemented +TEST_P(Syscall, sched_setaffinity) { + RUN_AARCH64(R"( + # sched_setaffinity(pid=0, cpusetsize=1, mask=0) + mov x0, #0 + mov x1, #1 + mov x2, #0 + mov x8, #122 + svc #0 + mov x21, x0 + + # sched_setaffinity(pid=1, cpusetsize=1, mask=1) + mov x0, #1 + mov x1, #1 + mov x2, #1 + mov x8, #122 + svc #0 + mov x22, x0 + + # sched_setaffinity(pid=0, cpusetsize=0, mask=1) + mov x0, #0 + mov x1, #0 + mov x2, #1 + mov x8, #122 svc #0 mov x23, x0 - # close(fd) - mov x0, x21 - mov x8, #57 + # sched_setaffinity(pid=0, cpusetsize=1, mask=1) + mov x0, #0 + mov x1, #1 + mov x2, #1 + mov x8, #122 + svc #0 + mov x24, x0 + )"); + EXPECT_EQ(getGeneralRegister(21), -EFAULT); + EXPECT_EQ(getGeneralRegister(22), -ESRCH); + EXPECT_EQ(getGeneralRegister(23), -EINVAL); + EXPECT_EQ(getGeneralRegister(24), 0); +} + +// TODO: tests only test errored instances of using sched_getaffinity due to +// omitted functionality. 
Redo test once functionality is implemented
+TEST_P(Syscall, sched_getaffinity) {
+ RUN_AARCH64(R"(
+ # sched_getaffinity(pid=0, cpusetsize=0, mask=0)
+ mov x0, #0
+ mov x1, #0
+ mov x2, #0
+ mov x8, #123
+ svc #0
+ mov x21, x0
+
+ # sched_getaffinity(pid=1, cpusetsize=0, mask=1)
+ mov x0, #1
+ mov x1, #0
+ mov x2, #1
+ mov x8, #123
+ svc #0
+ mov x22, x0
+
+ # sched_getaffinity(pid=0, cpusetsize=0, mask=1)
+ mov x0, #0
+ mov x1, #0
+ mov x2, #1
+ mov x8, #123
+ svc #0
+ mov x23, x0
+ )");
+ EXPECT_EQ(getGeneralRegister(21), -1);
+ EXPECT_EQ(getGeneralRegister(22), -1);
+ EXPECT_EQ(getGeneralRegister(23), 1);
+}
+
+// TODO: write tgkill test
+// TODO: write rt_sigaction test
+// TODO: write rt_sigprocmask test
+
+TEST_P(Syscall, uname) {
+ // Reserve 325 bytes for utsname struct
+ initialHeapData_.resize(325);
+
+ RUN_AARCH64(R"(
+ # Get heap address
+ mov x0, 0
+ mov x8, 214
+ svc #0
+ mov x20, x0
+
+ # uname(buf=x20)
+ mov x0, x20
+ mov x8, #160
+ svc #0
+ mov x21, x0
+ )");
+ EXPECT_EQ(getGeneralRegister(21), 0);
+
+ // Check utsname struct in memory
+ char* data = processMemory_ + process_->getHeapStart();
+ const char sysname[] = "Linux";
+ for (size_t i = 0; i < strlen(sysname); i++) EXPECT_EQ(data[i], sysname[i]);
+
+ // Add 65 to data pointer for reserved length of each string field in Linux
+ data += 65;
+ const char nodename[] = "simeng.hpc.cs.bris.ac.uk";
+ for (size_t i = 0; i < strlen(nodename); i++) EXPECT_EQ(data[i], nodename[i]);
+
+ data += 65;
+ const char release[] = "4.14.0";
+ for (size_t i = 0; i < strlen(release); i++) EXPECT_EQ(data[i], release[i]);
+
+ data += 65;
+ const char version[] = "#1 SimEng Mon Apr 29 16:28:37 UTC 2019";
+ for (size_t i = 0; i < strlen(version); i++) EXPECT_EQ(data[i], version[i]);
+
+ data += 65;
+ const char machine[] = "aarch64";
+ for (size_t i = 0; i < strlen(machine); i++) EXPECT_EQ(data[i], machine[i]);
+}
+
+TEST_P(Syscall, getrusage) {
+ // Reserve 128 bytes for usage
+ initialHeapData_.resize(128);
+ RUN_AARCH64(R"(
+ # Get heap address
+ mov x0, 0
+ mov x8, 214
+ svc #0
+ mov x20, x0
+
+ # getrusage(who = RUSAGE_SELF, usage)
+ mov x0, #0
+ mov x1, x20
+ mov x8, #165
+ svc #0
+ mov x21, x0
+
+ # getrusage(who = RUSAGE_CHILDREN, usage)
+ mov x0, #-1
+ mov x1, x20
+ mov x8, #165
+ svc #0
+ mov x22, x0
+ )");
+ // getrusage rusage struct values change between inside and outside of the
+ // RUN_AARCH64 statement, hence we cannot reliably test against a known value.
+ // Thus only test return value + EXPECT_EQ(getGeneralRegister(21), 0); + EXPECT_EQ(getGeneralRegister(22), 0); + + // MacOS doesn't support the final enum RUSAGE_THREAD +#ifndef __MACH__ + // Reserve 128 bytes for usage + initialHeapData_.resize(128); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + mov x20, x0 + + # getrusage(who = RUSAGE_THREAD, usage) + mov x0, #1 + mov x1, x20 + mov x8, #165 + svc #0 + mov x21, x0 + )"); + EXPECT_EQ(getGeneralRegister(21), 0); +#endif +} + +TEST_P(Syscall, gettimeofday) { + // Reserve 64 bytes for time data + initialHeapData_.resize(64); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + mov x20, x0 + + # Execute loop to elapse time in core + mov x10, #10000 + subs x10, x10, #1 + b.ne #-4 + + # gettimeofday(tv=x20, tz=null) + mov x0, x20 + mov x1, #0 + mov x8, #169 + svc #0 + mov x21, x0 + + # Execute loop to elapse time in core + mov x10, #10000 + subs x10, x10, #1 + b.ne #-4 + + # gettimeofday(tv=null, tz=x20+16) + mov x0, #0 + add x1, x20, #16 + mov x8, #169 + svc #0 + mov x22, x0 + + # Execute loop to elapse time in core + mov x10, #10000 + subs x10, x10, #1 + b.ne #-4 + + # gettimeofday(tv=x20+32, tz=x20+48) + add x0, x20, #32 + add x1, x20, #48 + mov x8, #169 + svc #0 + mov x23, x0 + )"); + EXPECT_EQ(getGeneralRegister(21), 0); + EXPECT_EQ(getGeneralRegister(22), 0); + EXPECT_EQ(getGeneralRegister(23), 0); + + // Set time values based on core model in use + + // Seconds will be 0 as too much host time would have to elapse in the test + // suite for 1 simulated second to elapse + simeng::kernel::timeval tvLoop0 = {0, 8}; + // tv set to NULL here so no value change will occur + simeng::kernel::timeval tvLoop2 = {0, 24}; + // All tz values are set to 0 given values are the displacement from GMT + simeng::kernel::timeval tzLoop1 = {0, 0}; + simeng::kernel::timeval tzLoop2 = {0, 0}; + + EXPECT_EQ(getMemoryValue(process_->getHeapStart()), tvLoop0.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 8), + tvLoop0.tv_usec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 16), + tzLoop1.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 24), + tzLoop1.tv_usec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 32), + tvLoop2.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 40), + tvLoop2.tv_usec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 48), + tzLoop2.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 56), + tzLoop2.tv_usec); +} + +TEST_P(Syscall, gettid) { + RUN_AARCH64(R"( + # gettid() + mov x8, #178 + svc #0 + )"); + EXPECT_EQ(getGeneralRegister(0), 0); +} + +TEST_P(Syscall, getpid) { + RUN_AARCH64(R"( + # getpid() + mov x8, #172 + svc #0 + )"); + EXPECT_EQ(getGeneralRegister(0), 0); +} + +TEST_P(Syscall, getuid) { + RUN_AARCH64(R"( + # getuid() + mov x8, #174 + svc #0 + )"); + EXPECT_EQ(getGeneralRegister(0), 0); +} + +TEST_P(Syscall, geteuid) { + RUN_AARCH64(R"( + # geteuid() + mov x8, #175 + svc #0 + )"); + EXPECT_EQ(getGeneralRegister(0), 0); +} + +TEST_P(Syscall, getgid) { + RUN_AARCH64(R"( + # getgid() + mov x8, #176 + svc #0 + )"); + EXPECT_EQ(getGeneralRegister(0), 0); +} + +TEST_P(Syscall, getegid) { + RUN_AARCH64(R"( + # getegid() + mov x8, #177 + svc #0 + )"); + EXPECT_EQ(getGeneralRegister(0), 0); +} + +// TODO: write sysinfo test +// TODO: write shutdown test + +TEST_P(Syscall, mprotect) { + // Check mprotect returns placeholder value as currently not implemented + RUN_AARCH64(R"( + # mprotect(addr=47472, 
len=4096, prot=1) = 0 + mov x0, #47472 + mov x1, #4096 + mov x2, #1 + mov x8, #226 + svc #0 + )"); + EXPECT_EQ(getGeneralRegister(0), 0); +} + +// TODO: write mbind test +// TODO: write prlimit64 test +// TODO: write rseq test + +TEST_P(Syscall, munmap) { + // Test that no errors are given during expected usage + RUN_AARCH64(R"( + # mmap(addr=NULL, length=65536, prot=3, flags=34, fd=-1, offset=0) + mov x0, #0 + mov x1, #65536 + mov x2, #3 + mov x3, #34 + mov x4, #-1 + mov x5, #0 + mov x8, #222 + svc #0 + mov x9, x0 + + # munmap(addr=mmapStart_, length=65536, prot=3, flags=34, fd=-1, offset=0) + mov x0, x9 + mov x1, #65536 + mov x8, #215 + svc #0 + mov x10, x0 + + # munmap(addr=mmapStart_, length=65536, prot=3, flags=34, fd=-1, offset=0) + mov x0, x9 + mov x1, #65536 + mov x8, #215 + svc #0 + mov x11, x0 + )"); + EXPECT_EQ(getGeneralRegister(9), process_->getMmapStart()); + EXPECT_EQ(getGeneralRegister(10), 0); + EXPECT_EQ(getGeneralRegister(11), 0); + + // Test that EINVAL error types trigger + RUN_AARCH64(R"( + # mmap(addr=NULL, length=1024, prot=3, flags=34, fd=-1, offset=0) + mov x0, #0 + mov x1, #1024 + mov x2, #3 + mov x3, #34 + mov x4, #-1 + mov x5, #0 + mov x8, #222 + svc #0 + mov x9, x0 + + # munmap(addr=mmapStart_, length=65536, prot=3, flags=34, fd=-1, offset=0) + mov x0, x9 + mov x1, #65536 + mov x8, #215 + svc #0 + mov x10, x0 + + # munmap(addr=mmapStart_, length=65536, prot=3, flags=34, fd=-1, offset=0) + add x9, x9, #1024 + mov x0, x9 + mov x1, #65536 + mov x8, #215 + svc #0 + mov x11, x0 + )"); + EXPECT_EQ(getGeneralRegister(9), process_->getMmapStart() + 1024); + EXPECT_EQ(getGeneralRegister(10), -1); + EXPECT_EQ(getGeneralRegister(11), -1); +} + +TEST_P(Syscall, mmap) { + // Test for 3 consecutive allocations + RUN_AARCH64(R"( + # mmap(addr=NULL, length=65536, prot=3, flags=34, fd=-1, offset=0) + mov x0, #0 + mov x1, #65536 + mov x2, #3 + mov x3, #34 + mov x4, #-1 + mov x5, #0 + mov x8, #222 + svc #0 + mov x9, x0 + + # mmap(addr=NULL, length=1024, prot=3, flags=34, fd=-1, offset=0) + mov x0, #0 + mov x1, #1024 + mov x2, #3 + mov x3, #34 + mov x4, #-1 + mov x5, #0 + mov x8, #222 + svc #0 + mov x10, x0 + + # mmap(addr=NULL, length=16384, prot=3, flags=34, fd=-1, offset=0) + mov x0, #0 + mov x1, #16384 + mov x2, #3 + mov x3, #34 + mov x4, #-1 + mov x5, #0 + mov x8, #222 + svc #0 + mov x11, x0 + )"); + EXPECT_EQ(getGeneralRegister(9), process_->getMmapStart()); + EXPECT_EQ(getGeneralRegister(10), process_->getMmapStart() + 65536); + EXPECT_EQ(getGeneralRegister(11), process_->getMmapStart() + 69632); + + // Test for mmap allocation between two previous allocations + RUN_AARCH64(R"( + # Setup 3 contiguous allocations + # mmap(addr=NULL, length=1024, prot=3, flags=34, fd=-1, offset=0) + mov x0, #0 + mov x1, #1024 + mov x2, #3 + mov x3, #34 + mov x4, #-1 + mov x5, #0 + mov x8, #222 + svc #0 + mov x9, x0 + + # mmap(addr=NULL, length=12288, prot=3, flags=34, fd=-1, offset=0) + mov x0, #0 + mov x1, #12288 + mov x2, #3 + mov x3, #34 + mov x4, #-1 + mov x5, #0 + mov x8, #222 + svc #0 + mov x10, x0 + + # mmap(addr=NULL, length=1024, prot=3, flags=34, fd=-1, offset=0) + mov x0, #0 + mov x1, #1024 + mov x2, #3 + mov x3, #34 + mov x4, #-1 + mov x5, #0 + mov x8, #222 + svc #0 + mov x11, x0 + + # unmap second allocation to create an empty space between allocations + # munmap(addr=x10, length=12288, prot=3, flags=34, fd=-1, offset=0) + mov x0, x10 + mov x1, #12288 + mov x8, #215 + svc #0 + mov x12, x0 + + # Allocate a region larger than the new empty space + # mmap(addr=NULL, length=16384, 
prot=3, flags=34, fd=-1, offset=0) + mov x0, #0 + mov x1, #16384 + mov x2, #3 + mov x3, #34 + mov x4, #-1 + mov x5, #0 + mov x8, #222 + svc #0 + mov x13, x0 + + # Two allocations whose combined length equals the new empty space + # mmap(addr=NULL, length=4096, prot=3, flags=34, fd=-1, offset=0) + mov x0, #0 + mov x1, #4096 + mov x2, #3 + mov x3, #34 + mov x4, #-1 + mov x5, #0 + mov x8, #222 + svc #0 + mov x14, x0 + + # mmap(addr=NULL, length=8192, prot=3, flags=34, fd=-1, offset=0) + mov x0, #0 + mov x1, #8192 + mov x2, #3 + mov x3, #34 + mov x4, #-1 + mov x5, #0 + mov x8, #222 + svc #0 + mov x15, x0 + )"); + EXPECT_EQ(getGeneralRegister(9), process_->getMmapStart()); + EXPECT_EQ(getGeneralRegister(10), process_->getMmapStart() + 4096); + EXPECT_EQ(getGeneralRegister(11), process_->getMmapStart() + 16384); + EXPECT_EQ(getGeneralRegister(12), 0); + EXPECT_EQ(getGeneralRegister(13), process_->getMmapStart() + 20480); + EXPECT_EQ(getGeneralRegister(14), process_->getMmapStart() + 4096); + EXPECT_EQ(getGeneralRegister(15), process_->getMmapStart() + 8192); +} + +TEST_P(Syscall, getrandom) { + initialHeapData_.resize(24); + memset(initialHeapData_.data(), -1, 16); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + # store initial heap address + mov x10, x0 + + # Save 8 random bytes to the heap + # getrandom(buf * = [a], buflen = 8, no flags) + mov x1, #8 + mov x8, #278 + svc #0 + + # Save another 8 random bytes to the heap + # getrandom(buf * = [a], buflen = 8, no flags) + add x0, x10, #8 + mov x1, #8 + mov x8, #278 + svc #0 + + )"); + + // Check getrandom returned 8 (8 bytes were requested) + EXPECT_EQ(getGeneralRegister(0), 8); + + int heapStart = getGeneralRegister(10); + for (size_t i = 0; i < 8; i++) { + printf("compare %x == %x\n", getMemoryValue(heapStart + i), + getMemoryValue(heapStart + 8 + i)); + } + + // Check that the returned bytes aren't all equal to -1. + // heap was initialised to -1 so check bytes have changed + bool allUnchanged = true; + for (size_t i = 0; i < 16; i++) { + if (getMemoryValue(heapStart + i) != 0xFF) { + allUnchanged = false; + break; + } + } + EXPECT_EQ(allUnchanged, false); + + // Check that the returned bytes from the two syscalls don't all match. 
+ // If they do then the returned bytes surely weren't random + bool allMatch = true; + for (char i = 0; i < 8; i++) { + if (getMemoryValue(heapStart + i) != + getMemoryValue(heapStart + 8 + i)) { + allMatch = false; + break; + } + } + + EXPECT_EQ(allMatch, false); } INSTANTIATE_TEST_SUITE_P( AArch64, Syscall, - ::testing::Values(std::make_tuple(EMULATION, YAML::Load("{}")), - std::make_tuple(INORDER, YAML::Load("{}")), - std::make_tuple(OUTOFORDER, YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}"), + std::make_tuple(INORDER, "{}"), + std::make_tuple(OUTOFORDER, + "{L1-Data-Memory: " + "{Interface-Type: Fixed}}")), paramToString); } // namespace diff --git a/test/regression/aarch64/SystemRegisters.cc b/test/regression/aarch64/SystemRegisters.cc index 869526f600..ea361891d5 100644 --- a/test/regression/aarch64/SystemRegisters.cc +++ b/test/regression/aarch64/SystemRegisters.cc @@ -87,9 +87,11 @@ TEST_P(SystemRegister, counter_timers) { INSTANTIATE_TEST_SUITE_P( AArch64, SystemRegister, - ::testing::Values(std::make_tuple(EMULATION, YAML::Load("{}")), - std::make_tuple(INORDER, YAML::Load("{}")), - std::make_tuple(OUTOFORDER, YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}"), + std::make_tuple(INORDER, "{}"), + std::make_tuple(OUTOFORDER, + "{L1-Data-Memory: " + "{Interface-Type: Fixed}}")), paramToString); } // namespace diff --git a/test/regression/aarch64/data/truncate-test.txt b/test/regression/aarch64/data/truncate-test.txt index 12b437c152..94adc17b9e 100644 Binary files a/test/regression/aarch64/data/truncate-test.txt and b/test/regression/aarch64/data/truncate-test.txt differ diff --git a/test/regression/aarch64/instructions/arithmetic.cc b/test/regression/aarch64/instructions/arithmetic.cc index 2945e2d1f4..0d2552f355 100644 --- a/test/regression/aarch64/instructions/arithmetic.cc +++ b/test/regression/aarch64/instructions/arithmetic.cc @@ -3,6 +3,7 @@ namespace { using InstArithmetic = AArch64RegressionTest; +using namespace simeng::arch::aarch64::InstructionGroups; TEST_P(InstArithmetic, add) { RUN_AARCH64(R"( @@ -353,6 +354,9 @@ TEST_P(InstArithmetic, negsw) { )"); EXPECT_EQ(getNZCV(), 0b1001); EXPECT_EQ(getGeneralRegister(0), static_cast(1ul << 31)); + + EXPECT_GROUP(R"(negs w0, w1)", INT_SIMPLE_ARTH_NOSHIFT); + EXPECT_GROUP(R"(negs w0, w1, lsl 31)", INT_SIMPLE_ARTH); } // Test that NZCV flags are set correctly by 64-bit negs @@ -402,6 +406,9 @@ TEST_P(InstArithmetic, negsx) { )"); EXPECT_EQ(getNZCV(), 0b1001); EXPECT_EQ(getGeneralRegister(0), static_cast(1ul << 63)); + + EXPECT_GROUP(R"(negs x0, x1)", INT_SIMPLE_ARTH_NOSHIFT); + EXPECT_GROUP(R"(negs x0, x1, lsl 31)", INT_SIMPLE_ARTH); } TEST_P(InstArithmetic, sbc) { @@ -664,8 +671,7 @@ TEST_P(InstArithmetic, umsubl) { } INSTANTIATE_TEST_SUITE_P(AArch64, InstArithmetic, - ::testing::Values(std::make_tuple(EMULATION, - YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}")), paramToString); } // namespace diff --git a/test/regression/aarch64/instructions/bitmanip.cc b/test/regression/aarch64/instructions/bitmanip.cc index f44f620a06..a72dcb64dc 100644 --- a/test/regression/aarch64/instructions/bitmanip.cc +++ b/test/regression/aarch64/instructions/bitmanip.cc @@ -3,11 +3,12 @@ namespace { using InstBitmanip = AArch64RegressionTest; +using namespace simeng::arch::aarch64::InstructionGroups; TEST_P(InstBitmanip, bfm) { // 32-bit RUN_AARCH64(R"( - # Fill desintation registers with 1s + # Fill destination registers with 1s mov w0, wzr sub w1, w0, #1 sub w2, w0, #1 @@ 
-29,7 +30,7 @@ TEST_P(InstBitmanip, bfm) {
 // 64-bit
 RUN_AARCH64(R"(
- # Fill desintation registers with 1s
+ # Fill destination registers with 1s
 mov x0, xzr
 sub x1, x0, #1
 sub x2, x0, #1
@@ -172,7 +173,7 @@ TEST_P(InstBitmanip, rev) {
 TEST_P(InstBitmanip, sbfm) {
 // 32-bit
 RUN_AARCH64(R"(
- # Fill desintation registers with 1s
+ # Fill destination registers with 1s
 mov w0, wzr
 sub w1, w0, #1
 sub w2, w0, #1
@@ -207,7 +208,7 @@ TEST_P(InstBitmanip, sbfm) {
 // 64-bit
 RUN_AARCH64(R"(
- # Fill desintation registers with 1s
+ # Fill destination registers with 1s
 mov x0, xzr
 sub x1, x0, #1
 sub x2, x0, #1
@@ -243,12 +244,23 @@ TEST_P(InstBitmanip, sbfm) {
 EXPECT_EQ(getGeneralRegister(7), 0x78);
 EXPECT_EQ(getGeneralRegister(8), 0x5678);
 EXPECT_EQ(getGeneralRegister(9), 0x12345678);
+
+ EXPECT_GROUP(R"(sbfm w4, w0, #30, #27)", INT_SIMPLE_ARTH_NOSHIFT);
+ EXPECT_GROUP(R"(sbfm x6, x0, #32, #22)", INT_SIMPLE_ARTH_NOSHIFT);
+
+ EXPECT_GROUP(R"(sxtb w7, w0)", INT_SIMPLE_ARTH_NOSHIFT);
+ EXPECT_GROUP(R"(sxtb x7, w0)", INT_SIMPLE_ARTH_NOSHIFT);
+
+ EXPECT_GROUP(R"(sxth w7, w0)", INT_SIMPLE_ARTH_NOSHIFT);
+ EXPECT_GROUP(R"(sxth x7, w0)", INT_SIMPLE_ARTH_NOSHIFT);
+
+ EXPECT_GROUP(R"(sxtw x7, w0)", INT_SIMPLE_ARTH_NOSHIFT);
 }
 TEST_P(InstBitmanip, ubfm) {
 // 32-bit
 RUN_AARCH64(R"(
- # Fill desintation registers with 1s
+ # Fill destination registers with 1s
 mov w0, wzr
 sub w1, w0, #1
 sub w2, w0, #1
@@ -269,7 +281,7 @@ TEST_P(InstBitmanip, ubfm) {
 EXPECT_EQ(getGeneralRegister(4), 0x01E80000ull);
 RUN_AARCH64(R"(
- # Fill desintation registers with 1s
+ # Fill destination registers with 1s
 mov x0, xzr
 sub x1, x0, #1
 sub x2, x0, #1
@@ -291,8 +303,7 @@ TEST_P(InstBitmanip, ubfm) {
 }
 INSTANTIATE_TEST_SUITE_P(AArch64, InstBitmanip,
- ::testing::Values(std::make_tuple(EMULATION,
- YAML::Load("{}"))),
+ ::testing::Values(std::make_tuple(EMULATION, "{}")),
 paramToString);
 } // namespace
diff --git a/test/regression/aarch64/instructions/comparison.cc b/test/regression/aarch64/instructions/comparison.cc
index 3aed7d44eb..c91c48ee00 100644
--- a/test/regression/aarch64/instructions/comparison.cc
+++ b/test/regression/aarch64/instructions/comparison.cc
@@ -3,6 +3,16 @@ namespace {
 using InstComparison = AArch64RegressionTest;
+using namespace simeng::arch::aarch64::InstructionGroups;
+
+// Similar to RISC-V atomic instructions, read-modify-write operations (i.e. a
+// load, comparison, and store) are given the group LOAD_INT only. The
+// instruction object is tagged with the appropriate identifiers (isLoad and
+// isStore), but the group only reflects the first stage of execution. This
+// ensures the instruction goes to the correct part of the pipeline, i.e. the
+// LSQ. However, the rest of the atomic behaviour is not modelled precisely, as
+// the comparison also happens at this stage. The change in the instruction's
+// behaviour over its lifetime is currently not reflected in the group it is given.
// Test correct Value stored after comparison for CASAL (32 & 64 bit) TEST_P(InstComparison, casal) { @@ -36,7 +46,9 @@ TEST_P(InstComparison, casal) { EXPECT_EQ(getMemoryValue(getGeneralRegister(0)), 0xDEADBEEF); EXPECT_EQ(getMemoryValue(getGeneralRegister(3)), 100); - EXPECT_EQ(getMemoryValue(process_->getStackPointer()), 89); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer()), 89); + + EXPECT_GROUP("casal w1, w2, [x0]", LOAD_INT); // 64-bit initialHeapData_.resize(16); @@ -68,7 +80,9 @@ TEST_P(InstComparison, casal) { EXPECT_EQ(getMemoryValue(getGeneralRegister(0)), 0xDEADBEEF); EXPECT_EQ(getMemoryValue(getGeneralRegister(3)), 101); - EXPECT_EQ(getMemoryValue(process_->getStackPointer()), 76); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer()), 76); + + EXPECT_GROUP("casal x1, x7, [sp]", LOAD_INT); } // Test that NZCV flags are set correctly by the 32-bit cmn instruction @@ -94,6 +108,8 @@ TEST_P(InstComparison, cmnw) { cmn w0, #0x1 )"); EXPECT_EQ(getNZCV(), 0b0110); + + EXPECT_GROUP(R"(cmn w0, #0x1)", INT_SIMPLE_CMP); } // Test that NZCV flags are set correctly by the 64-bit cmn instruction @@ -119,6 +135,8 @@ TEST_P(InstComparison, cmnx) { cmn x0, #0x1 )"); EXPECT_EQ(getNZCV(), 0b0110); + + EXPECT_GROUP(R"(cmn X0, #0x1)", INT_SIMPLE_CMP); } // Test that NZCV flags are set correctly by the 32-bit ccmn instruction @@ -190,6 +208,9 @@ TEST_P(InstComparison, tstw) { tst w0, #0x80000000 )"); EXPECT_EQ(getNZCV(), 0b1000); + + EXPECT_GROUP(R"(tst w0, w2)", INT_SIMPLE_LOGICAL_NOSHIFT); + EXPECT_GROUP(R"(tst w0, #0x80000000)", INT_SIMPLE_LOGICAL_NOSHIFT); } // Test that NZCV flags are set correctly by 32-bit cmp @@ -243,6 +264,8 @@ TEST_P(InstComparison, cmpw) { cmp w1, #1 )"); EXPECT_EQ(getNZCV(), 0b0011); + + EXPECT_GROUP(R"(cmp w1, #1)", INT_SIMPLE_CMP); } // Test that NZCV flags are set correctly by 64-bit cmp @@ -323,6 +346,8 @@ TEST_P(InstComparison, cmpx) { cmp x0, x2, uxtx 4 )"); EXPECT_EQ(getNZCV(), 0b0010); + + EXPECT_GROUP(R"(cmp x0, x2, uxtx 4)", INT_SIMPLE_CMP); } // Test that NZCV flags are set correctly by 64-bit tst @@ -347,11 +372,13 @@ TEST_P(InstComparison, tstx) { tst x0, #0x8000000000000000 )"); EXPECT_EQ(getNZCV(), 0b1000); + + EXPECT_GROUP(R"(tst x0, x2)", INT_SIMPLE_LOGICAL_NOSHIFT); + EXPECT_GROUP(R"(tst x0, #0b0010)", INT_SIMPLE_LOGICAL_NOSHIFT); } INSTANTIATE_TEST_SUITE_P(AArch64, InstComparison, - ::testing::Values(std::make_tuple(EMULATION, - YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}")), paramToString); } // namespace diff --git a/test/regression/aarch64/instructions/conditional.cc b/test/regression/aarch64/instructions/conditional.cc index e4ec342700..e10d8b297d 100644 --- a/test/regression/aarch64/instructions/conditional.cc +++ b/test/regression/aarch64/instructions/conditional.cc @@ -3,6 +3,7 @@ namespace { using InstConditional = AArch64RegressionTest; +using namespace simeng::arch::aarch64::InstructionGroups; TEST_P(InstConditional, ccmn) { // 64-bit @@ -120,6 +121,9 @@ TEST_P(InstConditional, csetm) { EXPECT_EQ(getGeneralRegister(4), 0); EXPECT_EQ(getGeneralRegister(5), 0); EXPECT_EQ(getGeneralRegister(6), -1); + + EXPECT_GROUP(R"(csetm w6, le)", INT_SIMPLE_ARTH_NOSHIFT); + EXPECT_GROUP(R"(csetm x6, le)", INT_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstConditional, csinc) { @@ -166,6 +170,11 @@ TEST_P(InstConditional, csinc) { EXPECT_EQ(getGeneralRegister(6), 42u); EXPECT_EQ(getGeneralRegister(7), 42u); EXPECT_EQ(getGeneralRegister(8), 43u); + + EXPECT_GROUP(R"(csinc w6, w1, w2, le)", INT_SIMPLE_ARTH_NOSHIFT); + 
EXPECT_GROUP(R"(cinc w8, w1, ge)", INT_SIMPLE_ARTH_NOSHIFT); + EXPECT_GROUP(R"(csinc x6, x1, x2, le)", INT_SIMPLE_ARTH_NOSHIFT); + EXPECT_GROUP(R"(cinc x8, x1, ge)", INT_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstConditional, csneg) { @@ -212,6 +221,9 @@ TEST_P(InstConditional, csneg) { EXPECT_EQ(getGeneralRegister(6), 42); EXPECT_EQ(getGeneralRegister(7), 42); EXPECT_EQ(getGeneralRegister(8), -42); + + EXPECT_GROUP(R"(cneg w8, w1, ge)", INT_SIMPLE_ARTH_NOSHIFT); + EXPECT_GROUP(R"(cneg x8, x1, ge)", INT_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstConditional, tbz) { @@ -253,8 +265,7 @@ TEST_P(InstConditional, tbz) { } INSTANTIATE_TEST_SUITE_P(AArch64, InstConditional, - ::testing::Values(std::make_tuple(EMULATION, - YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}")), paramToString); } // namespace diff --git a/test/regression/aarch64/instructions/divide.cc b/test/regression/aarch64/instructions/divide.cc index a6a36663d3..7c96915b62 100644 --- a/test/regression/aarch64/instructions/divide.cc +++ b/test/regression/aarch64/instructions/divide.cc @@ -62,8 +62,7 @@ TEST_P(InstDiv, udiv) { } INSTANTIATE_TEST_SUITE_P(AArch64, InstDiv, - ::testing::Values(std::make_tuple(EMULATION, - YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}")), paramToString); } // namespace diff --git a/test/regression/aarch64/instructions/float.cc b/test/regression/aarch64/instructions/float.cc index fbb21ba60d..03f3f799df 100644 --- a/test/regression/aarch64/instructions/float.cc +++ b/test/regression/aarch64/instructions/float.cc @@ -533,12 +533,18 @@ TEST_P(InstFloat, fcvt) { } TEST_P(InstFloat, fcvtzu) { - initialHeapData_.resize(32); + initialHeapData_.resize(80); double* dheap = reinterpret_cast(initialHeapData_.data()); dheap[0] = 1.0; dheap[1] = -42.76; dheap[2] = -0.125; dheap[3] = 321.5; + dheap[4] = std::nan(""); + dheap[5] = -std::nan(""); + dheap[6] = INFINITY; + dheap[7] = -INFINITY; + dheap[8] = 4294967296.0; // uint32_max + 1 + dheap[9] = 18446744073709551616.0; // uint64_max + 1 // Double to uint32 RUN_AARCH64(R"( @@ -549,15 +555,30 @@ TEST_P(InstFloat, fcvtzu) { ldp d0, d1, [x0] ldp d2, d3, [x0, #16] + ldp d4, d5, [x0, #32] + ldp d6, d7, [x0, #48] + ldp d8, d9, [x0, #64] fcvtzu w0, d0 fcvtzu w1, d1 fcvtzu w2, d2 fcvtzu w3, d3 + fcvtzu w4, d4 + fcvtzu w5, d5 + fcvtzu w6, d6 + fcvtzu w7, d7 + fcvtzu w8, d8 + fcvtzu w9, d9 )"); EXPECT_EQ((getGeneralRegister(0)), 1); - EXPECT_EQ((getGeneralRegister(1)), -42); + EXPECT_EQ((getGeneralRegister(1)), 0); EXPECT_EQ((getGeneralRegister(2)), 0); EXPECT_EQ((getGeneralRegister(3)), 321); + EXPECT_EQ((getGeneralRegister(4)), 0); + EXPECT_EQ((getGeneralRegister(5)), 0); + EXPECT_EQ((getGeneralRegister(6)), UINT32_MAX); + EXPECT_EQ((getGeneralRegister(7)), 0); + EXPECT_EQ((getGeneralRegister(8)), UINT32_MAX); + EXPECT_EQ((getGeneralRegister(9)), UINT32_MAX); // Double to uint64 RUN_AARCH64(R"( @@ -568,17 +589,32 @@ TEST_P(InstFloat, fcvtzu) { ldp d0, d1, [x0] ldp d2, d3, [x0, #16] + ldp d4, d5, [x0, #32] + ldp d6, d7, [x0, #48] + ldp d8, d9, [x0, #64] fcvtzu x0, d0 fcvtzu x1, d1 fcvtzu x2, d2 fcvtzu x3, d3 + fcvtzu x4, d4 + fcvtzu x5, d5 + fcvtzu x6, d6 + fcvtzu x7, d7 + fcvtzu x8, d8 + fcvtzu x9, d9 )"); EXPECT_EQ((getGeneralRegister(0)), 1); - EXPECT_EQ((getGeneralRegister(1)), -42); + EXPECT_EQ((getGeneralRegister(1)), 0); EXPECT_EQ((getGeneralRegister(2)), 0); EXPECT_EQ((getGeneralRegister(3)), 321); + EXPECT_EQ((getGeneralRegister(4)), 0); + EXPECT_EQ((getGeneralRegister(5)), 0); + EXPECT_EQ((getGeneralRegister(6)), UINT64_MAX); + 
EXPECT_EQ((getGeneralRegister(7)), 0); + EXPECT_EQ((getGeneralRegister(8)), 4294967296); + EXPECT_EQ((getGeneralRegister(9)), UINT64_MAX); - // Double to implicit_cast(uint64) + // Double to scalar uint64 RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -587,25 +623,44 @@ TEST_P(InstFloat, fcvtzu) { ldp d0, d1, [x0] ldp d2, d3, [x0, #16] + ldp d4, d5, [x0, #32] + ldp d6, d7, [x0, #48] + ldp d8, d9, [x0, #64] fcvtzu d10, d0 fcvtzu d11, d1 fcvtzu d12, d2 fcvtzu d13, d3 + fcvtzu d14, d4 + fcvtzu d15, d5 + fcvtzu d16, d6 + fcvtzu d17, d7 + fcvtzu d18, d8 + fcvtzu d19, d9 )"); // Values verified on A64FX via simple assembly test kernel - double a = 4.9406564584124654e-324; - double b = 0.0; - double c = 1.5859507231504014e-321; - CHECK_NEON(10, double, {a, 0.0}); - CHECK_NEON(11, double, {b, 0.0}); - CHECK_NEON(12, double, {b, 0.0}); - CHECK_NEON(13, double, {c, 0.0}); + CHECK_NEON(10, uint64_t, {1, 0}); + CHECK_NEON(11, uint64_t, {0, 0}); + CHECK_NEON(12, uint64_t, {0, 0}); + CHECK_NEON(13, uint64_t, {321, 0}); + CHECK_NEON(14, uint64_t, {0, 0}); + CHECK_NEON(15, uint64_t, {0, 0}); + CHECK_NEON(16, uint64_t, {UINT64_MAX, 0}); + CHECK_NEON(17, uint64_t, {0, 0}); + CHECK_NEON(18, uint64_t, {4294967296, 0}); + CHECK_NEON(19, uint64_t, {UINT64_MAX, 0}); float* fheap = reinterpret_cast(initialHeapData_.data()); - fheap[0] = 1.0; - fheap[1] = -42.76; - fheap[2] = -0.125; - fheap[3] = 321.5; + fheap[0] = 1.0f; + fheap[1] = -42.76f; + fheap[2] = -0.125f; + fheap[3] = 321.5f; + fheap[4] = std::nanf(""); + fheap[5] = -std::nanf(""); + fheap[6] = INFINITY; + fheap[7] = -INFINITY; + fheap[8] = 4294967296.0; // uint32_max + 1 + fheap[9] = 18446744073709551616.0; // uint64_max + 1 + // Float to uint32 RUN_AARCH64(R"( # Get heap address @@ -615,15 +670,30 @@ TEST_P(InstFloat, fcvtzu) { ldp s0, s1, [x0] ldp s2, s3, [x0, #8] + ldp s4, s5, [x0, #16] + ldp s6, s7, [x0, #24] + ldp s8, s9, [x0, #32] fcvtzu w0, s0 fcvtzu w1, s1 fcvtzu w2, s2 fcvtzu w3, s3 + fcvtzu w4, s4 + fcvtzu w5, s5 + fcvtzu w6, s6 + fcvtzu w7, s7 + fcvtzu w8, s8 + fcvtzu w9, s9 )"); EXPECT_EQ((getGeneralRegister(0)), 1); - EXPECT_EQ((getGeneralRegister(1)), -42); + EXPECT_EQ((getGeneralRegister(1)), 0); EXPECT_EQ((getGeneralRegister(2)), 0); EXPECT_EQ((getGeneralRegister(3)), 321); + EXPECT_EQ((getGeneralRegister(4)), 0); + EXPECT_EQ((getGeneralRegister(5)), 0); + EXPECT_EQ((getGeneralRegister(6)), UINT32_MAX); + EXPECT_EQ((getGeneralRegister(7)), 0); + EXPECT_EQ((getGeneralRegister(8)), UINT32_MAX); + EXPECT_EQ((getGeneralRegister(9)), UINT32_MAX); // Float to uint64 RUN_AARCH64(R"( @@ -634,15 +704,30 @@ TEST_P(InstFloat, fcvtzu) { ldp s0, s1, [x0] ldp s2, s3, [x0, #8] + ldp s4, s5, [x0, #16] + ldp s6, s7, [x0, #24] + ldp s8, s9, [x0, #32] fcvtzu x0, s0 fcvtzu x1, s1 fcvtzu x2, s2 fcvtzu x3, s3 + fcvtzu x4, s4 + fcvtzu x5, s5 + fcvtzu x6, s6 + fcvtzu x7, s7 + fcvtzu x8, s8 + fcvtzu x9, s9 )"); EXPECT_EQ((getGeneralRegister(0)), 1); - EXPECT_EQ((getGeneralRegister(1)), -42); + EXPECT_EQ((getGeneralRegister(1)), 0); EXPECT_EQ((getGeneralRegister(2)), 0); EXPECT_EQ((getGeneralRegister(3)), 321); + EXPECT_EQ((getGeneralRegister(4)), 0); + EXPECT_EQ((getGeneralRegister(5)), 0); + EXPECT_EQ((getGeneralRegister(6)), UINT64_MAX); + EXPECT_EQ((getGeneralRegister(7)), 0); + EXPECT_EQ((getGeneralRegister(8)), 4294967296); + EXPECT_EQ((getGeneralRegister(9)), UINT64_MAX); } TEST_P(InstFloat, fdiv) { @@ -1455,8 +1540,7 @@ TEST_P(InstFloat, frintp) { } INSTANTIATE_TEST_SUITE_P(AArch64, InstFloat, - ::testing::Values(std::make_tuple(EMULATION, - 
YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}")), paramToString); } // namespace \ No newline at end of file diff --git a/test/regression/aarch64/instructions/load.cc b/test/regression/aarch64/instructions/load.cc index 387d0a058b..09269eebb8 100644 --- a/test/regression/aarch64/instructions/load.cc +++ b/test/regression/aarch64/instructions/load.cc @@ -181,57 +181,343 @@ TEST_P(InstLoad, ld1_tworeg) { // 128-bit } TEST_P(InstLoad, ld1_multi_struct) { - // 16-bit, load into one register - // 16B = 16 elements of one byte - initialHeapData_.resize(16); - uint8_t* heapi8 = reinterpret_cast(initialHeapData_.data()); - heapi8[0] = 0xFF; - heapi8[1] = 0x00; - heapi8[2] = 0x11; - heapi8[3] = 0x22; - heapi8[4] = 0x33; - heapi8[5] = 0x44; - heapi8[6] = 0x55; - heapi8[7] = 0x66; - heapi8[8] = 0x77; - heapi8[9] = 0x88; - heapi8[10] = 0x99; - heapi8[11] = 0xAA; - heapi8[12] = 0xBB; - heapi8[13] = 0xCC; - heapi8[14] = 0xDD; - heapi8[15] = 0xEE; - + initialHeapData_.resize(64); + uint64_t* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 0x66554433221100FF; + heap[1] = 0xEEDDCCBBAA998877; + heap[2] = 0x66554433221100FF; + heap[3] = 0xEEDDCCBBAA998877; + heap[4] = 0x66554433221100FF; + heap[5] = 0xEEDDCCBBAA998877; + heap[6] = 0x66554433221100FF; + heap[7] = 0xEEDDCCBBAA998877; + + // One reg, 16b elements RUN_AARCH64(R"( # Get heap address mov x0, 0 mov x8, 214 svc #0 + mov x1, #16 + # Load values from heap ld1 {v0.16b}, [x0] # save heap address before post index mov x10, x0 - # Load values from heap with post-index + # Load values from heap with imm post-index ld1 {v1.16b}, [x0], #16 # save heap address after post index mov x11, x0 + sub x0, x0, #16 + + # Load values from heap with reg post-index + ld1 {v2.16b}, [x0], x1 + mov x12, x0 )"); CHECK_NEON(0, uint8_t, {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(1, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + EXPECT_EQ(getGeneralRegister(11), + getGeneralRegister(10) + 16); + EXPECT_EQ(getGeneralRegister(12), + getGeneralRegister(10) + 16); + + // Two reg, 16b elements + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #32 + + # Load values from heap + ld1 {v0.16b, v1.16b}, [x0] + # save heap address before post index + mov x10, x0 + + # Load values from heap with imm post-index + ld1 {v2.16b, v3.16b}, [x0], #32 + + # save heap address after post index + mov x11, x0 + sub x0, x0, #32 + + # Load values from heap with reg post-index + ld1 {v4.16b, v5.16b}, [x0], x1 + + mov x12, x0 + )"); + + CHECK_NEON(0, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); CHECK_NEON(1, uint8_t, {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(2, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(3, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(4, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(5, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + EXPECT_EQ(getGeneralRegister(11), + getGeneralRegister(10) + 32); + EXPECT_EQ(getGeneralRegister(12), + 
getGeneralRegister(10) + 32); + + // Two reg, 2d elements + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #32 + + # Load values from heap + ld1 {v0.2d, v1.2d}, [x0] + + # save heap address before post index + mov x10, x0 + # Load values from heap with imm post-index + ld1 {v2.2d, v3.2d}, [x0], #32 + + # save heap address after post index + mov x11, x0 + sub x0, x0, #32 + + # Load values from heap with reg post-index + ld1 {v4.2d, v5.2d}, [x0], x1 + + mov x12, x0 + )"); + + CHECK_NEON(0, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(1, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(2, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(3, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(4, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(5, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); EXPECT_EQ(getGeneralRegister(11), - getGeneralRegister(10) + 16); + getGeneralRegister(10) + 32); + EXPECT_EQ(getGeneralRegister(12), + getGeneralRegister(10) + 32); + + // Two reg, 4s elements + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #32 + + # Load values from heap + ld1 {v0.4s, v1.4s}, [x0] + + # save heap address before post index + mov x10, x0 + + # Load values from heap with imm post-index + ld1 {v2.4s, v3.4s}, [x0], #32 + + # save heap address after post index + mov x11, x0 + sub x0, x0, #32 + + # Load values from heap with reg post-index + ld1 {v4.4s, v5.4s}, [x0], x1 + + mov x12, x0 + )"); + + CHECK_NEON(0, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(1, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(2, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(3, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(4, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(5, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + EXPECT_EQ(getGeneralRegister(11), + getGeneralRegister(10) + 32); + EXPECT_EQ(getGeneralRegister(12), + getGeneralRegister(10) + 32); + + // Four reg, 16b elements + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #64 + + # Load values from heap + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] + + # save heap address before post index + mov x10, x0 + + # Load values from heap with imm post-index + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64 + + # save heap address after post index + mov x11, x0 + sub x0, x0, #64 + + # Load values from heap with reg post-index + ld1 {v8.16b, v9.16b, v10.16b, v11.16b}, [x0], x1 + + mov x12, x0 + )"); + + CHECK_NEON(0, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(1, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(2, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(3, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(4, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(5, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(6, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 
0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(7, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(8, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(9, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(10, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + CHECK_NEON(11, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, + 0xAA, 0xBB, 0xCC, 0xDD, 0xEE}); + EXPECT_EQ(getGeneralRegister(11), + getGeneralRegister(10) + 64); + EXPECT_EQ(getGeneralRegister(12), + getGeneralRegister(10) + 64); + + // Four reg, 2d elements + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #64 + + # Load values from heap + ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] + + # save heap address before post index + mov x10, x0 + + # Load values from heap with imm post-index + ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64 + + # save heap address after post index + mov x11, x0 + sub x0, x0, #64 + + # Load values from heap with reg post-index + ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], x1 + + mov x12, x0 + )"); + + CHECK_NEON(0, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(1, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(2, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(3, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(4, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(5, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(6, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(7, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(8, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(9, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(10, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + CHECK_NEON(11, uint64_t, {0x66554433221100FF, 0xEEDDCCBBAA998877}); + EXPECT_EQ(getGeneralRegister(11), + getGeneralRegister(10) + 64); + EXPECT_EQ(getGeneralRegister(12), + getGeneralRegister(10) + 64); + + // Four reg, 4s elements + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #64 + + # Load values from heap + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + + # save heap address before post index + mov x10, x0 + + # Load values from heap with imm post-index + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x0], #64 + + # save heap address after post index + mov x11, x0 + sub x0, x0, #64 + + # Load values from heap with reg post-index + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0], x1 + + mov x12, x0 + )"); + + CHECK_NEON(0, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(1, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(2, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(3, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(4, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(5, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(6, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(7, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(8, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 
0xEEDDCCBB}); + CHECK_NEON(9, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(10, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + CHECK_NEON(11, uint32_t, {0x221100FF, 0x66554433, 0xAA998877, 0xEEDDCCBB}); + EXPECT_EQ(getGeneralRegister(11), + getGeneralRegister(10) + 64); + EXPECT_EQ(getGeneralRegister(12), + getGeneralRegister(10) + 64); } TEST_P(InstLoad, ld2_multi_struct) { @@ -319,10 +605,14 @@ TEST_P(InstLoad, ldadd) { EXPECT_EQ(getGeneralRegister(6), 48); EXPECT_EQ(getGeneralRegister(7), 128); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1024), 112); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 992), 64); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 960), 176); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 928), 128); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1024), + 112); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 992), + 64); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 960), + 176); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 928), + 128); } TEST_P(InstLoad, ldar) { @@ -1076,8 +1366,7 @@ TEST_P(InstLoad, ldxr) { } INSTANTIATE_TEST_SUITE_P(AArch64, InstLoad, - ::testing::Values(std::make_tuple(EMULATION, - YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}")), paramToString); } // namespace diff --git a/test/regression/aarch64/instructions/logical.cc b/test/regression/aarch64/instructions/logical.cc index 425fb1bc91..b21ca37bb7 100644 --- a/test/regression/aarch64/instructions/logical.cc +++ b/test/regression/aarch64/instructions/logical.cc @@ -3,6 +3,7 @@ namespace { using InstLogical = AArch64RegressionTest; +using namespace simeng::arch::aarch64::InstructionGroups; TEST_P(InstLogical, andw) { // 0 & 0 = 0 @@ -188,6 +189,10 @@ TEST_P(InstLogical, asrw) { asr w0, w0, w1 )"); EXPECT_EQ(getGeneralRegister(0), -8); + + // TODO being noshift seems incorrect - but potentially aliasing to SBF + EXPECT_GROUP(R"(asr w0, w0, wzr)", INT_SIMPLE_ARTH_NOSHIFT); + EXPECT_GROUP(R"(asr w0, w0, #1)", INT_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstLogical, asrx) { @@ -221,6 +226,10 @@ TEST_P(InstLogical, asrx) { asr x0, x0, x1 )"); EXPECT_EQ(getGeneralRegister(0), -8); + + // TODO noshift seems incorrect - but potentially aliasing to SBF + EXPECT_GROUP(R"(asr x0, x0, xzr)", INT_SIMPLE_ARTH_NOSHIFT); + EXPECT_GROUP(R"(asr x0, x0, #2)", INT_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstLogical, bic) { @@ -405,13 +414,18 @@ TEST_P(InstLogical, lsrv) { lsrv x3, x0, xzr lsrv x4, x0, x1 lsrv x5, x0, x2 - # Check lsr alias as xell + # Check lsr alias as well lsr x6, x1, x0 )"); EXPECT_EQ(getGeneralRegister(3), 7ull); EXPECT_EQ(getGeneralRegister(4), 7ull >> 31); EXPECT_EQ(getGeneralRegister(5), 7ull >> 6); EXPECT_EQ(getGeneralRegister(6), 31ull >> 7); + + EXPECT_GROUP(R"(lsr w6, w1, w0)", INT_SIMPLE_ARTH_NOSHIFT); + EXPECT_GROUP(R"(lsr x6, x1, x0)", INT_SIMPLE_ARTH_NOSHIFT); + EXPECT_GROUP(R"(lsr w6, w1, #1)", INT_SIMPLE_ARTH_NOSHIFT); + EXPECT_GROUP(R"(lsr x6, x1, #1)", INT_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstLogical, orn) { @@ -475,6 +489,11 @@ TEST_P(InstLogical, orn) { EXPECT_EQ(getGeneralRegister(6), UINT64_C(-1) & ~UINT64_C(0b0101)); EXPECT_EQ(getGeneralRegister(7), UINT64_C(-1) & ~(UINT64_C(0b0101) << 60)); + + EXPECT_GROUP(R"(mvn w6, w0)", INT_SIMPLE_LOGICAL_NOSHIFT); + EXPECT_GROUP(R"(mvn w7, w0, lsl #28)", INT_SIMPLE_LOGICAL); + EXPECT_GROUP(R"(mvn x6, x1)", INT_SIMPLE_LOGICAL_NOSHIFT); + EXPECT_GROUP(R"(mvn x7, x1, lsl #60)", 
INT_SIMPLE_LOGICAL); } TEST_P(InstLogical, rorv) { @@ -510,8 +529,7 @@ TEST_P(InstLogical, rorv) { } INSTANTIATE_TEST_SUITE_P(AArch64, InstLogical, - ::testing::Values(std::make_tuple(EMULATION, - YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}")), paramToString); } // namespace diff --git a/test/regression/aarch64/instructions/misc.cc b/test/regression/aarch64/instructions/misc.cc index 367e311e72..5ae703f4e6 100644 --- a/test/regression/aarch64/instructions/misc.cc +++ b/test/regression/aarch64/instructions/misc.cc @@ -46,8 +46,7 @@ TEST_P(InstMisc, ret) { } INSTANTIATE_TEST_SUITE_P(AArch64, InstMisc, - ::testing::Values(std::make_tuple(EMULATION, - YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}")), paramToString); } // namespace diff --git a/test/regression/aarch64/instructions/multiply.cc b/test/regression/aarch64/instructions/multiply.cc index 097c710bb7..6a7bc267f3 100644 --- a/test/regression/aarch64/instructions/multiply.cc +++ b/test/regression/aarch64/instructions/multiply.cc @@ -3,6 +3,7 @@ namespace { using InstMul = AArch64RegressionTest; +using namespace simeng::arch::aarch64::InstructionGroups; TEST_P(InstMul, maddw) { RUN_AARCH64(R"( @@ -41,6 +42,16 @@ TEST_P(InstMul, mulw) { mul w2, w0, w1 )"); EXPECT_EQ(getGeneralRegister(2), 42u); + + RUN_AARCH64(R"( + movz x0, #7 + movz x1, #6 + mul x2, x0, x1 + )"); + EXPECT_EQ(getGeneralRegister(2), 42u); + + EXPECT_GROUP(R"(mul w2, w0, w1)", INT_MUL); + EXPECT_GROUP(R"(mul x2, x0, x1)", INT_MUL); } TEST_P(InstMul, smaddl) { @@ -78,6 +89,8 @@ TEST_P(InstMul, smull) { smull x3, w0, w1 )"); EXPECT_EQ(getGeneralRegister(3), 0x0000002A00000000); + + EXPECT_GROUP(R"(smull x3, w0, w1)", INT_MUL); } TEST_P(InstMul, umaddl) { @@ -93,11 +106,12 @@ TEST_P(InstMul, umaddl) { )"); EXPECT_EQ(getGeneralRegister(3), 0x0005002A00000000); EXPECT_EQ(getGeneralRegister(4), 0x0000002A00000000); + + EXPECT_GROUP(R"(umull x4, w0, w1)", INT_MUL); } INSTANTIATE_TEST_SUITE_P(AArch64, InstMul, - ::testing::Values(std::make_tuple(EMULATION, - YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}")), paramToString); } // namespace diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 4131eb5c21..96d23590a6 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -5,6 +5,7 @@ namespace { using InstNeon = AArch64RegressionTest; +using namespace simeng::arch::aarch64::InstructionGroups; TEST_P(InstNeon, add) { // 8-bit vector @@ -316,8 +317,43 @@ TEST_P(InstNeon, addv) { ldr q0, [x0] addv b1, v0.8b )"); - CHECK_NEON(1, uint8_t, {36}); + + // 16-bit + initialHeapData_.resize(16); + uint16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + for (int i = 0; i < 8; i++) { + heap16[i] = 2 * (i + 1); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + addv h1, v0.4h + )"); + CHECK_NEON(1, uint16_t, {20}); + + // 32-bit + initialHeapData_.resize(16); + uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); + for (int i = 0; i < 4; i++) { + heap32[i] = 4 * (i + 1); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + addv s1, v0.4s + )"); + CHECK_NEON(1, uint8_t, {40}); } TEST_P(InstNeon, and) { @@ -684,15 +720,15 @@ TEST_P(InstNeon, cmeq) { TEST_P(InstNeon, cmhs) { // cmhs vd.16b, vn.16b, vm.16b initialHeapData_.resize(32); - int8_t* heap = reinterpret_cast(initialHeapData_.data()); + uint8_t* 
heap = initialHeapData_.data(); // v0 heap[0] = 0; heap[1] = 0x7F; heap[2] = INT8_MAX; heap[3] = 1; - heap[4] = -128; - heap[5] = -1; + heap[4] = 128; + heap[5] = 1; heap[6] = 0xAA; heap[7] = 0xBB; heap[8] = 0xCC; @@ -708,7 +744,7 @@ TEST_P(InstNeon, cmhs) { heap[16] = INT8_MAX; heap[17] = 0x7F; heap[18] = 0; - heap[19] = -128; + heap[19] = 128; heap[20] = 1; heap[21] = 0; heap[22] = 0xAA; @@ -736,10 +772,10 @@ TEST_P(InstNeon, cmhs) { )"); CHECK_NEON(2, uint8_t, - {0x00, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + {0x00, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}); CHECK_NEON(3, uint8_t, - {0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, + {0xFF, 0xFF, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF}); } @@ -1002,6 +1038,64 @@ TEST_P(InstNeon, ext) { 0, 0, 0, 0, 0}); } +TEST_P(InstNeon, fabd) { + // 32-bit v.4s + initialHeapData_.resize(32); + float* fheap = reinterpret_cast(initialHeapData_.data()); + fheap[0] = 1.0; + fheap[1] = -42.75; + fheap[2] = -2.5; + fheap[3] = 32768; + fheap[4] = -0.125; + fheap[5] = 321.0; + fheap[6] = -0.0; + fheap[7] = std::nanf(""); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + fabd v2.4s, v0.4s, v1.4s + )"); + EXPECT_EQ((getVectorRegisterElement(2)), 1.125); + EXPECT_EQ((getVectorRegisterElement(2)), 363.75); + EXPECT_EQ((getVectorRegisterElement(2)), 2.5); + EXPECT_TRUE(std::isnan(getVectorRegisterElement(2))); + + // 64-bit v.2s + initialHeapData_.resize(64); + double* dheap = reinterpret_cast(initialHeapData_.data()); + dheap[0] = 1.0; + dheap[1] = -42.75; + dheap[2] = -2.5; + dheap[3] = 32768; + dheap[4] = -0.125; + dheap[5] = 321.0; + dheap[6] = -0.0; + dheap[7] = std::nanf(""); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #32] + ldr q2, [x0, #16] + ldr q3, [x0, #48] + + fabd v4.2d, v0.2d, v1.2d + fabd v5.2d, v2.2d, v3.2d + )"); + EXPECT_EQ((getVectorRegisterElement(4)), 1.125); + EXPECT_EQ((getVectorRegisterElement(4)), 363.75); + EXPECT_EQ((getVectorRegisterElement(5)), 2.5); + EXPECT_TRUE(std::isnan(getVectorRegisterElement(5))); +} + TEST_P(InstNeon, fabs) { initialHeapData_.resize(32); float* fheap = reinterpret_cast(initialHeapData_.data()); @@ -1281,12 +1375,14 @@ TEST_P(InstNeon, fcmgt) { ldr q0, [x0] ldr q1, [x0, #16] + fcmgt v2.2d, v0.2d, v1.2d - fcmgt v2.2d, v0.2d, #0.0 - fcmgt v3.2d, v1.2d, #0.0 + fcmgt v3.2d, v0.2d, #0.0 + fcmgt v4.2d, v1.2d, #0.0 )"); CHECK_NEON(2, uint64_t, {UINT64_MAX, 0}); - CHECK_NEON(3, uint64_t, {0, 0}); + CHECK_NEON(3, uint64_t, {UINT64_MAX, 0}); + CHECK_NEON(4, uint64_t, {0, 0}); } TEST_P(InstNeon, fcmlt) { // Float @@ -2588,8 +2684,8 @@ TEST_P(InstNeon, uminp) { )"); CHECK_NEON(2, uint8_t, - {0x00, 0x00, 0xEE, 0x11, 0x22, 0x33, 0x44, 0x55, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07, 0x08}); + {0x00, 0xAA, 0xBB, 0xDD, 0x01, 0x03, 0x05, 0x07, 0x00, 0x11, 0x22, + 0x44, 0xEE, 0xCC, 0xAA, 0x88}); } TEST_P(InstNeon, umaxp) { // umaxp vd.16b vn.16b vm.16b @@ -2642,12 +2738,12 @@ TEST_P(InstNeon, umaxp) { ldr q0, [x0] ldr q1, [x0, #16] - umaxp v2.16b, v0.16b, v1.16b + umaxp v2.16b, v1.16b, v0.16b )"); CHECK_NEON(2, uint8_t, - {0x01, 0x00, 0xFF, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0xEE, 0xDD, - 0xCC, 0xBB, 0xAA, 0x99, 0x88}); + {0x00, 0xEE, 0x33, 0x55, 0xFF, 0xDD, 0xBB, 0x99, 0x01, 0xFF, 0xCC, + 0xEE, 0x02, 0x04, 0x06, 0x08}); } TEST_P(InstNeon, smax) { @@ -2931,6 +3027,30 @@ 
TEST_P(InstNeon, sshll) { CHECK_NEON(5, int64_t, {(INT32_MAX - 3), -28}); } +TEST_P(InstNeon, shrn) { + // 8 bytes + initialHeapData_.resize(32); + uint16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + heap16[0] = 32; + heap16[1] = 333; + heap16[2] = (UINT16_MAX); + heap16[3] = 28; + heap16[4] = 64; + heap16[5] = 256; + heap16[6] = 444; + heap16[7] = 56; + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + shrn v1.8b, v0.8h, #2 + )"); + CHECK_NEON(1, uint8_t, {8, 83, 255, 7, 16, 64, 111, 14}); +} TEST_P(InstNeon, sshr) { initialHeapData_.resize(32); int32_t* heap = reinterpret_cast(initialHeapData_.data()); @@ -3414,6 +3534,8 @@ TEST_P(InstNeon, rev) { 6, uint8_t, {0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0x00, 0x00, 0x00}); + EXPECT_GROUP(R"(rev64 v2.4h, v0.4h)", VECTOR_SIMPLE_ARTH_NOSHIFT); + // REV32 RUN_AARCH64(R"( index z0.b, #0, #1 @@ -3600,9 +3722,77 @@ TEST_P(InstNeon, uzp) { CHECK_NEON(7, uint32_t, {0xe0c0a08, 0x1e1c1a18, 0xf0d0b09, 0x1f1d1b19}); CHECK_NEON(8, uint64_t, {0x1e1c1a1816141210, 0x1f1d1b1917151311}); } + +TEST_P(InstNeon, zip) { + initialHeapData_.resize(128); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + heap64[0] = 0x0F0D0B0907050301; + heap64[1] = 0x1F1D1B1917151311; + heap64[2] = 0x0E0C0A0806040200; + heap64[3] = 0x1E1C1A1816141210; + + // zip1 + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + + zip1 v2.16b, v1.16b, v0.16b + zip1 v3.8b, v1.8b, v0.8b + zip1 v4.8h, v1.8h, v0.8h + zip1 v5.4h, v1.4h, v0.4h + zip1 v6.4s, v1.4s, v0.4s + zip1 v7.2s, v1.2s, v0.2s + zip1 v8.2d, v1.2d, v0.2d + )"); + + CHECK_NEON(2, uint8_t, + {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, + 0x0B, 0x0C, 0x0D, 0x0E, 0x0F}); + CHECK_NEON(3, uint8_t, {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07}); + CHECK_NEON(4, uint16_t, + {0x0200, 0x0301, 0x0604, 0x0705, 0x0A08, 0x0B09, 0x0E0C, 0x0F0D}); + CHECK_NEON(5, uint16_t, {0x0200, 0x0301, 0x0604, 0x0705}); + CHECK_NEON(6, uint32_t, {0x06040200, 0x07050301, 0x0E0C0A08, 0x0F0D0B09}); + CHECK_NEON(7, uint32_t, {0x06040200, 0x07050301}); + CHECK_NEON(8, uint64_t, {0x0E0C0A0806040200, 0x0F0D0B0907050301}); + + // zip2 + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + + zip2 v2.16b, v1.16b, v0.16b + zip2 v3.8b, v1.8b, v0.8b + zip2 v4.8h, v1.8h, v0.8h + zip2 v5.4h, v1.4h, v0.4h + zip2 v6.4s, v1.4s, v0.4s + zip2 v7.2s, v1.2s, v0.2s + zip2 v8.2d, v1.2d, v0.2d + )"); + + CHECK_NEON(2, uint8_t, + {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, + 0x1B, 0x1C, 0x1D, 0x1E, 0x1F}); + CHECK_NEON(3, uint8_t, {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F}); + CHECK_NEON(4, uint16_t, + {0x1210, 0x1311, 0x1614, 0x1715, 0x1A18, 0x1B19, 0x1E1C, 0x1F1D}); + CHECK_NEON(5, uint16_t, {0x0A08, 0x0B09, 0x0E0C, 0x0F0D}); + CHECK_NEON(6, uint32_t, {0x16141210, 0x17151311, 0x1E1C1A18, 0x1F1D1B19}); + CHECK_NEON(7, uint32_t, {0x0E0C0A08, 0x0F0D0B09}); + CHECK_NEON(8, uint64_t, {0x1E1C1A1816141210, 0x1F1D1B1917151311}); +} INSTANTIATE_TEST_SUITE_P(AArch64, InstNeon, - ::testing::Values(std::make_tuple(EMULATION, - YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}")), paramToString); } // namespace \ No newline at end of file diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 78f93048c2..a54c0c981a 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ 
b/test/regression/aarch64/instructions/sme.cc @@ -8,42 +8,269 @@ namespace { using InstSme = AArch64RegressionTest; #if SIMENG_LLVM_VERSION >= 14 -TEST_P(InstSme, fmopa) { +TEST_P(InstSme, addha) { // 32-bit RUN_AARCH64(R"( smstart - fdup z1.s, #2.0 - fdup z2.s, #5.0 + zero {za} + ptrue p0.s - ptrue p1.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s - fmopa za0.s, p0/m, p1/m, z1.s, z2.s + dup z0.s, #65 + index z1.s, #0, #1 - fdup z3.s, #3.0 - fdup z4.s, #8.0 - mov x0, #0 - mov x1, #8 - addvl x0, x0, #1 - udiv x0, x0, x1 - whilelo p2.s, xzr, x0 + # Add to all rows and elems + addha za0.s, p0/m, p0/m, z1.s - fmopa za2.s, p0/m, p2/m, z3.s, z4.s + # Add to all rows, even numbered elements + addha za1.s, p0/m, p0/m, z0.s + addha za1.s, p0/m, p1/m, z1.s + + # Add to even rows, all elements + addha za2.s, p0/m, p0/m, z0.s + addha za2.s, p1/m, p0/m, z1.s + + # Even numbered rows, even numbered elements + addha za3.s, p0/m, p0/m, z0.s + addha za3.s, p1/m, p1/m, z1.s )"); - for (int i = 0; i < (SVL / 32); i++) { - CHECK_MAT_ROW(ARM64_REG_ZAS0, i, float, - fillNeon({10.0f}, (SVL / 8))); - CHECK_MAT_ROW(ARM64_REG_ZAS2, i, float, - fillNeon({24.0f}, (SVL / 16))); + std::vector full32(64, 0); + std::vector index32(64, 0); + std::vector inter32(64, 0); + for (uint16_t i = 0; i < 64; i++) { + full32[i] = 65; + index32[i] = i; + inter32[i] = (i % 2 == 0) ? i : 65; + } + + for (uint32_t i = 0; i < (SVL / 32); i++) { + // All rows, all elems + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon(index32, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({i}, (SVL / 8))); + // All rows, even elements + CHECK_MAT_ROW(AARCH64_REG_ZAS1, i, uint32_t, + fillNeon(inter32, (SVL / 8))); + if (i % 2 == 0) { + // Even rows, all elements + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon(index32, (SVL / 8))); + // Even rows, even elements + CHECK_MAT_ROW(AARCH64_REG_ZAS3, i, uint32_t, + fillNeon(inter32, (SVL / 8))); + } else { + // Even rows, all elements + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon(full32, (SVL / 8))); + // Even rows, even elements + CHECK_MAT_ROW(AARCH64_REG_ZAS3, i, uint32_t, + fillNeon(full32, (SVL / 8))); + } + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d + + dup z0.d, #65 + index z1.d, #0, #1 + + # Add to all rows and elems + addha za0.d, p0/m, p0/m, z1.d + + # Add to all rows, even numbered elements + addha za1.d, p0/m, p0/m, z0.d + addha za1.d, p0/m, p1/m, z1.d + + # Add to even rows, all elements + addha za2.d, p0/m, p0/m, z0.d + addha za2.d, p1/m, p0/m, z1.d + + # Even numbered rows, even numbered elements + addha za3.d, p0/m, p0/m, z0.d + addha za3.d, p1/m, p1/m, z1.d + )"); + std::vector full64(32, 0); + std::vector index64(32, 0); + std::vector inter64(32, 0); + for (uint16_t i = 0; i < 32; i++) { + full64[i] = 65; + index64[i] = i; + inter64[i] = (i % 2 == 0) ? 
i : 65; + } + + for (uint64_t i = 0; i < (SVL / 64); i++) { + // All rows, all elems + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon(index64, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({i}, (SVL / 8))); + // All rows, even elements + CHECK_MAT_ROW(AARCH64_REG_ZAD1, i, uint64_t, + fillNeon(inter64, (SVL / 8))); + if (i % 2 == 0) { + // Even rows, all elements + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, uint64_t, + fillNeon(index64, (SVL / 8))); + // Even rows, even elements + CHECK_MAT_ROW(AARCH64_REG_ZAD3, i, uint64_t, + fillNeon(inter64, (SVL / 8))); + } else { + // Even rows, all elements + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, uint64_t, + fillNeon(full64, (SVL / 8))); + // Even rows, even elements + CHECK_MAT_ROW(AARCH64_REG_ZAD3, i, uint64_t, + fillNeon(full64, (SVL / 8))); + } } } -TEST_P(InstSme, ld1w) { - // Horizontal +TEST_P(InstSme, addva) { + // 32-bit + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s + + dup z0.s, #65 + index z1.s, #0, #1 + + # Add to all cols and elems + addva za0.s, p0/m, p0/m, z1.s + + # All cols, even elements + addva za1.s, p0/m, p0/m, z0.s + addva za1.s, p1/m, p0/m, z1.s + + # Add to even numbered cols, all elements + addva za2.s, p0/m, p0/m, z0.s + addva za2.s, p0/m, p1/m, z1.s + + # Even numbered cols, even numbered elements + addva za3.s, p0/m, p0/m, z0.s + addva za3.s, p1/m, p1/m, z1.s + )"); + std::vector full32(64, 0); + std::vector index32(64, 0); + std::vector inter32(64, 0); + for (uint16_t i = 0; i < 64; i++) { + full32[i] = 65; + index32[i] = i; + inter32[i] = (i % 2 == 0) ? i : 65; + } + + for (uint32_t i = 0; i < (SVL / 32); i++) { + // All cols, all elems + CHECK_MAT_COL(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon(index32, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({i}, (SVL / 8))); + // All cols, even elements + CHECK_MAT_COL(AARCH64_REG_ZAS1, i, uint32_t, + fillNeon(inter32, (SVL / 8))); + if (i % 2 == 0) { + // Even cols, all elements + CHECK_MAT_COL(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon(index32, (SVL / 8))); + // Even cols, even elements + CHECK_MAT_COL(AARCH64_REG_ZAS3, i, uint32_t, + fillNeon(inter32, (SVL / 8))); + } else { + // Even cols, all elements + CHECK_MAT_COL(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon(full32, (SVL / 8))); + // Even cols, even elements + CHECK_MAT_COL(AARCH64_REG_ZAS3, i, uint32_t, + fillNeon(full32, (SVL / 8))); + } + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d + + dup z0.d, #65 + index z1.d, #0, #1 + + # Add to all cols and elems + addva za0.d, p0/m, p0/m, z1.d + + # All cols, even elements + addva za1.d, p0/m, p0/m, z0.d + addva za1.d, p1/m, p0/m, z1.d + + # Add to even numbered cols, all elements + addva za2.d, p0/m, p0/m, z0.d + addva za2.d, p0/m, p1/m, z1.d + + # Even numbered cols, even numbered elements + addva za3.d, p0/m, p0/m, z0.d + addva za3.d, p1/m, p1/m, z1.d + )"); + std::vector full64(32, 0); + std::vector index64(32, 0); + std::vector inter64(32, 0); + for (uint16_t i = 0; i < 32; i++) { + full64[i] = 65; + index64[i] = i; + inter64[i] = (i % 2 == 0) ? 
i : 65; + } + + for (uint64_t i = 0; i < (SVL / 64); i++) { + // All cols, all elems + CHECK_MAT_COL(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon(index64, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({i}, (SVL / 8))); + // All cols, even elements + CHECK_MAT_COL(AARCH64_REG_ZAD1, i, uint64_t, + fillNeon(inter64, (SVL / 8))); + if (i % 2 == 0) { + // Even cols, all elements + CHECK_MAT_COL(AARCH64_REG_ZAD2, i, uint64_t, + fillNeon(index64, (SVL / 8))); + // Even cols, even elements + CHECK_MAT_COL(AARCH64_REG_ZAD3, i, uint64_t, + fillNeon(inter64, (SVL / 8))); + } else { + // Even cols, all elements + CHECK_MAT_COL(AARCH64_REG_ZAD2, i, uint64_t, + fillNeon(full64, (SVL / 8))); + // Even cols, even elements + CHECK_MAT_COL(AARCH64_REG_ZAD3, i, uint64_t, + fillNeon(full64, (SVL / 8))); + } + } +} + +TEST_P(InstSme, mova_tileToVec) { + // 8-bit initialHeapData_.resize(SVL / 4); - uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); - std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; - fillHeap(heap32, src, SVL / 16); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src8 = {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}; + fillHeap(heap8, src8, SVL / 4); RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -52,39 +279,77 @@ TEST_P(InstSme, ld1w) { smstart - mov x1, #1 - ptrue p0.s - mov w12, #1 - # Load and broadcast values from heap - ld1w {za0h.s[w12, 0]}, p0/z, [x0, x1, lsl #2] - ld1w {za0h.s[w12, 2]}, p0/z, [x0] + zero {za} - # Test for inactive lanes - mov x1, #0 - mov x3, #8 - # TODO change to addsvl when implemented - addvl x1, x1, #1 - udiv x1, x1, x3 - mov x2, #0 - whilelo p1.s, xzr, x1 - ld1w {za1h.s[w12, 0]}, p1/z, [x0, x2, lsl #2] + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b + + mov w12, #0 + dup z0.b, #1 + dup z1.b, #2 + dup z2.b, #3 + dup z3.b, #4 + dup z4.b, #5 + dup z5.b, #6 + dup z6.b, #7 + dup z7.b, #8 + + # Horizontal + ld1b {za0h.b[w12, #0]}, p0/z, [x0] + mova z0.b, p0/m, za0h.b[w12, #0] + mova z1.b, p1/m, za0h.b[w12, #0] + #Alias + mov z4.b, p0/m, za0h.b[w12, #0] + mov z5.b, p1/m, za0h.b[w12, #0] + + # Vertical + ld1b {za0v.b[w12, #3]}, p0/z, [x0] + mova z2.b, p0/m, za0v.b[w12, #3] + mova z3.b, p1/m, za0v.b[w12, #3] + #Alias + mov z6.b, p0/m, za0v.b[w12, #3] + mov z7.b, p1/m, za0v.b[w12, #3] )"); - CHECK_MAT_ROW( - ARM64_REG_ZAS0, 1, uint64_t, - fillNeon({0x9876543212345678, 0xDEADBEEFABCDEF01}, SVL / 8)); - CHECK_MAT_ROW( - ARM64_REG_ZAS0, 3, uint64_t, - fillNeon({0x12345678DEADBEEF, 0xABCDEF0198765432}, SVL / 8)); - CHECK_MAT_ROW(ARM64_REG_ZAS1, 1, uint64_t, - fillNeonCombined( - {0x12345678DEADBEEF, 0xABCDEF0198765432}, {0}, SVL / 8)); + CHECK_NEON(0, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_NEON(1, uint8_t, + fillNeon({0xDE, 2, 0xBE, 2, 0x12, 2, 0x56, 2, 0x98, 2, + 0x54, 2, 0xAB, 2, 0xEF, 2}, + SVL / 8)); + CHECK_NEON(2, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_NEON(3, uint8_t, + fillNeon({0xDE, 4, 0xBE, 4, 0x12, 4, 0x56, 4, 0x98, 4, + 0x54, 4, 0xAB, 4, 0xEF, 4}, + SVL / 8)); + CHECK_NEON(4, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_NEON(5, uint8_t, + fillNeon({0xDE, 6, 0xBE, 6, 0x12, 6, 0x56, 6, 0x98, 6, + 0x54, 6, 0xAB, 6, 0xEF, 6}, + SVL / 8)); + 
CHECK_NEON(6, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_NEON(7, uint8_t, + fillNeon({0xDE, 8, 0xBE, 8, 0x12, 8, 0x56, 8, 0x98, 8, + 0x54, 8, 0xAB, 8, 0xEF, 8}, + SVL / 8)); - // Vertical + // 16-bit initialHeapData_.resize(SVL / 4); - uint32_t* heap32_vert = reinterpret_cast(initialHeapData_.data()); - std::vector src_vert = {0xDEADBEEF, 0x12345678, 0x98765432, - 0xABCDEF01}; - fillHeap(heap32_vert, src_vert, SVL / 16); + uint16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + std::vector src16 = {0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}; + fillHeap(heap16, src16, SVL / 8); RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -93,42 +358,73 @@ TEST_P(InstSme, ld1w) { smstart - mov x1, #1 - ptrue p0.s - mov w12, #1 - # Load and broadcast values from heap - ld1w {za0v.s[w12, 0]}, p0/z, [x0, x1, lsl #2] - ld1w {za0v.s[w12, 2]}, p0/z, [x0] + zero {za} - # Test for inactive lanes - mov x1, #0 - mov x3, #8 - # TODO change to addsvl when implemented - addvl x1, x1, #1 - udiv x1, x1, x3 - mov x2, #0 - whilelo p1.s, xzr, x1 - ld1w {za1v.s[w12, 0]}, p1/z, [x0, x2, lsl #2] + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h + + mov w12, #0 + dup z0.h, #1 + dup z1.h, #2 + dup z2.h, #3 + dup z3.h, #4 + dup z4.h, #5 + dup z5.h, #6 + dup z6.h, #7 + dup z7.h, #8 + + # Horizontal + ld1h {za0h.h[w12, #0]}, p0/z, [x0] + mova z0.h, p0/m, za0h.h[w12, #0] + mova z1.h, p1/m, za0h.h[w12, #0] + #Alias + mov z4.h, p0/m, za0h.h[w12, #0] + mov z5.h, p1/m, za0h.h[w12, #0] + + # Vertical + ld1h {za0v.h[w12, #3]}, p0/z, [x0] + mova z2.h, p0/m, za0v.h[w12, #3] + mova z3.h, p1/m, za0v.h[w12, #3] + #Alias + mov z6.h, p0/m, za0v.h[w12, #3] + mov z7.h, p1/m, za0v.h[w12, #3] )"); - CHECK_MAT_COL(ARM64_REG_ZAS0, 1, uint32_t, - fillNeon( - {0x12345678, 0x98765432, 0xABCDEF01, 0xDEADBEEF}, SVL / 8)); - CHECK_MAT_COL(ARM64_REG_ZAS0, 3, uint32_t, - fillNeon( - {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); - CHECK_MAT_COL( - ARM64_REG_ZAS1, 1, uint32_t, - fillNeonCombined( - {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, {0}, SVL / 8)); -} + CHECK_NEON(0, uint16_t, + fillNeon({0xDEAD, 0xBEEF, 0x1234, 0x5678, 0x9876, 0x5432, + 0xABCD, 0xEF01}, + SVL / 8)); + CHECK_NEON(1, uint16_t, + fillNeon({0xDEAD, 2, 0x1234, 2, 0x9876, 2, 0xABCD, 2}, + SVL / 8)); + CHECK_NEON(2, uint16_t, + fillNeon({0xDEAD, 0xBEEF, 0x1234, 0x5678, 0x9876, 0x5432, + 0xABCD, 0xEF01}, + SVL / 8)); + CHECK_NEON(3, uint16_t, + fillNeon({0xDEAD, 4, 0x1234, 4, 0x9876, 4, 0xABCD, 4}, + SVL / 8)); + CHECK_NEON(4, uint16_t, + fillNeon({0xDEAD, 0xBEEF, 0x1234, 0x5678, 0x9876, 0x5432, + 0xABCD, 0xEF01}, + SVL / 8)); + CHECK_NEON(5, uint16_t, + fillNeon({0xDEAD, 6, 0x1234, 6, 0x9876, 6, 0xABCD, 6}, + SVL / 8)); + CHECK_NEON(6, uint16_t, + fillNeon({0xDEAD, 0xBEEF, 0x1234, 0x5678, 0x9876, 0x5432, + 0xABCD, 0xEF01}, + SVL / 8)); + CHECK_NEON(7, uint16_t, + fillNeon({0xDEAD, 8, 0x1234, 8, 0x9876, 8, 0xABCD, 8}, + SVL / 8)); -TEST_P(InstSme, st1w) { - // Horizontal + // 32-bit initialHeapData_.resize(SVL / 4); uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); - std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; - fillHeap(heap32, src, SVL / 16); - + std::vector src32 = {0xDEADBEEF, 0x12345678, 0x98765432, + 0xABCDEF01}; + fillHeap(heap32, src32, SVL / 16); RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -137,25 +433,64 @@ TEST_P(InstSme, st1w) { smstart - sub sp, sp, #4095 - mov x1, #0 - mov x4, #0 - addvl 
x4, x4, #1 + zero {za} + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s mov w12, #0 - ld1w {za0h.s[w12, 0]}, p0/z, [x0, x1, lsl #2] - ld1w {za1h.s[w12, 1]}, p0/z, [x0, x1, lsl #2] - st1w {za0h.s[w12, 0]}, p0, [sp, x1, lsl #2] - st1w {za1h.s[w12, 1]}, p0, [x4] + dup z0.s, #1 + dup z1.s, #2 + dup z2.s, #3 + dup z3.s, #4 + dup z4.s, #5 + dup z5.s, #6 + dup z6.s, #7 + dup z7.s, #8 + + # Horizontal + ld1w {za0h.s[w12, #0]}, p0/z, [x0] + mova z0.s, p0/m, za0h.s[w12, #0] + mova z1.s, p1/m, za0h.s[w12, #0] + #Alias + mov z4.s, p0/m, za0h.s[w12, #0] + mov z5.s, p1/m, za0h.s[w12, #0] + + # Vertical + ld1w {za0v.s[w12, #3]}, p0/z, [x0] + mova z2.s, p0/m, za0v.s[w12, #3] + mova z3.s, p1/m, za0v.s[w12, #3] + #Alias + mov z6.s, p0/m, za0v.s[w12, #3] + mov z7.s, p1/m, za0v.s[w12, #3] )"); - for (int i = 0; i < (SVL / 32); i++) { - EXPECT_EQ( - getMemoryValue(process_->getStackPointer() - 4095 + (i * 4)), - src[i % 4]); - EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 4)), src[i % 4]); - } + CHECK_NEON(0, uint32_t, + fillNeon( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); + CHECK_NEON(1, uint32_t, + fillNeon({0xDEADBEEF, 2, 0x98765432, 2}, SVL / 8)); + CHECK_NEON(2, uint32_t, + fillNeon( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); + CHECK_NEON(3, uint32_t, + fillNeon({0xDEADBEEF, 4, 0x98765432, 4}, SVL / 8)); + CHECK_NEON(4, uint32_t, + fillNeon( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); + CHECK_NEON(5, uint32_t, + fillNeon({0xDEADBEEF, 6, 0x98765432, 6}, SVL / 8)); + CHECK_NEON(6, uint32_t, + fillNeon( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); + CHECK_NEON(7, uint32_t, + fillNeon({0xDEADBEEF, 8, 0x98765432, 8}, SVL / 8)); + // 64-bit + initialHeapData_.resize(SVL / 4); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector src64 = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64, src64, SVL / 32); RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -164,32 +499,60 @@ TEST_P(InstSme, st1w) { smstart - mov x2, #0 - mov x4, #8 - addvl x2, x2, #1 - udiv x2, x2, x4 - mov x3, #4 - whilelo p1.s, xzr, x2 - mov x5, #800 + zero {za} + + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d mov w12, #0 - ld1w {za3h.s[w12, 0]}, p1/z, [x0, x3, lsl #2] - st1w {za3h.s[w12, 0]}, p1, [x5] - ld1w {za1h.s[w12, 2]}, p1/z, [x0, x3, lsl #2] - st1w {za1h.s[w12, 2]}, p1, [x5, x3, lsl #2] + dup z0.d, #1 + dup z1.d, #2 + dup z2.d, #3 + dup z3.d, #4 + dup z4.d, #5 + dup z5.d, #6 + dup z6.d, #7 + dup z7.d, #8 + + # Horizontal + ld1d {za0h.d[w12, #0]}, p0/z, [x0] + mova z0.d, p0/m, za0h.d[w12, #0] + mova z1.d, p1/m, za0h.d[w12, #0] + #Alias + mov z4.d, p0/m, za0h.d[w12, #0] + mov z5.d, p1/m, za0h.d[w12, #0] + + # Vertical + ld1d {za0v.d[w12, #1]}, p0/z, [x0] + mova z2.d, p0/m, za0v.d[w12, #1] + mova z3.d, p1/m, za0v.d[w12, #1] + #Alias + mov z6.d, p0/m, za0v.d[w12, #1] + mov z7.d, p1/m, za0v.d[w12, #1] )"); - for (int i = 0; i < (SVL / 64); i++) { - EXPECT_EQ(getMemoryValue(800 + (i * 4)), src[i % 4]); - EXPECT_EQ(getMemoryValue(800 + 16 + (i * 4)), src[i % 4]); - } + CHECK_NEON( + 0, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(1, uint64_t, fillNeon({0xDEADBEEF12345678, 2}, SVL / 8)); + CHECK_NEON( + 2, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(3, uint64_t, fillNeon({0xDEADBEEF12345678, 4}, SVL / 8)); + CHECK_NEON( + 4, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(5, uint64_t, fillNeon({0xDEADBEEF12345678, 
6}, SVL / 8)); + CHECK_NEON( + 6, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(7, uint64_t, fillNeon({0xDEADBEEF12345678, 8}, SVL / 8)); - // Vertical + // 128-bit + // Re-use 64-bit heap initialHeapData_.resize(SVL / 4); - uint32_t* heap32_vert = reinterpret_cast(initialHeapData_.data()); - std::vector src_vert = {0xDEADBEEF, 0x12345678, 0x98765432, - 0xABCDEF01}; - fillHeap(heap32_vert, src_vert, SVL / 16); - + heap64 = reinterpret_cast(initialHeapData_.data()); + fillHeap(heap64, src64, SVL / 32); RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -198,50 +561,2852 @@ TEST_P(InstSme, st1w) { smstart - sub sp, sp, #4095 - mov x1, #0 - mov x4, #0 - addvl x4, x4, #1 - ptrue p0.s + zero {za} + + ptrue p0.d + pfalse p1.b + # Zip1 twice to get on-off-on-off pattern with quadwords + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d mov w12, #0 - ld1w {za0v.s[w12, 0]}, p0/z, [x0, x1, lsl #2] - ld1w {za1v.s[w12, 1]}, p0/z, [x0, x1, lsl #2] - st1w {za0v.s[w12, 0]}, p0, [sp, x1, lsl #2] - st1w {za1v.s[w12, 1]}, p0, [x4] + dup z0.d, #1 + dup z1.d, #2 + dup z2.d, #3 + dup z3.d, #4 + dup z4.d, #5 + dup z5.d, #6 + dup z6.d, #7 + dup z7.d, #8 + + # Horizontal + ld1d {za0h.d[w12, #0]}, p0/z, [x0] + mova z0.q, p0/m, za0h.q[w12, #0] + mova z1.q, p1/m, za0h.q[w12, #0] + #Alias + mov z4.q, p0/m, za0h.q[w12, #0] + mov z5.q, p1/m, za0h.q[w12, #0] + + # Vertical + mov w12, #1 + ld1d {z8.d}, p0/z, [x0] + mova za0v.q[w12, #0], p0/m, z8.q + mova z2.q, p0/m, za0v.q[w12, #0] + mova z3.q, p1/m, za0v.q[w12, #0] + #Alias + mov z6.q, p0/m, za0v.q[w12, #0] + mov z7.q, p1/m, za0v.q[w12, #0] )"); - for (int i = 0; i < (SVL / 32); i++) { - EXPECT_EQ( - getMemoryValue(process_->getStackPointer() - 4095 + (i * 4)), - src_vert[i % 4]); - EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 4)), src_vert[i % 4]); + // Horizontal + CHECK_NEON( + 0, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(1, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01, 2, 2}, + SVL / 8)); + // Vertical + CHECK_NEON( + 2, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(3, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01, 4, 4}, + SVL / 8)); + // Horizontal + CHECK_NEON( + 4, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(5, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01, 6, 6}, + SVL / 8)); + // Vertical + CHECK_NEON( + 6, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(7, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01, 8, 8}, + SVL / 8)); +} + +TEST_P(InstSme, mova_b_vecToTile) { + // 8-bit + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b + + mov w12, #0 + dup z0.b, #1 + dup z1.b, #2 + + # Horizontal + mova za0h.b[w12, #0], p0/m, z0.b + mova za0h.b[w12, #1], p1/m, z1.b + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAB0, 0, uint8_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAB0, 1, uint8_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 8; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAB0, i, uint8_t, + fillNeon({0}, (SVL / 8))); } RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 + smstart + + zero {za} + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b + + mov w12, #0 + dup z0.b, #1 + dup z1.b, #2 + + # Horizontal Alias + mov za0h.b[w12, #0], p0/m, z0.b + mov za0h.b[w12, #1], p1/m, z1.b + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAB0, 0, uint8_t, + 
fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAB0, 1, uint8_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 8; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAB0, i, uint8_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( smstart - mov x2, #0 - mov x4, #8 - addvl x2, x2, #1 - udiv x2, x2, x4 - mov x3, #4 - whilelo p1.s, xzr, x2 - mov x5, #800 + zero {za} + + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b mov w12, #0 - ld1w {za3v.s[w12, 0]}, p1/z, [x0, x3, lsl #2] - st1w {za3v.s[w12, 0]}, p1, [x5] - ld1w {za1v.s[w12, 2]}, p1/z, [x0, x3, lsl #2] - st1w {za1v.s[w12, 2]}, p1, [x5, x3, lsl #2] + dup z0.b, #1 + dup z1.b, #2 + + # Vertical + mova za0v.b[w12, #0], p0/m, z0.b + mova za0v.b[w12, #1], p1/m, z1.b )"); - for (int i = 0; i < (SVL / 64); i++) { - EXPECT_EQ(getMemoryValue(800 + (i * 4)), src_vert[i % 4]); - EXPECT_EQ(getMemoryValue(800 + 16 + (i * 4)), src_vert[i % 4]); + CHECK_MAT_COL(AARCH64_REG_ZAB0, 0, uint8_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAB0, 1, uint8_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 8; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAB0, i, uint8_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b + + mov w12, #0 + dup z0.b, #1 + dup z1.b, #2 + + # Vertical Alias + mov za0v.b[w12, #0], p0/m, z0.b + mov za0v.b[w12, #1], p1/m, z1.b + )"); + CHECK_MAT_COL(AARCH64_REG_ZAB0, 0, uint8_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAB0, 1, uint8_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 8; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAB0, i, uint8_t, + fillNeon({0}, (SVL / 8))); + } +} + +TEST_P(InstSme, mova_h_vecToTile) { + // 16-bit + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h + + mov w12, #0 + dup z0.h, #1 + dup z1.h, #2 + + # Horizontal + mova za0h.h[w12, #0], p0/m, z0.h + mova za0h.h[w12, #1], p1/m, z1.h + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAH0, 0, uint16_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAH0, 1, uint16_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 16; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAH0, i, uint16_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h + + mov w12, #0 + dup z0.h, #1 + dup z1.h, #2 + + # Horizontal Alias + mov za0h.h[w12, #0], p0/m, z0.h + mov za0h.h[w12, #1], p1/m, z1.h + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAH0, 0, uint16_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAH0, 1, uint16_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 16; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAH0, i, uint16_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h + + mov w12, #0 + dup z0.h, #1 + dup z1.h, #2 + + # Vertical + mova za0v.h[w12, #0], p0/m, z0.h + mova za0v.h[w12, #1], p1/m, z1.h + )"); + CHECK_MAT_COL(AARCH64_REG_ZAH0, 0, uint16_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAH0, 1, uint16_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 16; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAH0, i, uint16_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h + + mov w12, #0 + dup z0.h, #1 + dup z1.h, #2 + + # Vertical Alias + mov za0v.h[w12, #0], p0/m, z0.h + mov za0v.h[w12, #1], p1/m, z1.h + )"); + 
CHECK_MAT_COL(AARCH64_REG_ZAH0, 0, uint16_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAH0, 1, uint16_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 16; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAH0, i, uint16_t, + fillNeon({0}, (SVL / 8))); + } +} + +TEST_P(InstSme, mova_s_vecToTile) { + // 32-bit + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s + + mov w12, #0 + dup z0.s, #1 + dup z1.s, #2 + + # Horizontal + mova za0h.s[w12, #0], p0/m, z0.s + mova za0h.s[w12, #1], p1/m, z1.s + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAS0, 0, uint32_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS0, 1, uint32_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 32; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s + + mov w12, #0 + dup z0.s, #1 + dup z1.s, #2 + + # Horizontal Alias + mov za0h.s[w12, #0], p0/m, z0.s + mov za0h.s[w12, #1], p1/m, z1.s + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAS0, 0, uint32_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS0, 1, uint32_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 32; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s + + mov w12, #0 + dup z0.s, #1 + dup z1.s, #2 + + # Vertical + mova za0v.s[w12, #0], p0/m, z0.s + mova za0v.s[w12, #1], p1/m, z1.s + )"); + CHECK_MAT_COL(AARCH64_REG_ZAS0, 0, uint32_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAS0, 1, uint32_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 32; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s + + mov w12, #0 + dup z0.s, #1 + dup z1.s, #2 + + # Vertical Alias + mov za0v.s[w12, #0], p0/m, z0.s + mov za0v.s[w12, #1], p1/m, z1.s + )"); + CHECK_MAT_COL(AARCH64_REG_ZAS0, 0, uint32_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAS0, 1, uint32_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 32; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({0}, (SVL / 8))); + } +} + +TEST_P(InstSme, mova_d_vecToTile) { + // 64-bit + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + + # Horizontal + mova za0h.d[w12, #0], p0/m, z0.d + mova za0h.d[w12, #1], p1/m, z1.d + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAD0, 0, uint64_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD0, 1, uint64_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 64; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + + # Horizontal Alias + mov za0h.d[w12, #0], p0/m, z0.d + mov za0h.d[w12, #1], p1/m, z1.d + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAD0, 0, uint64_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD0, 1, uint64_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 64; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.d + pfalse 
p1.b + zip1 p1.d, p0.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + + # Vertical + mova za0v.d[w12, #0], p0/m, z0.d + mova za0v.d[w12, #1], p1/m, z1.d + )"); + CHECK_MAT_COL(AARCH64_REG_ZAD0, 0, uint64_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAD0, 1, uint64_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 64; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + + # Vertical Alias + mov za0v.d[w12, #0], p0/m, z0.d + mov za0v.d[w12, #1], p1/m, z1.d + )"); + CHECK_MAT_COL(AARCH64_REG_ZAD0, 0, uint64_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAD0, 1, uint64_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 64; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({0}, (SVL / 8))); + } +} + +TEST_P(InstSme, mova_q_vecToTile) { + // 128-bit + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + # Zip1 twice to get on-off-on-off pattern with quadwords + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + + # Horizontal + mova za0h.q[w12, #0], p0/m, z0.q + mova za0h.q[w12, #0], p1/m, z1.q + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, 0, uint64_t, + fillNeon({2, 2, 1, 1}, (SVL / 8))); + for (uint16_t i = 1; i < SVL / 128; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + # Zip1 twice to get on-off-on-off pattern with quadwords + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + + # Horizontal Alias + mov za0h.q[w12, #0], p0/m, z0.q + mov za0h.q[w12, #0], p1/m, z1.q + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, 0, uint64_t, + fillNeon({2, 2, 1, 1}, (SVL / 8))); + for (uint16_t i = 1; i < SVL / 128; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + # Zip1 twice to get on-off-on-off pattern with quadwords + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + + # Vertical + mova za0v.q[w12, #0], p0/m, z0.q + mova za0v.q[w12, #0], p1/m, z1.q + )"); + auto onRow = fillNeon({0}, (SVL / 8)); + auto offRow = fillNeon({0}, (SVL / 8)); + onRow[0] = 2; + onRow[1] = 2; + offRow[0] = 1; + offRow[1] = 1; + for (uint16_t i = 0; i < SVL / 128; i++) { + if (i % 2 == 0) { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, onRow); + } else { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, offRow); + } + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + # Zip1 twice to get on-off-on-off pattern with quadwords + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + + # Vertical Alias + mov za0v.q[w12, #0], p0/m, z0.q + mov za0v.q[w12, #0], p1/m, z1.q + )"); + for (uint16_t i = 0; i < SVL / 128; i++) { + if (i % 2 == 0) { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, onRow); + } else { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, offRow); + } + } +} + +TEST_P(InstSme, fmopa) { + // 32-bit + RUN_AARCH64(R"( + smstart + + fdup z1.s, #2.0 + fdup z2.s, #5.0 + ptrue p0.s + ptrue p1.s + + zero {za} + + fmopa za0.s, p0/m, p1/m, z1.s, z2.s + + fdup z3.s, #3.0 + fdup z4.s, #8.0 + mov x0, #0 + mov x1, #8 + addvl x0, x0, #1 + udiv x0, x0, x1 + 
whilelo p2.s, xzr, x0 + + fmopa za2.s, p0/m, p2/m, z3.s, z4.s + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, float, + fillNeon({10.0f}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, float, + fillNeon({24.0f}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + fdup z1.d, #2.0 + fdup z2.d, #5.0 + ptrue p0.d + ptrue p1.d + + zero {za} + + fmopa za0.d, p0/m, p1/m, z1.d, z2.d + + fdup z3.d, #3.0 + fdup z4.d, #8.0 + mov x0, #0 + mov x1, #16 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.d, xzr, x0 + + fmopa za2.d, p0/m, p2/m, z3.d, z4.d + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, double, + fillNeon({10.0}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, double, + fillNeon({24.0}, (SVL / 16))); + } +} + +TEST_P(InstSme, fmops) { + // 32-bit + RUN_AARCH64(R"( + smstart + + fdup z1.s, #2.0 + fdup z2.s, #5.0 + ptrue p0.s + ptrue p1.s + + zero {za} + + fmops za0.s, p0/m, p1/m, z1.s, z2.s + + fdup z3.s, #3.0 + fdup z4.s, #8.0 + mov x0, #0 + mov x1, #8 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.s, xzr, x0 + + fmops za2.s, p0/m, p2/m, z3.s, z4.s + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, float, + fillNeon({-10.0f}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, float, + fillNeon({-24.0f}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + fdup z1.d, #2.0 + fdup z2.d, #5.0 + ptrue p0.d + ptrue p1.d + + zero {za} + + fmops za0.d, p0/m, p1/m, z1.d, z2.d + + fdup z3.d, #3.0 + fdup z4.d, #8.0 + mov x0, #0 + mov x1, #16 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.d, xzr, x0 + + fmops za2.d, p0/m, p2/m, z3.d, z4.d + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, double, + fillNeon({-10.0}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, double, + fillNeon({-24.0}, (SVL / 16))); + } +} + +TEST_P(InstSme, ld1b) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}; + fillHeap(heap8, src, SVL / 4); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.b + mov w12, #1 + # Load and broadcast values from heap + ld1b {za0h.b[w12, 0]}, p0/z, [x0, x1] + ld1b {za0h.b[w12, 2]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #2 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.b, xzr, x1 + mov w12, #15 + ld1b {za0h.b[w12, 0]}, p1/z, [x0, x2] + )"); + CHECK_MAT_ROW( + AARCH64_REG_ZAB0, 1, uint8_t, + fillNeon({0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, 0x76, + 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01, 0xDE}, + SVL / 8)); + CHECK_MAT_ROW( + AARCH64_REG_ZAB0, 3, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, + 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAB0, 15, uint8_t, + fillNeonCombined( + {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, 0x76, + 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + {0}, SVL / 8)); + + // Vertical + initialHeapData_.resize(SVL / 4); + uint8_t* heap8_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, + 0x56, 0x78, 0x98, 0x76, 0x54, 0x32, + 0xAB, 0xCD, 0xEF, 0x01}; + fillHeap(heap8_vert, src_vert, SVL / 4); + RUN_AARCH64(R"( + # 
Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.b + mov w12, #1 + # Load and broadcast values from heap + ld1b {za0v.b[w12, 0]}, p0/z, [x0, x1] + ld1b {za0v.b[w12, 2]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #2 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.b, xzr, x1 + mov w12, #15 + ld1b {za0v.b[w12, 0]}, p1/z, [x0, x2] + )"); + CHECK_MAT_COL( + AARCH64_REG_ZAB0, 1, uint8_t, + fillNeon({0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, 0x76, + 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01, 0xDE}, + SVL / 8)); + CHECK_MAT_COL( + AARCH64_REG_ZAB0, 3, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, + 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_MAT_COL(AARCH64_REG_ZAB0, 15, uint8_t, + fillNeonCombined( + {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, 0x76, + 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + {0}, SVL / 8)); +} + +TEST_P(InstSme, ld1d) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64, src, SVL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.d + mov w12, #0 + # Load and broadcast values from heap + ld1d {za0h.d[w12, 0]}, p0/z, [x0, x1, lsl #3] + ld1d {za0h.d[w12, 1]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #16 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.d, xzr, x1 + ld1d {za1h.d[w12, 1]}, p1/z, [x0, x2, lsl #3] + )"); + CHECK_MAT_ROW( + AARCH64_REG_ZAD0, 0, uint64_t, + fillNeon({0x98765432ABCDEF01, 0xDEADBEEF12345678}, SVL / 8)); + CHECK_MAT_ROW( + AARCH64_REG_ZAD0, 1, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAD1, 1, uint64_t, + fillNeonCombined( + {0xDEADBEEF12345678, 0x98765432ABCDEF01}, {0}, SVL / 8)); + + // Vertical + initialHeapData_.resize(SVL / 4); + uint64_t* heap64_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64_vert, src_vert, SVL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.d + mov w12, #0 + # Load and broadcast values from heap + ld1d {za0v.d[w12, 0]}, p0/z, [x0, x1, lsl #3] + ld1d {za0v.d[w12, 1]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #16 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.d, xzr, x1 + ld1d {za1v.d[w12, 1]}, p1/z, [x0, x2, lsl #3] + )"); + CHECK_MAT_COL( + AARCH64_REG_ZAD0, 0, uint64_t, + fillNeon({0x98765432ABCDEF01, 0xDEADBEEF12345678}, SVL / 8)); + CHECK_MAT_COL( + AARCH64_REG_ZAD0, 1, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_MAT_COL(AARCH64_REG_ZAD1, 1, uint64_t, + fillNeonCombined( + {0xDEADBEEF12345678, 0x98765432ABCDEF01}, {0}, SVL / 8)); +} + +TEST_P(InstSme, ld1h) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}; + fillHeap(heap16, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + 
mov x1, #1 + ptrue p0.h + mov w12, #1 + # Load and broadcast values from heap + ld1h {za0h.h[w12, 0]}, p0/z, [x0, x1, lsl #1] + ld1h {za0h.h[w12, 2]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #4 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.h, xzr, x1 + ld1h {za1h.h[w12, 0]}, p1/z, [x0, x2, lsl #1] + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAH0, 1, uint16_t, + fillNeon({0xBEEF, 0x1234, 0x5678, 0x9876, 0x5432, + 0xABCD, 0xEF01, 0xDEAD}, + SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAH0, 3, uint16_t, + fillNeon({0xDEAD, 0xBEEF, 0x1234, 0x5678, 0x9876, + 0x5432, 0xABCD, 0xEF01}, + SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAH1, 1, uint16_t, + fillNeonCombined({0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}, + {0}, SVL / 8)); + + // Vertical + initialHeapData_.resize(SVL / 4); + uint16_t* heap16_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}; + fillHeap(heap16_vert, src_vert, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.h + mov w12, #1 + # Load and broadcast values from heap + ld1h {za0v.h[w12, 0]}, p0/z, [x0, x1, lsl #1] + ld1h {za0v.h[w12, 2]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #4 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.h, xzr, x1 + ld1h {za1v.h[w12, 0]}, p1/z, [x0, x2, lsl #1] + )"); + CHECK_MAT_COL(AARCH64_REG_ZAH0, 1, uint16_t, + fillNeon({0xBEEF, 0x1234, 0x5678, 0x9876, 0x5432, + 0xABCD, 0xEF01, 0xDEAD}, + SVL / 8)); + CHECK_MAT_COL(AARCH64_REG_ZAH0, 3, uint16_t, + fillNeon({0xDEAD, 0xBEEF, 0x1234, 0x5678, 0x9876, + 0x5432, 0xABCD, 0xEF01}, + SVL / 8)); + CHECK_MAT_COL(AARCH64_REG_ZAH1, 1, uint16_t, + fillNeonCombined({0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}, + {0}, SVL / 8)); +} + +TEST_P(InstSme, ld1q) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF12345678, 0x98765432ABCDEF01, + 0x98765432ABCDEF01, 0xDEADBEEF12345678}; + fillHeap(heap64, src, SVL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.b + mov w12, #1 + # Load and broadcast values from heap + ld1q {za0h.q[w12, 0]}, p0/z, [x0, x1, lsl #4] + + # Test for inactive lanes - zip twice to get on-off for 128-bits + pfalse p1.b + zip1 p0.d, p0.d, p1.d + zip1 p0.d, p0.d, p0.d + ld1q {za15h.q[w12, 0]}, p0/z, [x0] + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, 1 % (SVL / 128), uint64_t, + fillNeon({0x98765432ABCDEF01, 0xDEADBEEF12345678, + 0xDEADBEEF12345678, 0x98765432ABCDEF01}, + SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAQ15, 1 % (SVL / 128), uint64_t, + fillNeon( + {0xDEADBEEF12345678, 0x98765432ABCDEF01, 0, 0}, SVL / 8)); + + // Vertical + initialHeapData_.resize(SVL / 4); + heap64 = reinterpret_cast(initialHeapData_.data()); + fillHeap(heap64, src, SVL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.b + mov w12, #1 + # Load and broadcast values from heap + ld1q {za0v.q[w12, 0]}, p0/z, [x0, x1, lsl #4] + + # Test for inactive lanes - zip twice to get on-off for 128-bits + pfalse p1.b + zip1 p0.d, p0.d, p1.d + zip1 p0.d, p0.d, p0.d + ld1q {za15v.q[w12, 0]}, p0/z, [x0] + )"); + // Can't 
check Q columns as CHECK_MAT_COL isn't set up for doing this with + // uint64_t. + // Instead, manually place values into 1st column of Q tile (as per + // asm above) and check each Q row. + auto row0 = fillNeon({0}, (SVL / 8)); + auto row1 = fillNeon({0}, (SVL / 8)); + auto zeroRow = fillNeon({0}, (SVL / 8)); + // MOD SVL / 64 as dealing with uint64_t even though its a 128-bit tile + row0[2 % (SVL / 64)] = 0x98765432ABCDEF01; + row0[3 % (SVL / 64)] = 0xDEADBEEF12345678; + row1[2 % (SVL / 64)] = 0xDEADBEEF12345678; + row1[3 % (SVL / 64)] = 0x98765432ABCDEF01; + for (uint16_t i = 0; i < SVL / 128; i++) { + if (i % 2 == 0) { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, row0); + CHECK_MAT_ROW(AARCH64_REG_ZAQ15, i, uint64_t, row1); + } else { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, row1); + CHECK_MAT_ROW(AARCH64_REG_ZAQ15, i, uint64_t, zeroRow); + } + } +} + +TEST_P(InstSme, ld1w) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + fillHeap(heap32, src, SVL / 16); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.s + mov w12, #1 + # Load and broadcast values from heap + ld1w {za0h.s[w12, 0]}, p0/z, [x0, x1, lsl #2] + ld1w {za0h.s[w12, 2]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #8 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.s, xzr, x1 + ld1w {za1h.s[w12, 0]}, p1/z, [x0, x2, lsl #2] + )"); + CHECK_MAT_ROW( + AARCH64_REG_ZAS0, 1, uint64_t, + fillNeon({0x9876543212345678, 0xDEADBEEFABCDEF01}, SVL / 8)); + CHECK_MAT_ROW( + AARCH64_REG_ZAS0, 3, uint64_t, + fillNeon({0x12345678DEADBEEF, 0xABCDEF0198765432}, SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAS1, 1, uint64_t, + fillNeonCombined( + {0x12345678DEADBEEF, 0xABCDEF0198765432}, {0}, SVL / 8)); + + // Vertical + initialHeapData_.resize(SVL / 4); + uint32_t* heap32_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEADBEEF, 0x12345678, 0x98765432, + 0xABCDEF01}; + fillHeap(heap32_vert, src_vert, SVL / 16); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.s + mov w12, #1 + # Load and broadcast values from heap + ld1w {za0v.s[w12, 0]}, p0/z, [x0, x1, lsl #2] + ld1w {za0v.s[w12, 2]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #8 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.s, xzr, x1 + ld1w {za1v.s[w12, 0]}, p1/z, [x0, x2, lsl #2] + )"); + CHECK_MAT_COL(AARCH64_REG_ZAS0, 1, uint32_t, + fillNeon( + {0x12345678, 0x98765432, 0xABCDEF01, 0xDEADBEEF}, SVL / 8)); + CHECK_MAT_COL(AARCH64_REG_ZAS0, 3, uint32_t, + fillNeon( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); + CHECK_MAT_COL( + AARCH64_REG_ZAS1, 1, uint32_t, + fillNeonCombined( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, {0}, SVL / 8)); +} + +TEST_P(InstSme, ldr) { + // Horizontal + initialHeapData_.resize(SVL); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}; + fillHeap(heap8, src, SVL); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + ptrue p0.b + mov w12, #0 + # Load and broadcast values from heap + ldr 
za[w12, 0], [x0] + ldr za[w12, 2], [x0, #2, mul vl] + )"); + CHECK_MAT_ROW( + AARCH64_REG_ZAB0, 0, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, + 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAB0, 1, uint8_t, fillNeon({0}, SVL / 8)); + CHECK_MAT_ROW( + AARCH64_REG_ZAB0, 2, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, + 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + + for (uint16_t i = 3; i < SVL / 8; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAB0, i, uint8_t, + fillNeon({0}, SVL / 8)); + } +} + +TEST_P(InstSme, smopa) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + + zero {za} + + smopa za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #7 + dup z4.b, #4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + smopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #8 + dup z2.h, #3 + ptrue p0.h + ptrue p1.h + + zero {za} + + smopa za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, #7 + dup z4.h, #4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + smopa za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({112}, (SVL / 16))); + } +} + +TEST_P(InstSme, smops) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + + zero {za} + + smops za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #7 + dup z4.b, #4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + smops za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({-96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({-112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #8 + dup z2.h, #3 + ptrue p0.h + ptrue p1.h + + zero {za} + + smops za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, #7 + dup z4.h, #4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + smops za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({-96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({-112}, (SVL / 16))); + } +} + +TEST_P(InstSme, st1b) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}; + fillHeap(heap8, src, SVL / 4); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.b + + mov w12, #0 + ld1b {za0h.b[w12, 0]}, p0/z, [x0, x1] + ld1b {za0h.b[w12, 3]}, p0/z, [x0, x1] + st1b {za0h.b[w12, 0]}, p0, [sp, x1] + st1b {za0h.b[w12, 3]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 8); i++) { + EXPECT_EQ( + getMemoryValue(process_->getInitialStackPointer() - 4095 + i), + src[i % 16]); + 
EXPECT_EQ(getMemoryValue((SVL / 8) + i), src[i % 16]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #16 + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1b {za0h.b[w12, 0]}, p0/z, [x0, x3] + # Store all 0s to memory + st1b {za0h.b[w12, 5]}, p0, [x5] + # Store odd indexed elements to memory + st1b {za0h.b[w12, 0]}, p1, [x5] + + # Load entire row + ld1b {za0h.b[w13, 1]}, p0/z, [x0, x3] + # Store all 0s to memory + st1b {za0h.b[w12, 5]}, p0, [x6, x3] + # Store odd indexed elements to memory + st1b {za0h.b[w13, 1]}, p1, [x6, x3] + )"); + for (uint64_t i = 0; i < (SVL / 8); i += 2) { + EXPECT_EQ(getMemoryValue(400 + i), src[i % 16]); + EXPECT_EQ(getMemoryValue(400 + (i + 1)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + i), src[i % 16]); + EXPECT_EQ(getMemoryValue(800 + 16 + (i + 1)), 0); + } + + // Vertical + initialHeapData_.resize(SVL / 4); + uint8_t* heap8_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, + 0x56, 0x78, 0x98, 0x76, 0x54, 0x32, + 0xAB, 0xCD, 0xEF, 0x01}; + fillHeap(heap8_vert, src_vert, SVL / 4); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.b + + mov w12, #0 + ld1b {za0v.b[w12, 0]}, p0/z, [x0, x1] + ld1b {za0v.b[w12, 1]}, p0/z, [x0, x1] + st1b {za0v.b[w12, 0]}, p0, [sp, x1] + st1b {za0v.b[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 8); i++) { + EXPECT_EQ( + getMemoryValue(process_->getInitialStackPointer() - 4095 + i), + src_vert[i % 16]); + EXPECT_EQ(getMemoryValue((SVL / 8) + i), src_vert[i % 16]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #16 + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1b {za0v.b[w12, 0]}, p0/z, [x0, x3] + # Store all 0s to memory + st1b {za0v.b[w12, 5]}, p0, [x5] + # Store odd indexed elements to memory + st1b {za0v.b[w12, 0]}, p1, [x5] + + # Load entire row + ld1b {za0v.b[w13, 1]}, p0/z, [x0, x3] + # Store all 0s to memory + st1b {za0v.b[w12, 5]}, p0, [x6, x3] + # Store odd indexed elements to memory + st1b {za0v.b[w13, 1]}, p1, [x6, x3] + )"); + for (uint64_t i = 0; i < (SVL / 8); i += 2) { + EXPECT_EQ(getMemoryValue(400 + i), src[i % 16]); + EXPECT_EQ(getMemoryValue(400 + (i + 1)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + i), src[i % 16]); + EXPECT_EQ(getMemoryValue(800 + 16 + (i + 1)), 0); + } +} + +TEST_P(InstSme, st1d) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64, src, SVL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.d + + mov w12, #0 + ld1d {za0h.d[w12, 0]}, p0/z, [x0, x1, lsl #3] + ld1d {za1h.d[w12, 1]}, p0/z, [x0, x1, lsl #3] + st1d {za0h.d[w12, 0]}, p0, [sp, x1, lsl #3] + st1d {za1h.d[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 8)), + src[i % 2]); + EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 8)), src[i % 2]); + } + + 
RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #2 + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1d {za3h.d[w12, 0]}, p0/z, [x0, x3, lsl #3] + # Store all 0s to memory + st1d {za0h.d[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1d {za3h.d[w12, 0]}, p1, [x5] + + # Load entire row + ld1d {za1h.d[w13, 1]}, p0/z, [x0, x3, lsl #3] + # Store all 0s to memory + st1d {za0h.d[w12, 0]}, p0, [x6, x3, lsl #3] + # Store odd indexed elements to memory + st1d {za1h.d[w13, 1]}, p1, [x6, x3, lsl #3] + )"); + for (uint64_t i = 0; i < (SVL / 64); i += 2) { + EXPECT_EQ(getMemoryValue(400 + (i * 8)), src[i % 2]); + EXPECT_EQ(getMemoryValue(400 + ((i + 1) * 8)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + (i * 8)), src[i % 2]); + EXPECT_EQ(getMemoryValue(800 + 16 + ((i + 1) * 8)), 0); + } + + // Vertical + initialHeapData_.resize(SVL / 4); + uint64_t* heap64_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64_vert, src_vert, SVL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.d + + mov w12, #0 + ld1d {za0v.d[w12, 0]}, p0/z, [x0, x1, lsl #3] + ld1d {za1v.d[w12, 1]}, p0/z, [x0, x1, lsl #3] + st1d {za0v.d[w12, 0]}, p0, [sp, x1, lsl #3] + st1d {za1v.d[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 8)), + src_vert[i % 2]); + EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 8)), src_vert[i % 2]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #2 + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1d {za3v.d[w12, 0]}, p0/z, [x0, x3, lsl #3] + # Store all 0s to memory + st1d {za0v.d[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1d {za3v.d[w12, 0]}, p1, [x5] + + # Load entire row + ld1d {za1v.d[w13, 1]}, p0/z, [x0, x3, lsl #3] + # Store all 0s to memory + st1d {za0v.d[w12, 0]}, p0, [x6, x3, lsl #3] + # Store odd indexed elements to memory + st1d {za1v.d[w13, 1]}, p1, [x6, x3, lsl #3] + )"); + for (uint64_t i = 0; i < (SVL / 64); i += 2) { + EXPECT_EQ(getMemoryValue(400 + (i * 8)), src_vert[i % 2]); + EXPECT_EQ(getMemoryValue(400 + ((i + 1) * 8)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + (i * 8)), src_vert[i % 2]); + EXPECT_EQ(getMemoryValue(800 + 16 + ((i + 1) * 8)), 0); + } +} + +TEST_P(InstSme, st1h) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}; + fillHeap(heap16, src, SVL / 8); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.h + + mov w12, #0 + ld1h {za0h.h[w12, 0]}, p0/z, [x0, x1, lsl #1] + ld1h {za1h.h[w12, 1]}, p0/z, [x0, x1, lsl #1] + st1h {za0h.h[w12, 0]}, p0, [sp, x1, lsl #1] + st1h {za1h.h[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 16); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 2)), + src[i % 8]); + 
EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 2)), src[i % 8]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #8 + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1h {za0h.h[w12, 0]}, p0/z, [x0, x3, lsl #1] + # Store all 0s to memory + st1h {za1h.h[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1h {za0h.h[w12, 0]}, p1, [x5] + + # Load entire row + ld1h {za0h.h[w13, 1]}, p0/z, [x0, x3, lsl #1] + # Store all 0s to memory + st1h {za1h.h[w12, 0]}, p0, [x6, x3, lsl #1] + # Store odd indexed elements to memory + st1h {za0h.h[w13, 1]}, p1, [x6, x3, lsl #1] + )"); + for (uint64_t i = 0; i < (SVL / 16); i += 2) { + EXPECT_EQ(getMemoryValue(400 + (i * 2)), src[i % 8]); + EXPECT_EQ(getMemoryValue(400 + ((i + 1) * 2)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + (i * 2)), src[i % 8]); + EXPECT_EQ(getMemoryValue(800 + 16 + ((i + 1) * 2)), 0); + } + + // Vertical + initialHeapData_.resize(SVL / 4); + uint16_t* heap16_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}; + fillHeap(heap16_vert, src_vert, SVL / 8); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.h + + mov w12, #0 + ld1h {za0v.h[w12, 0]}, p0/z, [x0, x1, lsl #1] + ld1h {za1v.h[w12, 1]}, p0/z, [x0, x1, lsl #1] + st1h {za0v.h[w12, 0]}, p0, [sp, x1, lsl #1] + st1h {za1v.h[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 16); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 2)), + src_vert[i % 8]); + EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 2)), src_vert[i % 8]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #8 + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1h {za0v.h[w12, 0]}, p0/z, [x0, x3, lsl #1] + # Store all 0s to memory + st1h {za1v.h[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1h {za0v.h[w12, 0]}, p1, [x5] + + # Load entire row + ld1h {za0v.h[w13, 1]}, p0/z, [x0, x3, lsl #1] + # Store all 0s to memory + st1h {za1v.h[w12, 0]}, p0, [x6, x3, lsl #1] + # Store odd indexed elements to memory + st1h {za0v.h[w13, 1]}, p1, [x6, x3, lsl #1] + )"); + for (uint64_t i = 0; i < (SVL / 16); i += 2) { + EXPECT_EQ(getMemoryValue(400 + (i * 2)), src[i % 8]); + EXPECT_EQ(getMemoryValue(400 + ((i + 1) * 2)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + (i * 2)), src[i % 8]); + EXPECT_EQ(getMemoryValue(800 + 16 + ((i + 1) * 2)), 0); + } +} + +TEST_P(InstSme, st1q) { + // Horizontal + initialHeapData_.resize(SVL); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.b + + mov w12, #0 + mov w13, #1 + ld1q {za0h.q[w12, 0]}, p0/z, [x0, x1, lsl #4] + ld1q {za1h.q[w13, 0]}, p0/z, [x0, x1, lsl #4] + st1q {za0h.q[w12, 0]}, p0, [sp, x1, lsl #4] + st1q {za1h.q[w13, 0]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 128); i++) { + 
EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + ((2 * i) * 8)), + src[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + ((2 * i + 1) * 8)), + src[(2 * i + 1) % 2]); + + EXPECT_EQ(getMemoryValue((SVL / 8) + ((2 * i) * 8)), + src[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue((SVL / 8) + ((2 * i + 1) * 8)), + src[(2 * i + 1) % 2]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #2 + ptrue p0.d + pfalse p1.b + # Do zip1 twice to get on-off for 128-bit + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #2 + # Load entire row + ld1q {za3h.q[w12, 0]}, p0/z, [x0, x3, lsl #4] + # Store all 0s to memory + st1q {za0h.q[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1q {za3h.q[w12, 0]}, p1, [x5] + + # Load entire row + ld1q {za1h.q[w13, 0]}, p0/z, [x0, x3, lsl #4] + # Store all 0s to memory + st1q {za0h.q[w12, 0]}, p0, [x6, x3, lsl #4] + # Store odd indexed elements to memory + st1q {za1h.q[w13, 0]}, p1, [x6, x3, lsl #4] + )"); + for (uint64_t i = 0; i < (SVL / 128); i += 2) { + EXPECT_EQ(getMemoryValue(400 + ((2 * i) * 8)), src[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue(400 + ((2 * i + 1) * 8)), + src[(2 * i + 1) % 2]); + EXPECT_EQ(getMemoryValue(400 + (((2 * i) + 2) * 8)), 0); + EXPECT_EQ(getMemoryValue(400 + (((2 * i + 1) + 2) * 8)), 0); + + EXPECT_EQ(getMemoryValue(800 + 32 + ((2 * i) * 8)), + src[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue(800 + 32 + ((2 * i + 1) * 8)), + src[(2 * i + 1) % 2]); + EXPECT_EQ(getMemoryValue(800 + 32 + (((2 * i) + 2) * 8)), 0); + EXPECT_EQ(getMemoryValue(800 + 32 + (((2 * i + 1) + 2) * 8)), 0); + } + + // Vertical + initialHeapData_.resize(SVL); + uint64_t* heap64_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64_vert, src_vert, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.b + + mov w12, #0 + mov w13, #1 + ld1q {za0v.q[w12, 0]}, p0/z, [x0, x1, lsl #4] + ld1q {za1v.q[w13, 0]}, p0/z, [x0, x1, lsl #4] + st1q {za0v.q[w12, 0]}, p0, [sp, x1, lsl #4] + st1q {za1v.q[w13, 0]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 128); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + ((2 * i) * 8)), + src_vert[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + ((2 * i + 1) * 8)), + src_vert[(2 * i + 1) % 2]); + + EXPECT_EQ(getMemoryValue((SVL / 8) + ((2 * i) * 8)), + src_vert[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue((SVL / 8) + ((2 * i + 1) * 8)), + src_vert[(2 * i + 1) % 2]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #2 + ptrue p0.d + pfalse p1.b + # Do zip1 twice to get on-off for 128-bit + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #2 + # Load entire row + ld1q {za3v.q[w12, 0]}, p0/z, [x0, x3, lsl #4] + # Store all 0s to memory + st1q {za0v.q[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1q {za3v.q[w12, 0]}, p1, [x5] + + # Load entire row + ld1q {za1v.q[w13, 0]}, p0/z, [x0, x3, lsl #4] + # Store all 0s to memory + st1q {za0v.q[w12, 0]}, p0, [x6, x3, lsl #4] + # Store odd indexed elements to memory + st1q {za1v.q[w13, 0]}, p1, [x6, x3, lsl #4] 
+ )"); + for (uint64_t i = 0; i < (SVL / 128); i += 2) { + EXPECT_EQ(getMemoryValue(400 + ((2 * i) * 8)), src[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue(400 + ((2 * i + 1) * 8)), + src[(2 * i + 1) % 2]); + EXPECT_EQ(getMemoryValue(400 + (((2 * i) + 2) * 8)), 0); + EXPECT_EQ(getMemoryValue(400 + (((2 * i + 1) + 2) * 8)), 0); + + EXPECT_EQ(getMemoryValue(800 + 32 + ((2 * i) * 8)), + src[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue(800 + 32 + ((2 * i + 1) * 8)), + src[(2 * i + 1) % 2]); + EXPECT_EQ(getMemoryValue(800 + 32 + (((2 * i) + 2) * 8)), 0); + EXPECT_EQ(getMemoryValue(800 + 32 + (((2 * i + 1) + 2) * 8)), 0); + } +} + +TEST_P(InstSme, st1w) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + fillHeap(heap32, src, SVL / 16); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.s + + mov w12, #0 + ld1w {za0h.s[w12, 0]}, p0/z, [x0, x1, lsl #2] + ld1w {za1h.s[w12, 1]}, p0/z, [x0, x1, lsl #2] + st1w {za0h.s[w12, 0]}, p0, [sp, x1, lsl #2] + st1w {za1h.s[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 4)), + src[i % 4]); + EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 4)), src[i % 4]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #4 + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1w {za3h.s[w12, 0]}, p0/z, [x0, x3, lsl #2] + # Store all 0s to memory + st1w {za0h.s[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1w {za3h.s[w12, 0]}, p1, [x5] + + # Load entire row + ld1w {za1h.s[w13, 1]}, p0/z, [x0, x3, lsl #2] + # Store all 0s to memory + st1w {za0h.s[w12, 0]}, p0, [x6, x3, lsl #2] + # Store odd indexed elements to memory + st1w {za1h.s[w13, 1]}, p1, [x6, x3, lsl #2] + )"); + for (uint64_t i = 0; i < (SVL / 32); i += 2) { + EXPECT_EQ(getMemoryValue(400 + (i * 4)), src[i % 4]); + EXPECT_EQ(getMemoryValue(400 + ((i + 1) * 4)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + (i * 4)), src[i % 4]); + EXPECT_EQ(getMemoryValue(800 + 16 + ((i + 1) * 4)), 0); + } + + // Vertical + initialHeapData_.resize(SVL / 4); + uint32_t* heap32_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEADBEEF, 0x12345678, 0x98765432, + 0xABCDEF01}; + fillHeap(heap32_vert, src_vert, SVL / 16); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.s + + mov w12, #0 + ld1w {za0v.s[w12, 0]}, p0/z, [x0, x1, lsl #2] + ld1w {za1v.s[w12, 1]}, p0/z, [x0, x1, lsl #2] + st1w {za0v.s[w12, 0]}, p0, [sp, x1, lsl #2] + st1w {za1v.s[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 4)), + src_vert[i % 4]); + EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 4)), src_vert[i % 4]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #4 + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1w {za3v.s[w12, 0]}, p0/z, [x0, x3, lsl #2] + 
# Store all 0s to memory + st1w {za0v.s[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1w {za3v.s[w12, 0]}, p1, [x5] + + # Load entire row + ld1w {za1v.s[w13, 1]}, p0/z, [x0, x3, lsl #2] + # Store all 0s to memory + st1w {za0v.s[w12, 0]}, p0, [x6, x3, lsl #2] + # Store odd indexed elements to memory + st1w {za1v.s[w13, 1]}, p1, [x6, x3, lsl #2] + )"); + for (uint64_t i = 0; i < (SVL / 32); i += 2) { + EXPECT_EQ(getMemoryValue(400 + (i * 4)), src_vert[i % 4]); + EXPECT_EQ(getMemoryValue(400 + ((i + 1) * 4)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + (i * 4)), src_vert[i % 4]); + EXPECT_EQ(getMemoryValue(800 + 16 + ((i + 1) * 4)), 0); + } +} + +TEST_P(InstSme, str) { + RUN_AARCH64(R"( + smstart + + zero {za} + + dup z0.b, #2 + dup z1.b, #5 + ptrue p0.b + ptrue p1.b + + # Fill first 32-bit ZA tile with 40 in every element + umopa za0.s, p0/m, p1/m, z0.b, z1.b + + dup z0.b, #1 + dup z1.b, #5 + + # Fill third 32-bit ZA tile with 20 in every element + umopa za2.s, p0/m, p1/m, z0.b, z1.b + + mov x2, #600 + mov w12, #0 + + # ZA sub tiles are interleaved, so 0th, 4th, 8th... rows will be for za0.s + # 2nd, 6th, 10th ... rows will be for za2.s + str za[w12, #0], [x2] + str za[w12, #1], [x2, #1, mul vl] + str za[w12, #2], [x2, #2, mul vl] + str za[w12, #3], [x2, #3, mul vl] + + # Store 8th row (3rd row of za0.s) + add w12, w12, #8 + mov x3, #0 + addvl x3, x3, #4 + add x2, x2, x3 + str za[w12, #0], [x2] + + # Store 10th row (3rd row of za2.s) + add w12, w12, #2 + mov x3, #0 + addvl x3, x3, #1 + add x2, x2, x3 + str za[w12, #0], [x2] + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({40}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS1, i, uint32_t, + fillNeon({0}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon({20}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS3, i, uint32_t, + fillNeon({0}, (SVL / 8))); + } + const uint64_t SVL_bytes = SVL / 8; + for (uint64_t i = 0; i < (SVL / 32); i++) { + const uint64_t off = i * sizeof(uint32_t); + EXPECT_EQ(getMemoryValue(600 + off), 40); + EXPECT_EQ(getMemoryValue(600 + SVL_bytes + off), 0); + EXPECT_EQ(getMemoryValue(600 + (2 * SVL_bytes) + off), 20); + EXPECT_EQ(getMemoryValue(600 + (3 * SVL_bytes) + off), 0); + EXPECT_EQ(getMemoryValue(600 + (4 * SVL_bytes) + off), 40); + EXPECT_EQ(getMemoryValue(600 + (5 * SVL_bytes) + off), 20); + } +} + +TEST_P(InstSme, sumopa) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #-8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + + zero {za} + + sumopa za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #-7 + dup z4.b, #4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + sumopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({-96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({-112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is signed, z2 is unsigned so will become 255 + dup z1.b, #3 + dup z2.b, #-1 + ptrue p0.b + ptrue p1.b + + zero {za} + + sumopa za0.s, p0/m, p1/m, z1.b, z2.b + + # z3 is signed, z4 is unsigned so will become 254 + dup z3.b, #7 + dup z4.b, #-2 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + sumopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({3060}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + 
fillNeon({7112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #-8 + dup z2.h, #3 + ptrue p0.h + ptrue p1.h + + zero {za} + + sumopa za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, #-7 + dup z4.h, #4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + sumopa za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({-96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({-112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is signed, z2 is unsigned so will become 65535 + dup z1.h, #3 + dup z2.h, #-1 + ptrue p0.h + ptrue p1.h + + zero {za} + + sumopa za0.d, p0/m, p1/m, z1.h, z2.h + + # z3 is signed, z4 is unsigned so will become 65534 + dup z3.h, #7 + dup z4.h, #-2 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + sumopa za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({786420}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({1834952}, (SVL / 16))); + } +} + +TEST_P(InstSme, sumops) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #-8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + + zero {za} + + sumops za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #-7 + dup z4.b, #4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + sumops za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is signed, z2 is unsigned so will become 255 + dup z1.b, #3 + dup z2.b, #-1 + ptrue p0.b + ptrue p1.b + + zero {za} + + sumops za0.s, p0/m, p1/m, z1.b, z2.b + + # z3 is signed, z4 is unsigned so will become 254 + dup z3.b, #7 + dup z4.b, #-2 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + sumops za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({-3060}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({-7112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #-8 + dup z2.h, #3 + ptrue p0.h + ptrue p1.h + + zero {za} + + sumops za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, #-7 + dup z4.h, #4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + sumops za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is signed, z2 is unsigned so will become 255 + dup z1.h, #3 + dup z2.h, #-1 + ptrue p0.h + ptrue p1.h + + zero {za} + + sumops za0.d, p0/m, p1/m, z1.h, z2.h + + # z3 is signed, z4 is unsigned so will become 254 + dup z3.h, #7 + dup z4.h, #-2 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + sumops za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({-786420}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({-1834952}, (SVL / 16))); + } +} + 
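+// Expected values in the widening outer-product tests below: each 32-bit ZA
+// element accumulates four 8-bit products and each 64-bit element four 16-bit
+// products, so e.g. all-8 by all-3 sources give 8 * 3 * 4 = 96 per element,
+// and the half-row predicated cases give 7 * 4 * 4 = 112 in the active half
+// of each row only.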
+TEST_P(InstSme, umopa) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + + zero {za} + + umopa za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #7 + dup z4.b, #4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + umopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon({112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #8 + dup z2.h, #3 + ptrue p0.h + ptrue p1.h + + zero {za} + + umopa za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, #7 + dup z4.h, #4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + umopa za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, uint64_t, + fillNeon({112}, (SVL / 16))); + } +} + +TEST_P(InstSme, umops) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #8 + dup z2.b, #3 + dup z3.b, #2 + ptrue p0.b + ptrue p1.b + + zero {za} + + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umops za0.s, p0/m, p1/m, z1.b, z3.b + + dup z3.b, #7 + dup z4.b, #4 + dup z5.b, #3 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + umopa za2.s, p0/m, p2/m, z3.b, z4.b + umops za2.s, p0/m, p2/m, z3.b, z5.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({32}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon({28}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #8 + dup z2.h, #3 + dup z3.h, #2 + ptrue p0.h + ptrue p1.h + + zero {za} + + umopa za0.d, p0/m, p1/m, z1.h, z2.h + umops za0.d, p0/m, p1/m, z1.h, z3.h + + dup z3.h, #7 + dup z4.h, #4 + dup z5.h, #3 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + umopa za2.d, p0/m, p2/m, z3.h, z4.h + umops za2.d, p0/m, p2/m, z3.h, z5.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({32}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, uint64_t, + fillNeon({28}, (SVL / 16))); + } +} + +TEST_P(InstSme, usmopa) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #8 + dup z2.b, #-3 + ptrue p0.b + ptrue p1.b + + zero {za} + + usmopa za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #7 + dup z4.b, #-4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + usmopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({-96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({-112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is unsigned so will become 253, z2 is signed + dup z1.b, #-3 + dup z2.b, #2 + ptrue p0.b + ptrue p1.b + + zero {za} + + usmopa za0.s, p0/m, p1/m, z1.b, z2.b + + # z3 is unsigned so will become 254, z4 is unsigned + dup z3.b, #-2 + dup z4.b, #7 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + usmopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({2024}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({7112}, (SVL / 16))); + } + 
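+  // The mixed-sign expectations above come from reinterpreting the negative
+  // bytes as unsigned: -3 becomes 253 (253 * 2 * 4 = 2024) and -2 becomes
+  // 254 (254 * 7 * 4 = 7112).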
+ // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #8 + dup z2.h, #-3 + ptrue p0.h + ptrue p1.h + + zero {za} + + usmopa za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, #7 + dup z4.h, #-4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + usmopa za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({-96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({-112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is unsigned so will become 65533, z2 is unsigned + dup z1.h, #-3 + dup z2.h, #2 + ptrue p0.h + ptrue p1.h + + zero {za} + + usmopa za0.d, p0/m, p1/m, z1.h, z2.h + + # z3 is unsigned so will become 65534, z4 is signed + dup z3.h, #-2 + dup z4.h, #7 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + usmopa za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({524264}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({1834952}, (SVL / 16))); + } +} + +TEST_P(InstSme, usmops) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #8 + dup z2.b, #-3 + ptrue p0.b + ptrue p1.b + + zero {za} + + usmops za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #7 + dup z4.b, #-4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + usmops za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is unsigned so will become 253, z2 is signed + dup z1.b, #-3 + dup z2.b, #2 + ptrue p0.b + ptrue p1.b + + zero {za} + + usmops za0.s, p0/m, p1/m, z1.b, z2.b + + # z3 is unsigned so will become 254, z4 is signed + dup z3.b, #-2 + dup z4.b, #7 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + usmops za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({-2024}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({-7112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #8 + dup z2.h, #-3 + ptrue p0.h + ptrue p1.h + + zero {za} + + usmops za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, #7 + dup z4.h, #-4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + usmops za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is unsigned so will become 65533, z2 is signed + dup z1.h, #-3 + dup z2.h, #2 + ptrue p0.h + ptrue p1.h + + zero {za} + + usmops za0.d, p0/m, p1/m, z1.h, z2.h + + # z3 is unsigned so will become 65534, z4 is signed + dup z3.h, #-2 + dup z4.h, #7 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + usmops za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({-524264}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({-1834952}, (SVL / 16))); } } @@ -251,8 +3416,9 @@ TEST_P(InstSme, zero) { 
zero {za} )"); - for (int i = 0; i < (SVL / 8); i++) { - CHECK_MAT_ROW(ARM64_REG_ZA, i, uint64_t, fillNeon({0}, SVL / 8)); + for (uint64_t i = 0; i < (SVL / 8); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint64_t, + fillNeon({0}, SVL / 8)); } initialHeapData_.resize(SVL / 4); @@ -287,13 +3453,13 @@ TEST_P(InstSme, zero) { zero {za0.s, za2.s} )"); - for (int i = 0; i < (SVL / 32); i++) { - CHECK_MAT_ROW(ARM64_REG_ZAS0, i, uint32_t, + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, fillNeon({0}, SVL / 8)); - CHECK_MAT_ROW(ARM64_REG_ZAS2, i, uint32_t, + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, fillNeon({0}, SVL / 8)); } - CHECK_MAT_COL(ARM64_REG_ZAS1, 3, uint32_t, + CHECK_MAT_COL(AARCH64_REG_ZAS1, 3, uint32_t, fillNeon( {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); } diff --git a/test/regression/aarch64/instructions/store.cc b/test/regression/aarch64/instructions/store.cc index fe552fa379..6d6876b494 100644 --- a/test/regression/aarch64/instructions/store.cc +++ b/test/regression/aarch64/instructions/store.cc @@ -21,10 +21,14 @@ TEST_P(InstStore, stlr) { stlrb w3, [sp] add sp, sp, #1 )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 4), 0xAB); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 3), 0x12); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 2), 0xCD); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1), 0x34); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), + 0xAB); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 3), + 0x12); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 2), + 0xCD); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1), + 0x34); // stlr RUN_AARCH64(R"( @@ -46,12 +50,14 @@ TEST_P(InstStore, stlr) { add sp, sp, #4 )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 24), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 24), 0xFFFFFFFFFFFFFFFF); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 16), 0xBEEF); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 8), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 16), + 0xBEEF); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), 0xFFFFFFFF); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 4), 0xBABA); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), + 0xBABA); } TEST_P(InstStore, strb) { @@ -70,12 +76,17 @@ TEST_P(InstStore, strb) { mov x6, -16 strb w1, [sp, x6, sxtx] )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 4), 0xAB); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 3), 0x12); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 2), 0xCD); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1), 0x34); - EXPECT_EQ(getMemoryValue(process_->getStackPointer()), 0xAB); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 18), 0x12); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), + 0xAB); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 3), + 0x12); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 2), + 0xCD); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1), + 0x34); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer()), 0xAB); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 18), + 0x12); } TEST_P(InstStore, strh) { @@ -94,12 +105,18 @@ TEST_P(InstStore, strh) { mov x6, -16 strh w1, [sp, x6, sxtx] )"); - 
EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 8), 0xABAB); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 6), 0x1234); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 4), 0xCD89); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 2), 0x3401); - EXPECT_EQ(getMemoryValue(process_->getStackPointer()), 0xABAB); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 20), 0x1234); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), + 0xABAB); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 6), + 0x1234); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), + 0xCD89); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 2), + 0x3401); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer()), + 0xABAB); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 20), + 0x1234); } TEST_P(InstStore, strd) { @@ -119,12 +136,18 @@ TEST_P(InstStore, strd) { mov x6, -16 str d1, [sp, x6, sxtx] )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 40), 2.0); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 32), -0.125); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 24), 7.5); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 16), 16.0); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 8), 2.0); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 56), -0.125); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 40), + 2.0); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 32), + -0.125); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 24), + 7.5); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 16), + 16.0); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), + 2.0); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 56), + -0.125); } TEST_P(InstStore, strq) { @@ -135,11 +158,16 @@ TEST_P(InstStore, strq) { str q0, [sp], -32 str q1, [sp, #16]! 
)"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 8), 0.125); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 16), 0.125); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 24), 0.25); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 32), 0.25); - EXPECT_EQ(getGeneralRegister(31), process_->getStackPointer() - 32); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), + 0.125); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 16), + 0.125); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 24), + 0.25); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 32), + 0.25); + EXPECT_EQ(getGeneralRegister(31), + process_->getInitialStackPointer() - 32); } TEST_P(InstStore, strs) { @@ -159,12 +187,17 @@ TEST_P(InstStore, strs) { mov x6, -8 str s1, [sp, x6, sxtx] )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 20), 2.f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 16), -0.125f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 12), 7.5f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 8), 16.f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 4), 2.f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 28), -0.125f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 20), + 2.f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 16), + -0.125f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 12), + 7.5f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), + 16.f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), 2.f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 28), + -0.125f); } TEST_P(InstStore, strw) { @@ -183,17 +216,17 @@ TEST_P(InstStore, strw) { mov x6, -16 str w1, [sp, x6, sxtx] )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 16), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 16), 0xABABull << 16); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 12), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 12), 0x1234ull << 16); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 8), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), 0xCD89ull << 16); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 4), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), 0x3401ull << 16); - EXPECT_EQ(getMemoryValue(process_->getStackPointer()), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer()), 0xABABull << 16); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 24), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 24), 0x1234ull << 16); } @@ -214,17 +247,17 @@ TEST_P(InstStore, strx) { mov x6, -16 str x1, [sp, x6, sxtx] )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 32), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 32), 0xABABull << 32); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 24), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 24), 0x1234ull << 32); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 16), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 16), 0xCD89ull << 32); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 8), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), 0x3401ull << 32); - EXPECT_EQ(getMemoryValue(process_->getStackPointer()), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer()), 
0xABABull << 32); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 48), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 48), 0x1234ull << 32); } @@ -253,7 +286,8 @@ TEST_P(InstStore, st1_single_struct) { add sp, sp, #15 st1 {v0.b}[12], [sp] )"); - EXPECT_EQ(getGeneralRegister(31), process_->getStackPointer() - 16); + EXPECT_EQ(getGeneralRegister(31), + process_->getInitialStackPointer() - 16); EXPECT_EQ(getMemoryValue(getGeneralRegister(31) - 48), static_cast(1)); EXPECT_EQ(getMemoryValue(getGeneralRegister(31) - 32), @@ -287,7 +321,8 @@ TEST_P(InstStore, st1_single_struct) { add sp, sp, #14 st1 {v0.h}[7], [sp] )"); - EXPECT_EQ(getGeneralRegister(31), process_->getStackPointer() - 16); + EXPECT_EQ(getGeneralRegister(31), + process_->getInitialStackPointer() - 16); EXPECT_EQ(getMemoryValue(getGeneralRegister(31) - 48), 0xab); EXPECT_EQ(getMemoryValue(getGeneralRegister(31) - 32), @@ -319,7 +354,8 @@ TEST_P(InstStore, st1_single_struct) { add sp, sp, #12 st1 {v0.s}[3], [sp] )"); - EXPECT_EQ(getGeneralRegister(31), process_->getStackPointer() - 16); + EXPECT_EQ(getGeneralRegister(31), + process_->getInitialStackPointer() - 16); EXPECT_EQ(getMemoryValue(getGeneralRegister(31) - 48), 0.5f); EXPECT_EQ(getMemoryValue(getGeneralRegister(31) - 32), 1); EXPECT_EQ(getMemoryValue(getGeneralRegister(31) - 16), 2); @@ -344,48 +380,187 @@ TEST_P(InstStore, st1_single_struct) { st1 {v0.d}[1], [sp], x4 st1 {v1.d}[0], [sp] )"); - EXPECT_EQ(getGeneralRegister(31), process_->getStackPointer() - 16); + EXPECT_EQ(getGeneralRegister(31), + process_->getInitialStackPointer() - 16); EXPECT_EQ(getMemoryValue(getGeneralRegister(31) - 32), 0.5); EXPECT_EQ(getMemoryValue(getGeneralRegister(31) - 16), 1000UL); EXPECT_EQ(getMemoryValue(getGeneralRegister(31)), 2000UL); } -TEST_P(InstStore, st1twov) { - // V.16B +TEST_P(InstStore, st1_multi_struct) { + // two reg, 16b elements RUN_AARCH64(R"( + mov x0, #32 movi v0.16b, #1 movi v1.16b, #2 - sub sp, sp, #32 + sub sp, sp, #96 + st1 {v0.16b, v1.16b}, [sp], #32 + st1 {v0.16b, v1.16b}, [sp], x0 st1 {v0.16b, v1.16b}, [sp] )"); - EXPECT_EQ(getGeneralRegister(31), process_->getStackPointer() - 32); - for (int i = 0; i < 16; i++) { - EXPECT_EQ(getMemoryValue(getGeneralRegister(31) + i), - (static_cast(1))); + EXPECT_EQ(getGeneralRegister(31), + process_->getInitialStackPointer() - 32); + for (int j = 2; j >= 0; j--) { + uint64_t base = getGeneralRegister(31) - (j * 32); + for (int i = 0; i < 16; i++) { + EXPECT_EQ(getMemoryValue(base + i), (static_cast(1))); + } + for (uint64_t i = 16; i < 32; i++) { + EXPECT_EQ(getMemoryValue(base + i), (static_cast(2))); + } } - for (uint64_t i = 16; i < 32; i++) { - EXPECT_EQ(getMemoryValue(getGeneralRegister(31) + i), - (static_cast(2))); + + // two reg, 2d elements + RUN_AARCH64(R"( + mov x0, #32 + mov x1, #1 + mov x2, #2 + dup v0.2d, x1 + dup v1.2d, x2 + sub sp, sp, #96 + st1 {v0.2d, v1.2d}, [sp], #32 + st1 {v0.2d, v1.2d}, [sp], x0 + st1 {v0.2d, v1.2d}, [sp] + )"); + EXPECT_EQ(getGeneralRegister(31), + process_->getInitialStackPointer() - 32); + for (int j = 2; j >= 0; j--) { + uint64_t base = getGeneralRegister(31) - (j * 32); + for (int i = 0; i < 2; i++) { + EXPECT_EQ(getMemoryValue(base + (i * 8)), + (static_cast(1))); + } + for (uint64_t i = 2; i < 4; i++) { + EXPECT_EQ(getMemoryValue(base + (i * 8)), + (static_cast(2))); + } } - // V.4S + // two reg, 4s elements RUN_AARCH64(R"( + mov x0, #32 movi v0.4s, #1 movi v1.4s, #2 - sub sp, sp, #32 + sub sp, sp, #96 + st1 {v0.4s, v1.4s}, [sp], #32 + st1 {v0.4s, v1.4s}, 
[sp], x0 st1 {v0.4s, v1.4s}, [sp] )"); - EXPECT_EQ(getGeneralRegister(31), process_->getStackPointer() - 32); - for (int i = 0; i < 4; i++) { - EXPECT_EQ( - getMemoryValue(getGeneralRegister(31) + (i * 4)), - (static_cast(1))); + EXPECT_EQ(getGeneralRegister(31), + process_->getInitialStackPointer() - 32); + for (int j = 2; j >= 0; j--) { + uint64_t base = getGeneralRegister(31) - (j * 32); + for (int i = 0; i < 4; i++) { + EXPECT_EQ(getMemoryValue(base + (i * 4)), + (static_cast(1))); + } + for (uint64_t i = 4; i < 8; i++) { + EXPECT_EQ(getMemoryValue(base + (i * 4)), + (static_cast(2))); + } } - for (uint64_t i = 4; i < 8; i++) { - EXPECT_EQ( - getMemoryValue(getGeneralRegister(31) + (i * 4)), - (static_cast(2))); + + // four reg, 16b elements + RUN_AARCH64(R"( + mov x0, #64 + movi v0.16b, #1 + movi v1.16b, #2 + movi v2.16b, #3 + movi v3.16b, #4 + sub sp, sp, #192 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [sp], #64 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [sp], x0 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [sp] + )"); + EXPECT_EQ(getGeneralRegister(31), + process_->getInitialStackPointer() - 64); + for (int j = 2; j >= 0; j--) { + uint64_t base = getGeneralRegister(31) - (j * 64); + for (int i = 0; i < 16; i++) { + EXPECT_EQ(getMemoryValue(base + i), (static_cast(1))); + } + for (uint64_t i = 16; i < 32; i++) { + EXPECT_EQ(getMemoryValue(base + i), (static_cast(2))); + } + for (int i = 32; i < 48; i++) { + EXPECT_EQ(getMemoryValue(base + i), (static_cast(3))); + } + for (uint64_t i = 48; i < 64; i++) { + EXPECT_EQ(getMemoryValue(base + i), (static_cast(4))); + } + } + + // four reg, 2d elements + RUN_AARCH64(R"( + mov x0, #64 + mov x1, #1 + mov x2, #2 + mov x3, #3 + mov x4, #4 + dup v0.2d, x1 + dup v1.2d, x2 + dup v2.2d, x3 + dup v3.2d, x4 + sub sp, sp, #192 + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [sp], #64 + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [sp], x0 + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [sp] + )"); + EXPECT_EQ(getGeneralRegister(31), + process_->getInitialStackPointer() - 64); + for (int j = 2; j >= 0; j--) { + uint64_t base = getGeneralRegister(31) - (j * 64); + for (int i = 0; i < 2; i++) { + EXPECT_EQ(getMemoryValue(base + (i * 8)), + (static_cast(1))); + } + for (uint64_t i = 2; i < 4; i++) { + EXPECT_EQ(getMemoryValue(base + (i * 8)), + (static_cast(2))); + } + for (int i = 4; i < 6; i++) { + EXPECT_EQ(getMemoryValue(base + (i * 8)), + (static_cast(3))); + } + for (uint64_t i = 6; i < 8; i++) { + EXPECT_EQ(getMemoryValue(base + (i * 8)), + (static_cast(4))); + } + } + + // four reg, 4s elements + RUN_AARCH64(R"( + mov x0, #64 + movi v0.4s, #1 + movi v1.4s, #2 + movi v2.4s, #3 + movi v3.4s, #4 + sub sp, sp, #192 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [sp], #64 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [sp], x0 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [sp] + )"); + EXPECT_EQ(getGeneralRegister(31), + process_->getInitialStackPointer() - 64); + for (int j = 2; j >= 0; j--) { + uint64_t base = getGeneralRegister(31) - (j * 64); + for (int i = 0; i < 4; i++) { + EXPECT_EQ(getMemoryValue(base + (i * 4)), + (static_cast(1))); + } + for (uint64_t i = 4; i < 8; i++) { + EXPECT_EQ(getMemoryValue(base + (i * 4)), + (static_cast(2))); + } + for (int i = 8; i < 12; i++) { + EXPECT_EQ(getMemoryValue(base + (i * 4)), + (static_cast(3))); + } + for (uint64_t i = 12; i < 16; i++) { + EXPECT_EQ(getMemoryValue(base + (i * 4)), + (static_cast(4))); + } } } @@ -411,9 +586,12 @@ TEST_P(InstStore, st1fourv_post) { st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [x1], x2 )"); - EXPECT_EQ(getGeneralRegister(31), process_->getStackPointer() - 
64); - EXPECT_EQ(getGeneralRegister(0), process_->getStackPointer() - 32); - EXPECT_EQ(getGeneralRegister(1), process_->getStackPointer() - 15); + EXPECT_EQ(getGeneralRegister(31), + process_->getInitialStackPointer() - 64); + EXPECT_EQ(getGeneralRegister(0), + process_->getInitialStackPointer() - 32); + EXPECT_EQ(getGeneralRegister(1), + process_->getInitialStackPointer() - 15); for (int i = 0; i < 2; i++) { EXPECT_EQ( getMemoryValue(getGeneralRegister(31) + (i * 4)), @@ -463,9 +641,11 @@ TEST_P(InstStore, st1fourv_post) { st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], x2 )"); EXPECT_EQ(getGeneralRegister(31), - process_->getStackPointer() - 128); - EXPECT_EQ(getGeneralRegister(0), process_->getStackPointer() - 64); - EXPECT_EQ(getGeneralRegister(1), process_->getStackPointer() - 47); + process_->getInitialStackPointer() - 128); + EXPECT_EQ(getGeneralRegister(0), + process_->getInitialStackPointer() - 64); + EXPECT_EQ(getGeneralRegister(1), + process_->getInitialStackPointer() - 47); for (int i = 0; i < 4; i++) { EXPECT_EQ( getMemoryValue(getGeneralRegister(31) + (i * 4)), @@ -506,7 +686,8 @@ TEST_P(InstStore, st2_multi_struct) { st2 {v2.4s, v3.4s}, [sp], x1 st2 {v0.4s, v1.4s}, [sp], #32 )"); - EXPECT_EQ(getGeneralRegister(31), process_->getStackPointer()); + EXPECT_EQ(getGeneralRegister(31), + process_->getInitialStackPointer()); for (int i = 0; i < 4; i++) { EXPECT_EQ( getMemoryValue(getGeneralRegister(31) - 32 + 8 * i), @@ -538,14 +719,22 @@ TEST_P(InstStore, stpd) { stp d3, d0, [sp, 16] )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 64), 2.0); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 56), -0.125); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 48), -0.125); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 40), 7.5); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 32), 7.5); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 24), 16.0); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 16), 16.0); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 8), 2.0); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 64), + 2.0); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 56), + -0.125); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 48), + -0.125); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 40), + 7.5); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 32), + 7.5); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 24), + 16.0); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 16), + 16.0); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), + 2.0); } TEST_P(InstStore, stps) { @@ -560,14 +749,21 @@ TEST_P(InstStore, stps) { stp s2, s3, [sp, 8]! 
stp s3, s0, [sp, 8] )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 32), 2.f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 28), -0.125f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 24), -0.125f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 20), 7.5f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 16), 7.5f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 12), 16.f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 8), 16.f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 4), 2.f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 32), + 2.f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 28), + -0.125f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 24), + -0.125f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 20), + 7.5f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 16), + 7.5f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 12), + 16.f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), + 16.f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), 2.f); } TEST_P(InstStore, stpwi) { @@ -576,8 +772,10 @@ TEST_P(InstStore, stpwi) { movz w1, #42 stp w0, w1, [sp, -8] )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 8), 7u); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 4), 42u); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), + 7u); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), + 42u); } TEST_P(InstStore, stpq) { @@ -592,23 +790,40 @@ TEST_P(InstStore, stpq) { stp q2, q3, [sp, 32]! stp q3, q0, [sp, 32] )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 128), 2.f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 120), 2.f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 112), -0.125f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 104), -0.125f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 96), -0.125f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 88), -0.125f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 80), 7.5f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 72), 7.5f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 64), 7.5f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 56), 7.5f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 48), 16.f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 40), 16.f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 32), 16.f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 24), 16.f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 16), 2.f); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 8), 2.f); - EXPECT_EQ(getGeneralRegister(31), process_->getStackPointer() - 64); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 128), + 2.f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 120), + 2.f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 112), + -0.125f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 104), + -0.125f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 96), + -0.125f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 88), + -0.125f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 80), + 7.5f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 72), + 7.5f); + 
EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 64), + 7.5f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 56), + 7.5f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 48), + 16.f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 40), + 16.f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 32), + 16.f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 24), + 16.f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 16), + 2.f); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), + 2.f); + EXPECT_EQ(getGeneralRegister(31), + process_->getInitialStackPointer() - 64); } TEST_P(InstStore, stpx) { @@ -626,12 +841,18 @@ TEST_P(InstStore, stpx) { stp x2, x3, [sp] stp x4, x5, [sp, #16]! )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1024), 7u); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1016), 42u); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1008), 8u); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 1000), 43u); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 992), 9u); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 984), 44u); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1024), + 7u); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1016), + 42u); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1008), + 8u); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1000), + 43u); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 992), + 9u); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 984), + 44u); } TEST_P(InstStore, stur) { @@ -639,32 +860,38 @@ TEST_P(InstStore, stur) { movz w0, #42 stur w0, [sp, #-4] )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 4), 42u); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), + 42u); RUN_AARCH64(R"( movz x0, #42 stur x0, [sp, #-8] )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 8), 42u); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), + 42u); RUN_AARCH64(R"( fmov s0, -0.125 stur s0, [sp, #-4] )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 4), -0.125); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), + -0.125); RUN_AARCH64(R"( fmov d0, -0.125 stur d0, [sp, #-8] )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 8), -0.125); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), + -0.125); RUN_AARCH64(R"( fmov v0.2d, -0.125 stur q0, [sp, #-16] )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 16), -0.125); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 8), -0.125); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 16), + -0.125); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), + -0.125); } TEST_P(InstStore, sturh) { @@ -674,13 +901,14 @@ TEST_P(InstStore, sturh) { movz w1, #128 sturh w1, [sp, #-4] )"); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 2), 42u); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 4), 128u); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 2), + 42u); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), + 128u); } INSTANTIATE_TEST_SUITE_P(AArch64, InstStore, - ::testing::Values(std::make_tuple(EMULATION, - YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}")), paramToString); } // namespace \ No newline at end 
of file diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index d1f8e6ea2b..6a52d46b95 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6,6 +6,7 @@ namespace { using InstSve = AArch64RegressionTest; +using namespace simeng::arch::aarch64::InstructionGroups; TEST_P(InstSve, addvl) { // 64-bits @@ -1883,13 +1884,15 @@ TEST_P(InstSve, eor) { CHECK_PREDICATE(2, uint64_t, res_p2); CHECK_PREDICATE(3, uint64_t, {0, 0, 0, 0}); auto res_p4 = fillPred(VL / 8, {0}, 1); - for (int i = 0; i < (VL / 8); i++) { + for (uint64_t i = 0; i < (VL / 8); i++) { uint64_t shifted_active = 1ull << (i % 64); res_p4[i / 64] |= (p1[i / 64] & shifted_active) == shifted_active ? 0 : shifted_active; } CHECK_PREDICATE(4, uint64_t, res_p4); + EXPECT_GROUP(R"(not p4.b, p0/z, p1.b)", PREDICATE); + // Vectors, Predicated RUN_AARCH64(R"( # 8-bit @@ -1954,7 +1957,7 @@ TEST_P(InstSve, eor) { )"); auto res_0 = fillNeon({0}, VL / 8); int val = 8; - for (int i = 0; i < (VL / 8); i++) { + for (uint64_t i = 0; i < (VL / 8); i++) { res_0[i] = val ^ 15; val += 2; } @@ -1963,7 +1966,7 @@ TEST_P(InstSve, eor) { auto res_3 = fillNeon({0}, VL / 8); val = 8; - for (int i = 0; i < (VL / 16); i++) { + for (uint64_t i = 0; i < (VL / 16); i++) { res_3[i] = val ^ 15; val += 2; } @@ -1972,7 +1975,7 @@ TEST_P(InstSve, eor) { auto res_6 = fillNeon({0}, VL / 8); val = 8; - for (int i = 0; i < (VL / 32); i++) { + for (uint64_t i = 0; i < (VL / 32); i++) { res_6[i] = val ^ 15; val += 2; } @@ -1981,12 +1984,22 @@ TEST_P(InstSve, eor) { auto res_9 = fillNeon({0}, VL / 8); val = 8; - for (int i = 0; i < (VL / 64); i++) { + for (uint64_t i = 0; i < (VL / 64); i++) { res_9[i] = val ^ 15; val += 2; } CHECK_NEON(9, uint64_t, res_9); CHECK_NEON(10, uint64_t, fillNeonCombined({12}, {15}, VL / 8)); + + // Vectors, Unpredicated + RUN_AARCH64(R"( + # 64-bit + dup z1.d, #15 + dup z2.d, #3 + + eor z0.d, z1.d, z2.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({12}, VL / 8)); } TEST_P(InstSve, inc) { @@ -2232,6 +2245,36 @@ TEST_P(InstSve, add) { CHECK_NEON(2, uint32_t, fillNeon({12}, VL / 8)); CHECK_NEON(3, uint64_t, fillNeon({10}, VL / 8)); + // Immediate + RUN_AARCH64(R"( + dup z0.b, #8 + dup z1.h, #7 + dup z2.s, #6 + dup z3.d, #5 + dup z4.b, #8 + dup z5.h, #7 + dup z6.s, #6 + dup z7.d, #5 + + add z0.b, z0.b, #8 + add z1.h, z1.h, #0x7 + add z2.s, z2.s, #128 + add z3.d, z3.d, #0x5 + + add z4.b, z4.b, #0x8, LSL #0 + add z5.h, z5.h, #7, LSL #8 + add z6.s, z6.s, #0x80, LSL #8 + add z7.d, z7.d, #5, LSL #8 + )"); + CHECK_NEON(0, uint8_t, fillNeon({16}, VL / 8)); + CHECK_NEON(1, uint16_t, fillNeon({14}, VL / 8)); + CHECK_NEON(2, uint32_t, fillNeon({134}, VL / 8)); + CHECK_NEON(3, uint64_t, fillNeon({10}, VL / 8)); + CHECK_NEON(4, uint8_t, fillNeon({16}, VL / 8)); + CHECK_NEON(5, uint16_t, fillNeon({1799}, VL / 8)); + CHECK_NEON(6, uint32_t, fillNeon({32774}, VL / 8)); + CHECK_NEON(7, uint64_t, fillNeon({1285}, VL / 8)); + // Predicated RUN_AARCH64(R"( mov x0, #0 @@ -2762,7 +2805,7 @@ TEST_P(InstSve, fadda) { )"); float fresultA = 2.75f; float fresultB = 2.75f; - for (int i = 0; i < VL / 64; i++) { + for (uint64_t i = 0; i < VL / 64; i++) { fresultA += fsrc[i % 8]; fresultB += fsrc[(i + VL / 64) % 8]; } @@ -2801,7 +2844,7 @@ TEST_P(InstSve, fadda) { )"); double resultA = 2.75; double resultB = 2.75; - for (int i = 0; i < VL / 128; i++) { + for (uint64_t i = 0; i < VL / 128; i++) { resultA += dsrc[i % 8]; resultB += dsrc[(i + VL / 128) % 8]; } @@ -3760,8 +3803,9 
@@ TEST_P(InstSve, fmla_indexed) { )"); std::vector resultsA; std::vector resultsB; - float itemA; - float itemB; + // Redundant initialisation to prevent warnings + float itemA = 0.f; + float itemB = 0.f; for (size_t i = 0; i < (VL / 32); i++) { if (i % 4 == 0) { itemA = 5.0f + (5.0f * static_cast(i + 1)); @@ -3789,8 +3833,9 @@ TEST_P(InstSve, fmla_indexed) { )"); std::vector resultsC; std::vector resultsD; - double itemC; - double itemD; + // Redundant initialisation to prevent warnings + double itemC = 0.f; + double itemD = 0.f; for (size_t i = 0; i < (VL / 64); i++) { if (i % 2 == 0) { itemC = 5.0 + (5.0 * static_cast(i)); @@ -4025,7 +4070,7 @@ TEST_P(InstSve, frintn) { initialHeapData_.resize(VL / 8); float* fheap = reinterpret_cast(initialHeapData_.data()); std::vector fsrcA = {1.0f, -42.5f, -0.125f, 0.0f, - 40.5f, -684.72f, -0.15f, 107.86f}; + 41.5f, -684.72f, -0.15f, 107.86f}; std::vector fsrcB = {-34.5f, -0.917f, 0.0f, 80.72f, -125.5f, -0.01f, 701.90f, 7.5f}; fillHeapCombined(fheap, fsrcA, fsrcB, VL / 32); @@ -4044,18 +4089,19 @@ TEST_P(InstSve, frintn) { ptrue p0.s whilelo p1.s, xzr, x2 - dup z0.s, #15 - dup z1.s, #13 + fdup z0.s, #2.0 + fdup z1.s, #3.0 ld1w {z2.s}, p0/z, [x0, x1, lsl #2] frintn z0.s, p0/m, z2.s frintn z1.s, p1/m, z2.s )"); - std::vector results32A = {1, -42, 0, 0, 40, -685, 0, 108}; - std::vector results32B = {-34, -1, 0, 81, -126, 0, 702, 8}; - CHECK_NEON(0, int32_t, - fillNeonCombined(results32A, results32B, VL / 8)); - CHECK_NEON(1, int32_t, fillNeonCombined(results32A, {13}, VL / 8)); + std::vector results32A = {1.0f, -42.0f, 0.0f, 0.0f, + 42.0f, -685.0f, 0.0f, 108.0f}; + std::vector results32B = {-34.0f, -1.0f, 0.0f, 81.0f, + -126.0f, 0.0f, 702.0f, 8.0f}; + CHECK_NEON(0, float, fillNeonCombined(results32A, results32B, VL / 8)); + CHECK_NEON(1, float, fillNeonCombined(results32A, {3.0}, VL / 8)); // 64-bit initialHeapData_.resize(VL / 8); @@ -4078,18 +4124,18 @@ TEST_P(InstSve, frintn) { ptrue p0.d whilelo p1.d, xzr, x2 - dup z0.d, #15 - dup z1.d, #13 + fdup z0.d, #2.0 + fdup z1.d, #3.0 ld1d {z2.d}, p0/z, [x0, x1, lsl #3] frintn z0.d, p0/m, z2.d frintn z1.d, p1/m, z2.d )"); - std::vector results64A = {1, -42, 0, 0}; - std::vector results64B = {40, -685, -4, 108}; - CHECK_NEON(0, int64_t, - fillNeonCombined(results64A, results64B, VL / 8)); - CHECK_NEON(1, int64_t, fillNeonCombined(results64A, {13}, VL / 8)); + std::vector results64A = {1.0, -42.0, 0.0, 0.0}; + std::vector results64B = {40.0, -685.0, -4.0, 108.0}; + CHECK_NEON(0, double, + fillNeonCombined(results64A, results64B, VL / 8)); + CHECK_NEON(1, double, fillNeonCombined(results64A, {3.0}, VL / 8)); } TEST_P(InstSve, fsqrt) { @@ -4639,8 +4685,9 @@ TEST_P(InstSve, ld1rqw) { # Load and broadcast values from heap ptrue p0.s add x1, x0, #-8 + mov x3, #4 ld1rqw {z0.s}, p0/z, [x0] - ld1rqw {z1.s}, p0/z, [x1, #16] + ld1rqw {z1.s}, p0/z, [x1, x3, lsl #2] # Test for inactive lanes ptrue p1.s, vl1 @@ -4690,12 +4737,11 @@ TEST_P(InstSve, ld1rw) { } TEST_P(InstSve, ld1b) { - initialHeapData_.resize(VL / 8); + initialHeapData_.resize(VL / 4); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); - fillHeap(heap8, - {0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, 0x32, 0x54, - 0x76, 0x98, 0x01, 0xEF, 0xCD, 0xAB}, - VL / 8); + std::vector src = {0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, + 0x32, 0x54, 0x76, 0x98, 0x01, 0xEF, 0xCD, 0xAB}; + fillHeap(heap8, src, VL / 4); RUN_AARCH64(R"( # Get heap address @@ -4716,6 +4762,7 @@ TEST_P(InstSve, ld1b) { mov x2, #0 whilelo p1.b, xzr, x1 ld1b {z1.b}, p1/z, 
[x0, x2] + ld1b {z2.b}, p1/z, [x0, #1, mul vl] )"); CHECK_NEON(0, uint8_t, fillNeon({0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, @@ -4725,6 +4772,8 @@ TEST_P(InstSve, ld1b) { fillNeon({0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, 0x32, 0x54, 0x76, 0x98, 0x01, 0xEF, 0xCD, 0xAB}, VL / 16)); + std::rotate(src.begin(), src.begin() + ((VL / 8) % 16), src.end()); + CHECK_NEON(2, uint8_t, fillNeon(src, VL / 16)); } TEST_P(InstSve, ld1sw_gather) { @@ -4755,27 +4804,55 @@ TEST_P(InstSve, ld1sw_gather) { } TEST_P(InstSve, ld1w_gather) { + initialHeapData_.resize(VL / 4); + uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + fillHeap(heap32, src, VL / 32); + // Scalar plus vector // 64-bit RUN_AARCH64(R"( - mov x0, #800 - index z1.d, x0, #8 - dup z2.d, #8 + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 - ptrue p0.d + ptrue p0.s + ptrue p1.d + mov x1, #0 - mov x2, #16 + mov x3, #8 addvl x1, x1, #1 - udiv x1, x1, x2 - whilelo p1.d, xzr, x1 - - # Put data into memory so we have something to load - st1d {z2.d}, p0, [z1.d] + udiv x1, x1, x3 + whilelo p2.s, xzr, x1 + + mov x1, #0 + mov x3, #16 + addvl x1, x1, #1 + udiv x1, x1, x3 + whilelo p3.d, xzr, x1 - index z4.d, #0, #2 - ld1w {z5.d}, p1/z, [x0, z4.d, lsl #2] + index z3.s, #0, #4 + index z4.d, #0, #1 + ld1w {z5.s}, p0/z, [x0, z3.s, sxtw] + ld1w {z6.d}, p1/z, [x0, z4.d, lsl #2] + ld1w {z7.s}, p2/z, [x0, z3.s, sxtw] + ld1w {z8.d}, p3/z, [x0, z4.d, lsl #2] )"); - CHECK_NEON(5, uint64_t, fillNeonCombined({8}, {0}, VL / 8)); + CHECK_NEON(5, uint32_t, + fillNeon( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, VL / 8)); + CHECK_NEON(6, uint64_t, + fillNeon( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, VL / 8)); + CHECK_NEON( + 7, uint32_t, + fillNeonCombined( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, {0}, VL / 8)); + CHECK_NEON( + 8, uint64_t, + fillNeonCombined( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, {0}, VL / 8)); } TEST_P(InstSve, ld1d_gather) { @@ -6088,11 +6165,11 @@ TEST_P(InstSve, smulh) { } TEST_P(InstSve, st1b) { - initialHeapData_.resize(VL / 8); + initialHeapData_.resize(VL / 4); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); std::vector src = {0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, 0x32, 0x54, 0x76, 0x98, 0x01, 0xEF, 0xCD, 0xAB}; - fillHeap(heap8, src, VL / 8); + fillHeap(heap8, src, VL / 4); RUN_AARCH64(R"( # Get heap address @@ -6100,12 +6177,14 @@ TEST_P(InstSve, st1b) { mov x8, 214 svc #0 + sub sp, sp, #4095 + mov x10, sp sub sp, sp, #4095 mov x1, #0 ptrue p0.b ld1b {z0.b}, p0/z, [x0, x1] - st1b {z0.b}, p0, [sp, x1] + st1b {z0.b}, p0, [x10, x1] mov x2, #0 mov x4, #2 @@ -6114,16 +6193,25 @@ TEST_P(InstSve, st1b) { mov x3, #0 whilelo p1.b, xzr, x2 + mov x5, #4 + mul x2, x2, x5 + ld1b {z1.b}, p1/z, [x0, x3] st1b {z1.b}, p1, [x2, x3] + st1b {z1.b}, p1, [sp, #4, mul vl] )"); - for (int i = 0; i < (VL / 8); i++) { - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 4095 + i), - src[i % 16]); + for (uint64_t i = 0; i < (VL / 8); i++) { + EXPECT_EQ( + getMemoryValue(process_->getInitialStackPointer() - 4095 + i), + src[i % 16]); + } + for (uint64_t i = 0; i < (VL / 16); i++) { + EXPECT_EQ(getMemoryValue(4 * (VL / 16) + i), src[i % 16]); } - for (int i = 0; i < (VL / 16); i++) { - EXPECT_EQ(getMemoryValue((VL / 16) + i), src[i % 16]); + uint64_t base = process_->getInitialStackPointer() - 8190 + 4 * (VL / 8); + for (uint64_t i = 0; i < (VL / 16); i++) { + EXPECT_EQ(getMemoryValue(base + i), src[i 
% 16]); } } @@ -6160,8 +6248,9 @@ TEST_P(InstSve, st1b_scatter) { )"); for (uint64_t i = 0; i < VL / 64; i++) { - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - (3 * i)), - src[(8 * i) % 16]); + EXPECT_EQ( + getMemoryValue(process_->getInitialStackPointer() - (3 * i)), + src[(8 * i) % 16]); } for (uint64_t i = 0; i < VL / 128; i++) { @@ -6278,19 +6367,19 @@ TEST_P(InstSve, st1d) { st1d {z1.d}, p1, [x2, x3, lsl #3] )"); - for (int i = 0; i < (VL / 64); i++) { - EXPECT_EQ( - getMemoryValue(process_->getStackPointer() - 4095 + (i * 8)), - src[i % 4]); + for (uint64_t i = 0; i < (VL / 64); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 8)), + src[i % 4]); } - for (int i = 0; i < (VL / 64); i++) { + for (uint64_t i = 0; i < (VL / 64); i++) { EXPECT_EQ(getMemoryValue(65792 + (i * 8)), src[i % 4]); } std::rotate(src.begin(), src.begin() + 2, src.end()); - for (int i = 0; i < (VL / 128); i++) { + for (uint64_t i = 0; i < (VL / 128); i++) { EXPECT_EQ(getMemoryValue((VL / 128) + 16 + (i * 8)), src[i % 4]); } - for (int i = 0; i < (VL / 128); i++) { + for (uint64_t i = 0; i < (VL / 128); i++) { EXPECT_EQ(getMemoryValue((VL / 128) + (VL / 2) + (i * 8)), src[i % 4]); } @@ -6318,17 +6407,17 @@ TEST_P(InstSve, st2d) { st2d {z2.d, z3.d}, p1, [x6, #4, mul vl] )"); - for (int i = 0; i < (VL / 64); i++) { - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 4095 + - (2 * i * 8)), + for (uint64_t i = 0; i < (VL / 64); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (2 * i * 8)), 3); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 4095 + - (2 * i * 8) + 8), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (2 * i * 8) + 8), 4); } int index = 4 * (VL / 64) * 8; - for (int i = 0; i < (VL / 128); i++) { + for (uint64_t i = 0; i < (VL / 128); i++) { EXPECT_EQ(getMemoryValue(300 + index + (2 * i * 8)), 5); EXPECT_EQ(getMemoryValue(300 + index + (2 * i * 8) + 8), 6); } @@ -6423,12 +6512,12 @@ TEST_P(InstSve, st1w) { st1w {z2.s}, p0, [x4] )"); - for (int i = 0; i < (VL / 32); i++) { - EXPECT_EQ( - getMemoryValue(process_->getStackPointer() - 4095 + (i * 4)), - src[i % 4]); + for (uint64_t i = 0; i < (VL / 32); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 4)), + src[i % 4]); } - for (int i = 0; i < (VL / 32); i++) { + for (uint64_t i = 0; i < (VL / 32); i++) { EXPECT_EQ(getMemoryValue((VL / 8) + (i * 4)), src[i % 4]); } @@ -6451,80 +6540,67 @@ TEST_P(InstSve, st1w) { st1w {z1.s}, p1, [x2, x3, lsl #2] )"); - for (int i = 0; i < (VL / 64); i++) { + for (uint64_t i = 0; i < (VL / 64); i++) { EXPECT_EQ(getMemoryValue((VL / 64) + (VL / 2) + (i * 4)), src[i % 4]); } - for (int i = 0; i < (VL / 64); i++) { + for (uint64_t i = 0; i < (VL / 64); i++) { EXPECT_EQ(getMemoryValue((VL / 64) + 16 + (i * 4)), src[i % 4]); } - // 64-bit - // initialHeapData_.resize(64); - // uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); - // heap64[0] = 0xDEADBEEFDEADBEEF; - // heap64[1] = 0x1234567812345678; - // heap64[2] = 0x9876543298765432; - // heap64[3] = 0xABCDEF01ABCDEF01; - // heap64[4] = 0xDEADBEEFDEADBEEF; - // heap64[5] = 0x1234567812345678; - // heap64[6] = 0x9876543298765432; - // heap64[7] = 0xABCDEF01ABCDEF01; - - // RUN_AARCH64(R"( - // # Get heap address - // mov x0, 0 - // mov x8, 214 - // svc #0 - - // mov x1, #0 - // mov x4, #64 - // mov x5, #3 - // ptrue p0.d - // ld1w {z0.d}, p0/z, [x0, x1, lsl #3] - // ld1w {z2.d}, p0/z, [x0, x1, lsl #3] - // st1w {z0.d}, p0, 
[sp, x1, lsl #2] - // st1w {z2.d}, p0, [x4, x5, lsl #2] - // )"); - // CHECK_NEON(0, uint64_t, - // {0xDEADBEEFDEADBEEFu, 0x1234567812345678u, - // 0x9876543298765432u, - // 0xABCDEF01ABCDEF01u, 0xDEADBEEFDEADBEEFu, - // 0x1234567812345678u, 0x9876543298765432u, - // 0xABCDEF01ABCDEF01u}); - // CHECK_NEON(2, uint64_t, - // {0xDEADBEEFDEADBEEFu, 0x1234567812345678u, - // 0x9876543298765432u, - // 0xABCDEF01ABCDEF01u, 0xDEADBEEFDEADBEEFu, - // 0x1234567812345678u, 0x9876543298765432u, - // 0xABCDEF01ABCDEF01u}); - - // EXPECT_EQ(getMemoryValue(process_->getStackPointer()), - // 0xDEADBEEF); - // EXPECT_EQ(getMemoryValue(process_->getStackPointer() - // + 4), - // 0x12345678); - // EXPECT_EQ(getMemoryValue(process_->getStackPointer() + 8), - // 0x98765432); - // EXPECT_EQ(getMemoryValue(process_->getStackPointer() + 12), - // 0xABCDEF01); - // EXPECT_EQ(getMemoryValue(process_->getStackPointer() + 16), - // 0xDEADBEEF); - // EXPECT_EQ(getMemoryValue(process_->getStackPointer() + 20), - // 0x12345678); - // EXPECT_EQ(getMemoryValue(process_->getStackPointer() + 24), - // 0x98765432); - // EXPECT_EQ(getMemoryValue(process_->getStackPointer() + 28), - // 0xABCDEF01); - - // EXPECT_EQ(getMemoryValue(64 + (3 * 4)), 0xDEADBEEF); - // EXPECT_EQ(getMemoryValue(64 + (3 * 4) + 4), 0x12345678); - // EXPECT_EQ(getMemoryValue(64 + (3 * 4) + 8), 0x98765432); - // EXPECT_EQ(getMemoryValue(64 + (3 * 4) + 12), 0xABCDEF01); - // EXPECT_EQ(getMemoryValue(64 + (3 * 4) + 16), 0xDEADBEEF); - // EXPECT_EQ(getMemoryValue(64 + (3 * 4) + 20), 0x12345678); - // EXPECT_EQ(getMemoryValue(64 + (3 * 4) + 24), 0x98765432); - // EXPECT_EQ(getMemoryValue(64 + (3 * 4) + 28), 0xABCDEF01); + // 64 - bit + initialHeapData_.resize(VL / 8); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector srcA = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + std::vector srcB = {0xDEADBEEFDEADBEEF, 0x1234567812345678, + 0x9876543298765432, 0xABCDEF01ABCDEF01}; + fillHeapCombined(heap64, srcA, srcB, VL / 64); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + sub sp, sp, #4095 + + ptrue p0.d + mov x2, #0 + mov x5, #16 + addvl x2, x2, #1 + udiv x2, x2, x5 + mov x3, #2 + whilelo p1.d, xzr, x2 + + mov x1, #0 + mov x6, #64 + mov x7, #3 + + ld1d {z0.d}, p1/z, [x0, x1, lsl #3] + ld1d {z2.d}, p0/z, [x0, x1, lsl #3] + st1w {z0.d}, p1, [sp, x1, lsl #2] + st1w {z2.d}, p0, [x6, x7, lsl #2] + )"); + + CHECK_NEON(0, uint64_t, fillNeonCombined(srcA, {0ull}, VL / 8)); + CHECK_NEON(2, uint64_t, fillNeonCombined(srcA, srcB, VL / 8)); + + std::array srcC = + fillNeonCombined( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, {0ul}, VL / 16); + for (uint64_t i = 0; i < (VL / 64); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 4)), + srcC[i]); + } + + std::array srcD = + fillNeonCombined( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, VL / 16); + for (uint64_t i = 0; i < (VL / 64); i++) { + EXPECT_EQ(getMemoryValue(64 + (3 + i) * 4), srcD[i]); + } } TEST_P(InstSve, str_predicate) { @@ -6543,9 +6619,10 @@ TEST_P(InstSve, str_predicate) { ldr p0, [x0, #0, mul vl] str p0, [sp, #0, mul vl] )"); - for (int i = 0; i < (VL / 64); i++) { - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - 4095 + i), - 0xFF); + for (uint64_t i = 0; i < (VL / 64); i++) { + EXPECT_EQ( + getMemoryValue(process_->getInitialStackPointer() - 4095 + i), + 0xFF); } fillHeap(heap8, {0xDE}, VL / 64); @@ -6560,8 +6637,8 @@ TEST_P(InstSve, 
str_predicate) { ldr p0, [x0, #0, mul vl] str p0, [sp, #1, mul vl] )"); - for (int i = 0; i < (VL / 64); i++) { - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - + for (uint64_t i = 0; i < (VL / 64); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - (4095 - (VL / 64)) + i), 0xDE); } @@ -6578,8 +6655,8 @@ TEST_P(InstSve, str_predicate) { ldr p0, [x0, #0, mul vl] str p0, [sp, #2, mul vl] )"); - for (int i = 0; i < (VL / 64); i++) { - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - + for (uint64_t i = 0; i < (VL / 64); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - (4095 - (VL / 64) * 2) + i), 0x12); } @@ -6596,8 +6673,8 @@ TEST_P(InstSve, str_predicate) { ldr p0, [x0, #0, mul vl] str p0, [sp, #3, mul vl] )"); - for (int i = 0; i < (VL / 64); i++) { - EXPECT_EQ(getMemoryValue(process_->getStackPointer() - + for (uint64_t i = 0; i < (VL / 64); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - (4095 - (VL / 64) * 3) + i), 0x98); } @@ -6625,12 +6702,12 @@ TEST_P(InstSve, str_vector) { str z0, [sp, #0, mul vl] str z1, [x1, #4, mul vl] )"); - for (int i = 0; i < (VL / 64); i++) { - EXPECT_EQ( - getMemoryValue(process_->getStackPointer() - 4095 + (i * 8)), - src[i % 8]); + for (uint64_t i = 0; i < (VL / 64); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 8)), + src[i % 8]); } - for (int i = 0; i < (VL / 64); i++) { + for (uint64_t i = 0; i < (VL / 64); i++) { EXPECT_EQ(getMemoryValue((VL / 8) + (VL / 2) + (i * 8)), src[i % 8]); } @@ -6728,7 +6805,7 @@ TEST_P(InstSve, trn1) { std::vector result8; int i1 = 0; int i2 = 10; - for (int i = 0; i < VL / 16; i++) { + for (uint64_t i = 0; i < VL / 16; i++) { result8.push_back(i1); result8.push_back(i2); i1 += 2; @@ -6746,7 +6823,7 @@ TEST_P(InstSve, trn1) { std::vector result16; i1 = 0; i2 = 10; - for (int i = 0; i < VL / 32; i++) { + for (uint64_t i = 0; i < VL / 32; i++) { result16.push_back(i1); result16.push_back(i2); i1 += 2; @@ -6764,7 +6841,7 @@ TEST_P(InstSve, trn1) { std::vector result32; i1 = 0; i2 = 10; - for (int i = 0; i < VL / 64; i++) { + for (uint64_t i = 0; i < VL / 64; i++) { result32.push_back(i1); result32.push_back(i2); i1 += 2; @@ -6782,7 +6859,7 @@ TEST_P(InstSve, trn1) { std::vector result64; i1 = 0; i2 = 10; - for (int i = 0; i < VL / 128; i++) { + for (uint64_t i = 0; i < VL / 128; i++) { result64.push_back(i1); result64.push_back(i2); i1 += 2; @@ -6802,7 +6879,7 @@ TEST_P(InstSve, trn2) { std::vector result8; int i1 = 1; int i2 = 11; - for (int i = 0; i < VL / 16; i++) { + for (uint64_t i = 0; i < VL / 16; i++) { result8.push_back(i1); result8.push_back(i2); i1 += 2; @@ -6820,7 +6897,7 @@ TEST_P(InstSve, trn2) { std::vector result16; i1 = 1; i2 = 11; - for (int i = 0; i < VL / 32; i++) { + for (uint64_t i = 0; i < VL / 32; i++) { result16.push_back(i1); result16.push_back(i2); i1 += 2; @@ -6838,7 +6915,7 @@ TEST_P(InstSve, trn2) { std::vector result32; i1 = 1; i2 = 11; - for (int i = 0; i < VL / 64; i++) { + for (uint64_t i = 0; i < VL / 64; i++) { result32.push_back(i1); result32.push_back(i2); i1 += 2; @@ -6856,7 +6933,7 @@ TEST_P(InstSve, trn2) { std::vector result64; i1 = 1; i2 = 11; - for (int i = 0; i < VL / 128; i++) { + for (uint64_t i = 0; i < VL / 128; i++) { result64.push_back(i1); result64.push_back(i2); i1 += 2; @@ -7933,7 +8010,7 @@ TEST_P(InstSve, zip) { } #if SIMENG_LLVM_VERSION >= 14 -// If LLVm version supports SVE2 : +// If LLVM version supports SVE2 : TEST_P(InstSve, psel) { RUN_AARCH64(R"( mov 
w13, #0 diff --git a/test/regression/riscv/CMakeLists.txt b/test/regression/riscv/CMakeLists.txt index 24050bfa48..fdc4826d82 100644 --- a/test/regression/riscv/CMakeLists.txt +++ b/test/regression/riscv/CMakeLists.txt @@ -1,6 +1,7 @@ add_executable(regression-riscv RISCVRegressionTest.cc RISCVRegressionTest.hh + Exception.cc LoadStoreQueue.cc InorderPipeline.cc SmokeTest.cc @@ -12,13 +13,22 @@ add_executable(regression-riscv instructions/jump.cc instructions/branch.cc instructions/atomic.cc - ) + instructions/float.cc + instructions/compressed.cc + ) + +configure_file(${capstone_SOURCE_DIR}/arch/RISCV/RISCVGenInstrInfo.inc RISCVGenInstrInfo.inc COPYONLY) + target_include_directories(regression-riscv PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}) + ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(regression-riscv PRIVATE + ${CMAKE_CURRENT_BINARY_DIR}) + target_link_libraries(regression-riscv regression-test-base) +target_compile_options(regression-riscv PRIVATE ${SIMENG_COMPILE_OPTIONS}) # Define a macro so that tests can find data files target_compile_definitions(regression-riscv PRIVATE "SIMENG_RISCV_TEST_ROOT=\"${CMAKE_CURRENT_SOURCE_DIR}\"") -add_test(NAME regression-riscv-test COMMAND regression-riscv) +add_test(NAME regression-riscv-test COMMAND regression-riscv) \ No newline at end of file diff --git a/test/regression/riscv/Exception.cc b/test/regression/riscv/Exception.cc new file mode 100644 index 0000000000..fba1592ad0 --- /dev/null +++ b/test/regression/riscv/Exception.cc @@ -0,0 +1,144 @@ +#include +#include + +#include "RISCVRegressionTest.hh" + +namespace { + +using Exception = RISCVRegressionTest; + +/** RISCV opcodes. Each opcode represents a unique RISCV operation. */ +namespace Opcode { +#define GET_INSTRINFO_ENUM +#include "RISCVGenInstrInfo.inc" +} // namespace Opcode + +// Test that an invalid capstone instruction id raises an encoding unallocated +// exception +TEST_P(Exception, encoding_unallocated) { + // Initialise heap with an unallocated encoding + initialHeapData_.resize(4); + uint32_t* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 0xDEDEDEDE; + RUN_RISCV(R"( + # Get heap address + li a0, 0 + li a7, 214 + ecall + mv t0, a0 + + jalr a0, t0, 0 + )"); + const char err[] = + "\n[SimEng:ExceptionHandler] Encountered unallocated instruction " + "encoding exception"; + EXPECT_EQ(stdout_.substr(0, strlen(err)), err); +} + +// Test that an instruction with no implemented execution logic raises a +// not-yet-implemented exception +TEST_P(Exception, not_yet_implemented) { + // RISCV capstone has no undefined instruction opcode like aarch64, use a + // currently unsupported instruction instead + RUN_RISCV(R"( + ebreak + )"); + const char err[] = + "\n[SimEng:ExceptionHandler] Encountered execution not-yet-implemented " + "exception"; + EXPECT_EQ(stdout_.substr(0, strlen(err)), err); +} + +// Test for InstructionException::AliasNotYetImplemented omitted. Obtaining an +// instruction encoding that will consistently trigger an AliasNotYetImplemented +// exception is not feasible due to the continual updates to our alias reversion +// support and the difficulty of generating the bytes for an instruction alias +// not yet supported. + +// Test for InstructionException::MisalignedPC omitted. As defined by the ISA, +// RISCV implemented branch instructions use branch offsets that are restricted +// to be multiples of 2. Therefore, it is currently not possible to trigger a +// MisalignedPC exception. 
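Should any of the omitted cases above become triggerable, the existing tests in this file already establish the harness pattern to reuse. Below is a minimal sketch of that pattern; the instruction sequence and the expected message text are placeholders for illustration only, not something this patch asserts.

```cpp
// Illustrative only: the shape shared by the exception tests in this file,
// shown for a hypothetical future exception case.
TEST_P(Exception, hypothetical_future_exception) {
  RUN_RISCV(R"(
    # Placeholder assembly: substitute whatever raises the exception under test
    ebreak
  )");
  // Expected console prefix; the exception name here is a placeholder.
  const char err[] =
      "\n[SimEng:ExceptionHandler] Encountered <expected exception name> "
      "exception";
  // Only the prefix of SimEng's console output is compared, so trailing
  // per-run details (addresses, opcode IDs) do not affect the check.
  EXPECT_EQ(stdout_.substr(0, strlen(err)), err);
}
```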
+ +// Test that trying to load data from an address outside the bounds of the +// process image raises a data abort exception +TEST_P(Exception, data_abort) { + RUN_RISCV(R"( + li a0, 10000 + mul a0, a0, a0 + ld t0, 0(a0) + )"); + const char err[] = + "\n[SimEng:ExceptionHandler] Encountered data abort exception"; + EXPECT_EQ(stdout_.substr(0, strlen(err)), err); +} + +// Test that an unsupported SVC call raises an exception +TEST_P(Exception, unsupported_svc) { + RUN_RISCV(R"( + li a7, 3 + ecall + )"); + + // EQ comparison on the full exception output to ensure the correct system + // call ID of 3 is printed + int ecallOpcodeId = Opcode::RISCV_ECALL; + std::string err = + std::string( + "\n[SimEng:ExceptionHandler] Encountered supervisor call " + "exception\n[SimEng:ExceptionHandler] Generated by " + "instruction: \n[SimEng:ExceptionHandler] 0x0000000000000004: 73 " + "00 00 00 ecall \n[SimEng:ExceptionHandler] opcode ID: ") + + std::to_string(ecallOpcodeId) + + std::string("\n\n[SimEng:ExceptionHandler] Unrecognised syscall: 3"); + EXPECT_EQ(stdout_.substr(0, err.size()), err.c_str()); +} + +// TODO: Write test for InstructionException::HypervisorCall once it has a +// trigger case +// TODO: Write test for InstructionException::SecureMonitorCall once it has a +// trigger case + +// Test that trying to process an instruction with no supporting issue port +// raises a no available port exception +TEST_P(Exception, no_available_port) { + RUN_RISCV(R"( + fld ft0, 0(a0) + fld ft1, 8(a0) + + fadd.d ft4, ft0, ft1 + )"); + std::string err; + // Exception raised on outoforder core archetype only + if (std::get<0>(GetParam()) == OUTOFORDER) { + err = + "\n[SimEng:ExceptionHandler] Encountered unsupported execution " + "port exception"; + } else { + // Placeholder string for non-outoforder core to be replaced when + // appropriate. 
Ensures changes to this test case won't be forgotten if + // updates to other core archetypes are carried out such that they can now + // raise an InstructionException::NoAvailablePort exception + err = + "\n[SimEng:ExceptionHandler] Encountered unallocated instruction " + "encoding exception"; + } + EXPECT_EQ(stdout_.substr(0, err.size()), err.c_str()); +} + +// TODO: Write test for InstructionException::IllegalInstruction +// TODO: Write test for an successful InstructionException::PipelineFlush +// TODO: Write test for errored InstructionException::PipelineFlush once it has +// a trigger case + +INSTANTIATE_TEST_SUITE_P( + RISCV, Exception, + ::testing::Values( + std::make_tuple(EMULATION, "{}"), std::make_tuple(INORDER, "{}"), + std::make_tuple( + OUTOFORDER, + "{Ports: {'0': {Portname: 0, Instruction-Group-Support: [INT, " + "LOAD, STORE, BRANCH]}}}")), + paramToString); + +} // namespace diff --git a/test/regression/riscv/InorderPipeline.cc b/test/regression/riscv/InorderPipeline.cc index f3c8c8fb00..aa54e3ce6a 100644 --- a/test/regression/riscv/InorderPipeline.cc +++ b/test/regression/riscv/InorderPipeline.cc @@ -31,8 +31,7 @@ TEST_P(inorderPipeline, prematureMulticycleHalting) { } INSTANTIATE_TEST_SUITE_P(RISCV, inorderPipeline, - ::testing::Values(std::make_tuple(INORDER, - YAML::Load("{}"))), + ::testing::Values(std::make_tuple(INORDER, "{}")), paramToString); } // namespace diff --git a/test/regression/riscv/LoadStoreQueue.cc b/test/regression/riscv/LoadStoreQueue.cc index 4f84f2c143..d502fec699 100644 --- a/test/regression/riscv/LoadStoreQueue.cc +++ b/test/regression/riscv/LoadStoreQueue.cc @@ -22,7 +22,7 @@ TEST_P(LoadStoreQueue, RAW) { EXPECT_EQ(getGeneralRegister(7), 42u); } -// Test multiple simulteneous RAW violations are flushed correctly. +// Test multiple simultaneous RAW violations are flushed correctly. 
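For context on what the RAWx2 test below exercises: a read-after-write (RAW) memory-order violation occurs when a speculatively executed load reads an address before an older store to that address has written it, so the load holds stale data and the pipeline must be flushed and replayed. The following is a schematic sketch of that detection logic under simplifying assumptions (exact address match, flush from a single point); it is not SimEng's actual LoadStoreQueue implementation.

```cpp
// Schematic RAW-violation check for a load/store queue (illustrative only).
#include <cstdint>
#include <vector>

struct MemOp {
  uint64_t seqId;    // position in program order
  uint64_t address;  // effective address
  bool isLoad;
};

// When a store commits, any younger load to the same address that has already
// executed read stale data; the pipeline must flush from the oldest such load.
bool mustFlush(const MemOp& committingStore,
               const std::vector<MemOp>& completedLoads,
               uint64_t& flushFromSeqId) {
  bool violation = false;
  for (const auto& load : completedLoads) {
    if (load.isLoad && load.seqId > committingStore.seqId &&
        load.address == committingStore.address) {
      if (!violation || load.seqId < flushFromSeqId) {
        flushFromSeqId = load.seqId;
      }
      violation = true;
    }
  }
  return violation;
}
```

Flushing from the oldest violating load covers several simultaneous violations with a single flush, which is the behaviour the test checks.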
TEST_P(LoadStoreQueue, RAWx2) { initialHeapData_.resize(8); reinterpret_cast(initialHeapData_.data())[0] = -1; @@ -99,9 +99,11 @@ TEST_P(LoadStoreQueue, SpeculativeInvalidLoad) { INSTANTIATE_TEST_SUITE_P( RISCV, LoadStoreQueue, - ::testing::Values(std::make_tuple(EMULATION, YAML::Load("{}")), - std::make_tuple(INORDER, YAML::Load("{}")), - std::make_tuple(OUTOFORDER, YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}"), + std::make_tuple(INORDER, "{}"), + std::make_tuple(OUTOFORDER, + "{L1-Data-Memory: " + "{Interface-Type: Fixed}}")), paramToString); } // namespace \ No newline at end of file diff --git a/test/regression/riscv/RISCVRegressionTest.cc b/test/regression/riscv/RISCVRegressionTest.cc index 8e3527e8b5..d96dab9bf5 100644 --- a/test/regression/riscv/RISCVRegressionTest.cc +++ b/test/regression/riscv/RISCVRegressionTest.cc @@ -5,47 +5,67 @@ using namespace simeng::arch::riscv; -void RISCVRegressionTest::run(const char* source) { - // Initialise LLVM - LLVMInitializeRISCVTargetInfo(); - LLVMInitializeRISCVTargetMC(); - LLVMInitializeRISCVAsmParser(); +void RISCVRegressionTest::run(const char* source, bool compressed) { + initialiseLLVM(); + std::string subtargetFeatures = getSubtargetFeaturesString(compressed); - RegressionTest::run(source, "riscv64", "+m,+a,+f,+d"); + RegressionTest::run(source, "riscv64", subtargetFeatures.c_str()); } -// TODO create yaml -YAML::Node RISCVRegressionTest::generateConfig() const { - YAML::Node config = YAML::Load(RISCV_CONFIG); + +void RISCVRegressionTest::checkGroup( + const char* source, const std::vector& expectedGroups, + bool compressed) { + initialiseLLVM(); + std::string subtargetFeatures = getSubtargetFeaturesString(compressed); + + RegressionTest::checkGroup(source, "riscv64", subtargetFeatures.c_str(), + expectedGroups); +} + +void RISCVRegressionTest::generateConfig() const { + // Re-generate the default config for the rv64 ISA + simeng::config::SimInfo::generateDefault(simeng::config::ISA::RV64, true); + + // Add the base additional RISCV test suite config options + simeng::config::SimInfo::addToConfig(RISCV_ADDITIONAL_CONFIG); + std::string mode; switch (std::get<0>(GetParam())) { case EMULATION: - config["Core"]["Simulation-Mode"] = "emulation"; + mode = "emulation"; break; case INORDER: - config["Core"]["Simulation-Mode"] = "inorderpipeline"; + mode = "inorderpipelined"; break; case OUTOFORDER: - config["Core"]["Simulation-Mode"] = "outoforder"; + mode = "outoforder"; break; } - return config; + + simeng::config::SimInfo::addToConfig("{Core: {Simulation-Mode: " + mode + + "}}"); + + // Add the test specific config options + simeng::config::SimInfo::addToConfig(std::get<1>(GetParam())); } std::unique_ptr -RISCVRegressionTest::createArchitecture(simeng::kernel::Linux& kernel, - YAML::Node config) const { - return std::make_unique(kernel, config); +RISCVRegressionTest::instantiateArchitecture( + simeng::kernel::Linux& kernel) const { + return std::make_unique(kernel); } std::unique_ptr -RISCVRegressionTest::createPortAllocator() const { - // TODO: this is currently tightly coupled to the number of execution units, - // which is specified in the out-of-order core model - const std::vector> portArrangement = { - {simeng::arch::riscv::InstructionGroups::INT, - simeng::arch::riscv::InstructionGroups::BRANCH, - simeng::arch::riscv::InstructionGroups::LOAD, - simeng::arch::riscv::InstructionGroups::STORE}}; - +RISCVRegressionTest::createPortAllocator(ryml::ConstNodeRef config) const { + // Extract the port arrangement from 
the config file + std::vector> portArrangement( + config["Ports"].num_children()); + for (size_t i = 0; i < config["Ports"].num_children(); i++) { + auto config_groups = config["Ports"][i]["Instruction-Group-Support-Nums"]; + // Read groups in associated port + for (size_t j = 0; j < config_groups.num_children(); j++) { + portArrangement[i].push_back(config_groups[j].as()); + } + } return std::make_unique( portArrangement); } diff --git a/test/regression/riscv/RISCVRegressionTest.hh b/test/regression/riscv/RISCVRegressionTest.hh index 2dca8b073e..a99abd5c54 100644 --- a/test/regression/riscv/RISCVRegressionTest.hh +++ b/test/regression/riscv/RISCVRegressionTest.hh @@ -4,31 +4,36 @@ #include "simeng/arch/riscv/Architecture.hh" #include "simeng/arch/riscv/Instruction.hh" -#define RISCV_CONFIG \ - ("{Core: {ISA: rv64, Simulation-Mode: emulation, Clock-Frequency: 2.5}, " \ - "Fetch: {Fetch-Block-Size: 32, Loop-Buffer-Size: 64, " \ - "Loop-Detection-Threshold: 4}, Process-Image: {Heap-Size: 100000, " \ - "Stack-Size: 100000}, Register-Set: {GeneralPurpose-Count: 154, " \ - "FloatingPoint-Count: 90}, Pipeline-Widths: {Commit: 4, Dispatch-Rate: 4, " \ - "FrontEnd: 4, LSQ-Completion: 2}, Queue-Sizes: {ROB: 180, Load: 64, " \ - "Store: 36}, Branch-Predictor: {BTB-Tag-Bits: 11, Saturating-Count-Bits: " \ - "2, Global-History-Length: 10, RAS-entries: 5, Fallback-Static-Predictor: " \ - "2}, L1-Data-Memory: {Interface-Type: Fixed}, L1-Instruction-Memory: " \ - "{Interface-Type: Flat}, LSQ-L1-Interface: {Access-Latency: 4, Exclusive: " \ - "False, Load-Bandwidth: 32, Store-Bandwidth: 16, " \ - "Permitted-Requests-Per-Cycle: 2, Permitted-Loads-Per-Cycle: 2, " \ - "Permitted-Stores-Per-Cycle: 1}, Ports: {'0': {Portname: Port 0, " \ - "Instruction-Group-Support: [0, 10, 11, 12 ]}}, Reservation-Stations: " \ - "{'0': {Size: 60, Dispatch-Rate: 4, Ports: [0]}}, Execution-Units: " \ - "{'0': {Pipelined: true}}, Latencies: {'0': {Instruction-Group: {0: '7'}, " \ - "Execution-Latency: 39, Execution-Throughput: 39}}}") +[[maybe_unused]] static const char* RISCV_ADDITIONAL_CONFIG = R"YAML( +{ + Core: + { + Clock-Frequency-GHz: 2.5, + }, + Register-Set: + { + GeneralPurpose-Count: 154, + FloatingPoint-Count: 90, + }, + L1-Data-Memory: + { + Interface-Type: Flat, + }, + L1-Instruction-Memory: + { + Interface-Type: Flat, + }, + Ports: + { + '0': { Portname: 0, Instruction-Group-Support: [INT, FLOAT, LOAD, STORE, BRANCH] }, + }, +} +)YAML"; /** A helper function to convert the supplied parameters of * INSTANTIATE_TEST_SUITE_P into test name. */ inline std::string paramToString( - const testing::TestParamInfo> val) { - YAML::Node config = YAML::Load(RISCV_CONFIG); - + const testing::TestParamInfo> val) { // Get core type as string std::string coreString = ""; switch (std::get<0>(val.param)) { @@ -48,32 +53,101 @@ inline std::string paramToString( return coreString; } -/** A helper macro to run a snippet of RISCV assembly code, returning from +/** A helper macro to run a snippet of RISC-V assembly code, returning from * the calling function if a fatal error occurs. Four bytes containing zeros * are appended to the source to ensure that the program will terminate with - * an illegal instruction exception instead of running into the heap. */ + * an unallocated instruction encoding exception instead of running into the + * heap. 
*/ #define RUN_RISCV(source) \ { \ std::string sourceWithTerminator = source; \ sourceWithTerminator += "\n.word 0"; \ - run(sourceWithTerminator.c_str()); \ + run(sourceWithTerminator.c_str(), false); \ } \ if (HasFatalFailure()) return -/** The test fixture for all RISCV regression tests. */ +/** A helper macro to run a snippet of RISC-V assembly code, returning from + * the calling function if a fatal error occurs. Four bytes containing zeros + * are appended to the source to ensure that the program will terminate with + * an illegal instruction exception instead of running into the heap. This + * specifically targets the compressed extension allowing for the RUN_RISCV + * macro to ignore it, otherwise LLVM eagerly emits compressed instructions for + * non-compressed assembly. */ +#define RUN_RISCV_COMP(source) \ + { \ + std::string sourceWithTerminator = source; \ + sourceWithTerminator += "\n.word 0"; \ + run(sourceWithTerminator.c_str(), true); \ + } \ + if (HasFatalFailure()) return + +/** A helper macro to predecode the first instruction in a snippet of RISC-V + * assembly code and check the assigned group(s) for each micro-op matches the + * expected group(s). Returns from the calling function if a fatal error occurs. + * Four bytes containing zeros are appended to the source to ensure that the + * program will terminate with an unallocated instruction encoding exception + * instead of running into the heap. + */ +#define EXPECT_GROUP(source, ...) \ + { \ + std::string sourceWithTerminator = source; \ + sourceWithTerminator += "\n.word 0"; \ + checkGroup(sourceWithTerminator.c_str(), {__VA_ARGS__}, false); \ + } \ + if (HasFatalFailure()) return + +/** A helper macro to predecode the first instruction in a snippet of RISC-V + * assembly code and check the assigned group(s) for each micro-op matches the + * expected group(s). Returns from the calling function if a fatal error occurs. + * Four bytes containing zeros are appended to the source to ensure that the + * program will terminate with an unallocated instruction encoding exception + * instead of running into the heap. This specifically targets the compressed + * extension allowing for the EXPECT_GROUP macro to ignore it, otherwise LLVM + * eagerly emits compressed instructions for non-compressed assembly. */ +#define EXPECT_GROUP_COMP(source, ...) \ + { \ + std::string sourceWithTerminator = source; \ + sourceWithTerminator += "\n.word 0"; \ + checkGroup(sourceWithTerminator.c_str(), {__VA_ARGS__}, true); \ + } \ + if (HasFatalFailure()) return + +/** The test fixture for all RISC-V regression tests. */ class RISCVRegressionTest : public RegressionTest { protected: virtual ~RISCVRegressionTest() {} /** Run the assembly code in `source`. */ - void run(const char* source); + void run(const char* source, bool compressed); + + /** Run the first instruction in source through predecode and check the + * groups. */ + void checkGroup(const char* source, + const std::vector& expectedGroups, bool compressed); /** Generate a default YAML-formatted configuration. */ - YAML::Node generateConfig() const override; + void generateConfig() const override; + + /** Instantiate an ISA specific architecture from a kernel. */ + virtual std::unique_ptr instantiateArchitecture( + simeng::kernel::Linux& kernel) const override; - /** Create an ISA instance from a kernel. 
*/ - virtual std::unique_ptr createArchitecture( - simeng::kernel::Linux& kernel, YAML::Node config) const override; + /** Initialise LLVM */ + void initialiseLLVM() { + LLVMInitializeRISCVTargetInfo(); + LLVMInitializeRISCVTargetMC(); + LLVMInitializeRISCVAsmParser(); + } + + /** Get subtarget feature string. Use compressed instructions only if + * requested */ + std::string getSubtargetFeaturesString(bool compressed) { + std::string subtargetFeatures = "+m,+a,+f,+d"; + if (compressed) { + subtargetFeatures.append(",+c"); + } + return subtargetFeatures; + } /** Get the value of a general purpose register. */ template @@ -81,7 +155,14 @@ class RISCVRegressionTest : public RegressionTest { return getRegister({simeng::arch::riscv::RegisterType::GENERAL, tag}); } + /** Get the value of a floating point register. */ + template + T getFPRegister(uint8_t tag) const { + return getRegister({simeng::arch::riscv::RegisterType::FLOAT, tag}); + } + /** Create a port allocator for an out-of-order core model. */ - virtual std::unique_ptr createPortAllocator() - const override; + virtual std::unique_ptr createPortAllocator( + ryml::ConstNodeRef config = + simeng::config::SimInfo::getConfig()) const override; }; \ No newline at end of file diff --git a/test/regression/riscv/SmokeTest.cc b/test/regression/riscv/SmokeTest.cc index a1b7ffd361..c990094a9e 100644 --- a/test/regression/riscv/SmokeTest.cc +++ b/test/regression/riscv/SmokeTest.cc @@ -3,6 +3,7 @@ namespace { using SmokeTest = RISCVRegressionTest; +using namespace simeng::arch::riscv::InstructionGroups; // Test that a trivial instruction will execute TEST_P(SmokeTest, instruction) { @@ -10,13 +11,16 @@ TEST_P(SmokeTest, instruction) { addi a5,a5,32 )"); EXPECT_EQ(getGeneralRegister(15), 32u); + EXPECT_GROUP(R"(addi a5,a5,32)", INT_SIMPLE_ARTH); } INSTANTIATE_TEST_SUITE_P( RISCV, SmokeTest, - ::testing::Values(std::make_tuple(EMULATION, YAML::Load("{}")), - std::make_tuple(INORDER, YAML::Load("{}")), - std::make_tuple(OUTOFORDER, YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}"), + std::make_tuple(INORDER, "{}"), + std::make_tuple(OUTOFORDER, + "{L1-Data-Memory: " + "{Interface-Type: Fixed}}")), paramToString); } // namespace diff --git a/test/regression/riscv/Syscall.cc b/test/regression/riscv/Syscall.cc index fc324cdddf..c40fc5754a 100644 --- a/test/regression/riscv/Syscall.cc +++ b/test/regression/riscv/Syscall.cc @@ -1,7 +1,9 @@ -#include #include +#include +#include +#include +#include -#include #include #include #include @@ -15,67 +17,6 @@ using Syscall = RISCVRegressionTest; /** The maximum size of a filesystem path. */ static const size_t LINUX_PATH_MAX = 4096; -TEST_P(Syscall, getrandom) { - initialHeapData_.resize(24); - memset(initialHeapData_.data(), -1, 16); - - RUN_RISCV(R"( - # Get heap address - li a0, 0 - li a7, 214 - ecall - - # store inital heap address - mv t0, a0 - - # Save 8 random bytes to the heap - # getrandom(buf * = [a], buflen = 8, no flags) - li a1, 8 - li a7, 278 - ecall - - # Save another 8 random bytes to the heap - # getrandom(buf * = [a], buflen = 8, no flags) - addi a0, t0, 8 - li a1, 8 - li a7, 278 - ecall - )"); - - // Check getrandom returned 8 (8 bytes were requested) - EXPECT_EQ(getGeneralRegister(10), 8); - - int heapStart = getGeneralRegister(5); - for (size_t i = 0; i < 8; i++) { - printf("compare %x == %x\n", getMemoryValue(heapStart + i), - getMemoryValue(heapStart + 8 + i)); - } - - // check that the returned bytes aren't all equal to -1. 
- // heap was initialised to -1 so check bytes have changed - bool allUnchanged = true; - for (size_t i = 0; i < 16; i++) { - if (getMemoryValue(heapStart + i) != 0xFF) { - allUnchanged = false; - break; - } - } - EXPECT_EQ(allUnchanged, false); - - // Check that the returned bytes from the two syscalls dont all match. - // If they do then the returned bytes surely werent random - bool allMatch = true; - for (char i = 0; i < 8; i++) { - if (getMemoryValue(heapStart + i) != - getMemoryValue(heapStart + 8 + i)) { - allMatch = false; - break; - } - } - - EXPECT_EQ(allMatch, false); -} - TEST_P(Syscall, ioctl) { // TIOCGWINSZ: test it returns zero and sets the output to anything initialHeapData_.resize(8); @@ -86,20 +27,80 @@ TEST_P(Syscall, ioctl) { li a7, 214 ecall - # ioctl(fd=1, request=0x5413, argp=a0) + # ioctl(fd=1, request=TIOCGWINSZ, argp=a0) mv a2, a0 li a1, 0x5413 li a0, 1 li a7, 29 ecall )"); - EXPECT_EQ(getGeneralRegister(0), 0); + EXPECT_EQ(getGeneralRegister(10), 0); + // Winsize changes between inside and outside of RUN_RISCV statement hence + // we cannot reliably test against a known value EXPECT_NE(getMemoryValue(process_->getHeapStart() + 0), -1); EXPECT_NE(getMemoryValue(process_->getHeapStart() + 2), -1); EXPECT_NE(getMemoryValue(process_->getHeapStart() + 4), -1); EXPECT_NE(getMemoryValue(process_->getHeapStart() + 6), -1); } +TEST_P(Syscall, ftruncate) { + const char filepath[] = SIMENG_RISCV_TEST_ROOT "/data/truncate-test.txt"; + + // Copy filepath to heap + initialHeapData_.resize(strlen(filepath) + 1); + memcpy(initialHeapData_.data(), filepath, strlen(filepath) + 1); + + RUN_RISCV(R"( + # Get heap address + li a0, 0 + li a7, 214 + ecall + mv t0, a0 + + # = openat(AT_FDCWD, filepath, O_WRONLY, S_IRUSR) + mv a1, t0 + li a0, -100 + li a2, 0x0001 + li a3, 400 + li a7, 56 + ecall + mv t1, a0 + + # ftruncate(fd, length) - increase length of file + mv a0, t1 + li a1, 100 + li a7, 46 + ecall + mv t2, a0 + + # ftruncate(fd, length) - decrease length of file + mv a0, t1 + li a1, 10 + li a7, 46 + ecall + mv t3, a0 + + # close(fd) + mv a0, t1 + li a7, 57 + ecall + )"); + // Check returned 0 + EXPECT_EQ(getGeneralRegister(10), 0); + EXPECT_EQ(getGeneralRegister(7), 0); + EXPECT_EQ(getGeneralRegister(28), 0); + // Check file has been truncated + std::ifstream truncatedFileI(filepath); + std::string fileContents; + getline(truncatedFileI, fileContents); + truncatedFileI.close(); + EXPECT_EQ(fileContents, "This is a "); + // Reset file + std::ofstream truncatedFileO(filepath); + truncatedFileO << "This is a test file for the ftruncate syscall"; + truncatedFileO.close(); +} + TEST_P(Syscall, faccessat) { const char filepath[] = "./tempFile.txt"; initialHeapData_.resize(strlen(filepath) + 1); @@ -185,7 +186,14 @@ TEST_P(Syscall, faccessat) { unlink(filepath); char abs_filepath[LINUX_PATH_MAX]; - realpath(SIMENG_RISCV_TEST_ROOT "/data/input.txt", abs_filepath); + + if (!realpath(SIMENG_RISCV_TEST_ROOT "/data/input.txt", abs_filepath)) { + // Something went wrong + std::cerr << "[SimEng:syscall] realpath failed with errno = " << errno + << std::endl; + exit(EXIT_FAILURE); + } + initialHeapData_.resize(strlen(abs_filepath) + 1); // Copy abs_filepath to heap memcpy(initialHeapData_.data(), abs_filepath, strlen(abs_filepath) + 1); @@ -211,7 +219,13 @@ TEST_P(Syscall, faccessat) { // Check syscall works using dirfd instead of AT_FDCWD const char file[] = "input.txt\0"; char dirPath[LINUX_PATH_MAX]; - realpath(SIMENG_RISCV_TEST_ROOT "/data/\0", dirPath); + + if 
(!realpath(SIMENG_RISCV_TEST_ROOT "/data/\0", dirPath)) { + // Something went wrong + std::cerr << "[SimEng:syscall] realpath failed with errno = " << errno + << std::endl; + exit(EXIT_FAILURE); + } initialHeapData_.resize(strlen(dirPath) + strlen(file) + 2); // Copy dirPath to heap @@ -286,7 +300,66 @@ TEST_P(Syscall, getdents64) { EXPECT_EQ(getGeneralRegister(7), 120); } -// Test reading from and seeking through a file +TEST_P(Syscall, lseek) { + const char filepath[] = SIMENG_RISCV_TEST_ROOT "/data/input.txt"; + + // Copy filepath to heap + initialHeapData_.resize(strlen(filepath) + 1); + memcpy(initialHeapData_.data(), filepath, strlen(filepath) + 1); + + RUN_RISCV(R"( + # Get heap address + li a0, 0 + li a7, 214 + ecall + mv t0, a0 + + # = openat(AT_FDCWD, filepath, O_WRONLY, S_IRUSR) + li a0, -100 + mv a1, t0 + li a2, 0x0000 + li a3, 400 + li a7, 56 + ecall + mv t1, a0 + + # lseek(fd=, offset=8, whence=SEEK_SET) - seek to offset + mv a0, t1 + li a1, 8 + li a2, 0 + li a7, 62 + ecall + mv t2, a0 + + # lseek(fd=, offset=8, whence=SEEK_CUR) - seek to current location plus offset + mv a0, t1 + li a1, 8 + li a2, 1 + li a7, 62 + ecall + mv t3, a0 + + # lseek(fd=, offset=8, whence=SEEK_END) - seek to the size of the file plus offset + mv a0, t1 + li a1, 8 + li a2, 2 + li a7, 62 + ecall + mv t4, a0 + + # close(fd) + mv a0, t1 + li a7, 57 + ecall + )"); + + EXPECT_EQ(getGeneralRegister(7), 8); + EXPECT_EQ(getGeneralRegister(28), 16); + EXPECT_EQ(getGeneralRegister(29), 35); +} + +// Test reading from and seeking through a file (tests openat, readv, read, and +// lseek syscalls) TEST_P(Syscall, file_read) { const char filepath[] = SIMENG_RISCV_TEST_ROOT "/data/input.txt"; @@ -354,6 +427,21 @@ TEST_P(Syscall, file_read) { sub a1, sp, t4 li a2, 2 li a7, 65 + ecall + + # lseek(fd=, offset=0, whence=SEEK_SET) + mv a0, t1 + li a1, 0 + li a2, 0 + li a7, 62 + ecall + + # read(fd=, buf=sp, count=26) + mv a0, t1 + li t5, 64 + sub a1, sp, t5 + li a2, 26 + li a7, 63 ecall # close(fd=) @@ -362,14 +450,23 @@ TEST_P(Syscall, file_read) { ecall )"); - // Check result of read operations - const char reference[] = "ABCD\0UV\0EFGH\0\0\0\0MNOPQRST"; - char* data = processMemory_ + process_->getHeapStart(); - for (int i = 0; i < sizeof(reference); i++) { - EXPECT_EQ(data[i], reference[i]) << "at index i=" << i << '\n'; + // Check result of readv operations + const char refReadv[] = "ABCD\0UV\0EFGH\0\0\0\0MNOPQRST"; + char* dataReadv = processMemory_ + process_->getHeapStart(); + for (size_t i = 0; i < strlen(refReadv); i++) { + EXPECT_EQ(dataReadv[i], refReadv[i]) << "at index i=" << i << '\n'; + } + + // Check result of read operation + const char refRead[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + char* dataRead = processMemory_ + process_->getInitialStackPointer() - 64; + for (size_t i = 0; i < strlen(refRead); i++) { + EXPECT_EQ(dataRead[i], refRead[i]) << "at index i=" << i << '\n'; } } +// Test reading from and seeking through a file (tests openat, writev, and write +// syscalls) TEST_P(Syscall, file_write) { const char str[] = "Hello, World!\n"; const char filepath[] = "./simeng-fileio-test.txt"; @@ -420,6 +517,13 @@ TEST_P(Syscall, file_write) { li a7, 66 ecall + # write(fd=, buf=a1, count=14) + mv a0, t1 + mv a1, t0 + li a2, 14 + li a7, 64 + ecall + # close(fd=) mv a0, t1 li a7, 57 @@ -430,11 +534,50 @@ TEST_P(Syscall, file_write) { char outdata[15]; std::ifstream outfile(filepath); ASSERT_TRUE(outfile.good()); + outfile.read(outdata, 14); + EXPECT_FALSE(outfile.eof()); + EXPECT_EQ(strncmp(str, outdata, 14), 0); 
outfile.read(outdata, 15); EXPECT_TRUE(outfile.eof()); EXPECT_EQ(strncmp(str, outdata, 14), 0); } +// Tests that writing to the standard out file descriptor functions correctly +TEST_P(Syscall, stdout) { + const char str[] = "Hello, World!\n"; + for (char c : str) { + initialHeapData_.push_back(c); + } + RUN_RISCV(R"( + # load temporary for subtracts + li t6, 32 + + # Get heap address + li a0, 0 + li a7, 214 + ecall + + # iovec = {{a0, 10}, {a0+10, 4}} + sd a0, -32(sp) + li a1, 10 + sd a1, -24(sp) + addi a0, a0, 10 + sd a0, -16(sp) + li a1, 4 + sd a1, -8(sp) + + # writev(fd=1, iov=iovec, iovcnt=2) + li a0, 1 + sub a1, sp, t6 + li a2, 2 + li a7, 66 + ecall + )"); + EXPECT_EQ(stdout_.substr(0, strlen(str)), str); + EXPECT_EQ(getGeneralRegister(10), strlen(str)); +} + +// Tests that an openat syscall on a non-existent file returns an error value TEST_P(Syscall, filenotfound) { // Copy filepath to heap const char filepath[] = "./nonexistent-file"; @@ -457,258 +600,51 @@ TEST_P(Syscall, filenotfound) { )"); // Check return value is -1 - EXPECT_EQ(getGeneralRegister(10), -1); + EXPECT_EQ(getGeneralRegister(10), -1); } -TEST_P(Syscall, mmap) { - // Test for 3 consecutive allocations +// Test that readlinkat works for supported cases +TEST_P(Syscall, readlinkat) { + const char path[] = "/proc/self/exe"; + + std::string reference = + SIMENG_SOURCE_DIR + std::string("/SimEngDefaultProgram"); + // Copy path to heap + initialHeapData_.resize(strlen(path) + reference.size() + 1); + memcpy(initialHeapData_.data(), path, strlen(path) + 1); + RUN_RISCV(R"( - # mmap(addr=NULL, length=65536, prot=3, flags=34, fd=-1, offset=0) + # Get heap address li a0, 0 - li a1, 65536 - li a2, 3 - li a3, 34 - li a4, -1 - li a5, 0 - li a7, 222 + li a7, 214 ecall mv t0, a0 - # mmap(addr=NULL, length=1024, prot=3, flags=34, fd=-1, offset=0) - li a0, 0 - li a1, 1024 - li a2, 3 - li a3, 34 - li a4, -1 - li a5, 0 - li a7, 222 - ecall - mv t1, a0 - - # mmap(addr=NULL, length=16384, prot=3, flags=34, fd=-1, offset=0) + # readlinkat(dirfd=0, pathname=t0, buf=x20+15, bufsize=1024) li a0, 0 - li a1, 16384 - li a2, 3 - li a3, 34 - li a4, -1 - li a5, 0 - li a7, 222 + mv a1, t0 + add a2, t0, 15 + li a3, 1024 + li a7, 78 ecall - mv t2, a0 )"); - EXPECT_EQ(getGeneralRegister(5), process_->getMmapStart()); - EXPECT_EQ(getGeneralRegister(6), process_->getMmapStart() + 65536); - EXPECT_EQ(getGeneralRegister(7), process_->getMmapStart() + 69632); - // Test for mmap allocation between two previous allocations + EXPECT_EQ(getGeneralRegister(10), reference.size()); + char* data = processMemory_ + process_->getHeapStart() + 15; + for (size_t i = 0; i < reference.size(); i++) { + EXPECT_EQ(data[i], reference.c_str()[i]) << "at index i=" << i << '\n'; + } +} + +TEST_P(Syscall, newfstatat) { + const char filepath[] = SIMENG_RISCV_TEST_ROOT "/data/input.txt"; + // Reserve 128 bytes for statbuf + initialHeapData_.resize(128 + strlen(filepath) + 1); + // Copy filepath to heap + memcpy(initialHeapData_.data() + 128, filepath, strlen(filepath) + 1); + RUN_RISCV(R"( - # Setup 3 contiguous allocations - # mmap(addr=NULL, length=1024, prot=3, flags=34, fd=-1, offset=0) - li a0, 0 - li a1, 1024 - li a2, 3 - li a3, 34 - li a4, -1 - li a5, 0 - li a7, 222 - ecall - mv t0, a0 - - # mmap(addr=NULL, length=12288, prot=3, flags=34, fd=-1, offset=0) - li a0, 0 - li a1, 12288 - li a2, 3 - li a3, 34 - li a4, -1 - li a5, 0 - li a7, 222 - ecall - mv t1, a0 - - # mmap(addr=NULL, length=1024, prot=3, flags=34, fd=-1, offset=0) - li a0, 0 - li a1, 1024 - li a2, 3 - li a3, 
34 - li a4, -1 - li a5, 0 - li a7, 222 - ecall - mv t2, a0 - - # unmap second allocation to create an empty space between allocations - # munmap(addr=t1, length=12288, prot=3, flags=34, fd=-1, offset=0) - mv a0, t1 - li a1, 12288 - li a7, 215 - ecall - mv t3, a0 - - # Allocate a region larger than the new empty space - # mmap(addr=NULL, length=16384, prot=3, flags=34, fd=-1, offset=0) - li a0, 0 - li a1, 16384 - li a2, 3 - li a3, 34 - li a4, -1 - li a5, 0 - li a7, 222 - ecall - mv t4, a0 - - # Two allocations whose combined length equals the new empty space - # mmap(addr=NULL, length=4096, prot=3, flags=34, fd=-1, offset=0) - li a0, 0 - li a1, 4096 - li a2, 3 - li a3, 34 - li a4, -1 - li a5, 0 - li a7, 222 - ecall - mv t5, a0 - - # mmap(addr=NULL, length=8192, prot=3, flags=34, fd=-1, offset=0) - li a0, 0 - li a1, 8192 - li a2, 3 - li a3, 34 - li a4, -1 - li a5, 0 - li a7, 222 - ecall - mv t6, a0 - )"); - EXPECT_EQ(getGeneralRegister(5), process_->getMmapStart()); - EXPECT_EQ(getGeneralRegister(6), process_->getMmapStart() + 4096); - EXPECT_EQ(getGeneralRegister(7), process_->getMmapStart() + 16384); - EXPECT_EQ(getGeneralRegister(28), 0); - EXPECT_EQ(getGeneralRegister(29), process_->getMmapStart() + 20480); - EXPECT_EQ(getGeneralRegister(30), process_->getMmapStart() + 4096); - EXPECT_EQ(getGeneralRegister(31), process_->getMmapStart() + 8192); -} - -TEST_P(Syscall, munmap) { - // Test that no errors are given during expected usage - RUN_RISCV(R"( - # mmap(addr=NULL, length=65536, prot=3, flags=34, fd=-1, offset=0) - li a0, 0 - li a1, 65536 - li a2, 3 - li a3, 34 - li a4, -1 - li a5, 0 - li a7, 222 - ecall - mv t0, a0 - - # munmap(addr=mmapStart_, length=65536, prot=3, flags=34, fd=-1, offset=0) - mv a0, t0 - li a1, 65536 - li a7, 215 - ecall - mv t1, a0 - - # munmap(addr=mmapStart_, length=65536, prot=3, flags=34, fd=-1, offset=0) - mv a0, t0 - li a1, 65536 - li a7, 215 - ecall - mv t2, a0 - )"); - EXPECT_EQ(getGeneralRegister(5), process_->getMmapStart()); - EXPECT_EQ(getGeneralRegister(6), 0); - EXPECT_EQ(getGeneralRegister(7), 0); - - // Test that EINVAL error types trigger - RUN_RISCV(R"( - # mmap(addr=NULL, length=1024, prot=3, flags=34, fd=-1, offset=0) - li a0, 0 - li a1, 1024 - li a2, 3 - li a3, 34 - li a4, -1 - li a5, 0 - li a7, 222 - ecall - mv t0, a0 - - # munmap(addr=mmapStart_, length=65536, prot=3, flags=34, fd=-1, offset=0) - mv a0, t0 - li a1, 65536 - li a7, 215 - ecall - mv t1, a0 - - # munmap(addr=mmapStart_, length=65536, prot=3, flags=34, fd=-1, offset=0) - addi t0, t0, 1024 - mv a0, t0 - li a1, 65536 - li a7, 215 - ecall - mv t2, a0 - )"); - EXPECT_EQ(getGeneralRegister(5), process_->getMmapStart() + 1024); - EXPECT_EQ(getGeneralRegister(6), -1); - EXPECT_EQ(getGeneralRegister(7), -1); -} - -TEST_P(Syscall, stdout) { - const char str[] = "Hello, World!\n"; - for (char c : str) { - initialHeapData_.push_back(c); - } - RUN_RISCV(R"( - # load temporary for subtracts - li t6, 32 - - # Get heap address - li a0, 0 - li a7, 214 - ecall - - # iovec = {{a0, 10}, {a0+10, 4}} - sd a0, -32(sp) - li a1, 10 - sd a1, -24(sp) - addi a0, a0, 10 - sd a0, -16(sp) - li a1, 4 - sd a1, -8(sp) - - # writev(fd=1, iov=iovec, iovcnt=2) - li a0, 1 - sub a1, sp, t6 - li a2, 2 - li a7, 66 - ecall - )"); - EXPECT_EQ(stdout_.substr(0, sizeof(str) - 1), str); - EXPECT_EQ(getGeneralRegister(10), sizeof(str) - 1); -} - -TEST_P(Syscall, mprotect) { - // Check mprotect returns placeholder value as currently not implemented - RUN_RISCV(R"( - # mprotect(addr=47472, len=4096, prot=1) = 0 - li a0, 47472 - 
li a1, 4096 - li a2, 1 - li a7, 226 - ecall - )"); - EXPECT_EQ(getGeneralRegister(10), 0); -} - -TEST_P(Syscall, newfstatat) { - const char filepath[] = SIMENG_RISCV_TEST_ROOT "/data/input.txt"; - // Reserve 128 bytes for statbuf - initialHeapData_.resize(128 + strlen(filepath) + 1); - // Copy filepath to heap - memcpy(initialHeapData_.data() + 128, filepath, strlen(filepath) + 1); - - RUN_RISCV(R"( - # Get heap address + # Get heap address li a0, 0 li a7, 214 ecall @@ -723,8 +659,64 @@ TEST_P(Syscall, newfstatat) { ecall mv t1, a0 )"); + // Run fstatat syscall to define a reference + struct ::stat statbufRef; + ::fstatat(AT_FDCWD, filepath, &statbufRef, 0); + // Check fstatat returned 0 - EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(6), 0); + // Check fstatat buf matches reference + EXPECT_EQ(getMemoryValue(process_->getHeapStart()), + statbufRef.st_dev); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 8), + statbufRef.st_ino); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 16), + statbufRef.st_mode); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 20), + statbufRef.st_nlink); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 24), + statbufRef.st_uid); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 28), + statbufRef.st_gid); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 32), + statbufRef.st_rdev); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 40), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 48), + statbufRef.st_size); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 56), + statbufRef.st_blksize); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 60), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 64), + statbufRef.st_blocks); +#ifdef __MACH__ + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 72), + statbufRef.st_atimespec.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 80), + statbufRef.st_atimespec.tv_nsec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 88), + statbufRef.st_mtimespec.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 96), + statbufRef.st_mtimespec.tv_nsec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 104), + statbufRef.st_ctimespec.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 112), + statbufRef.st_ctimespec.tv_nsec); +#else + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 72), + statbufRef.st_atim.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 80), + statbufRef.st_atim.tv_nsec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 88), + statbufRef.st_mtim.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 96), + statbufRef.st_mtim.tv_nsec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 104), + statbufRef.st_ctim.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 112), + statbufRef.st_ctim.tv_nsec); +#endif + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 116), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 124), 0ull); RUN_RISCV(R"( # Get heap address @@ -743,12 +735,18 @@ TEST_P(Syscall, newfstatat) { mv t1, a0 )"); // Check fstatat returned -1 (file not found) - EXPECT_EQ(getGeneralRegister(6), -1); + EXPECT_EQ(getGeneralRegister(6), -1); // Check syscall works using dirfd instead of AT_FDCWD const char file[] = "input.txt\0"; char dirPath[LINUX_PATH_MAX]; - realpath(SIMENG_RISCV_TEST_ROOT "/data/\0", dirPath); + + if (!realpath(SIMENG_RISCV_TEST_ROOT "/data/\0", dirPath)) { + // Something went wrong + std::cerr << 
"[SimEng:syscall] realpath failed with errno = " << errno + << std::endl; + exit(EXIT_FAILURE); + } initialHeapData_.resize(128 + strlen(dirPath) + strlen(file) + 2); // Copy dirPath to heap @@ -780,65 +778,74 @@ TEST_P(Syscall, newfstatat) { li a7, 79 ecall mv t1, a0 - )"); - EXPECT_EQ(getGeneralRegister(6), 0); -} - -TEST_P(Syscall, getrusage) { - // Reserve 128 bytes for usage - initialHeapData_.resize(128); - RUN_RISCV(R"( - # Get heap address - li a0, 0 - li a7, 214 - ecall - mv t0, a0 + )"); // Run fstatat syscall to define a reference + ::fstatat(AT_FDCWD, filepath, &statbufRef, 0); - # getrusage(who = RUSAGE_SELF, usage) - li a0, 0 - mv a1, t0 - li a7, 165 - ecall - mv t1, a0 - - # getrusage(who = RUSAGE_CHILDREN, usage) - li a0, -1 - mv a1, t0 - li a7, 165 - ecall - mv t2, a0 - )"); + // Check fstatat returned 0 EXPECT_EQ(getGeneralRegister(6), 0); - EXPECT_EQ(getGeneralRegister(7), 0); - // MacOS doesn't support the final enum RUSAGE_THREAD -#ifndef __MACH__ - // Reserve 128 bytes for usage - initialHeapData_.resize(128); - RUN_RISCV(R"( - # Get heap address - li a0, 0 - li a7, 214 - ecall - mv t0, a0 - - # getrusage(who = RUSAGE_THREAD, usage) - li a0, 1 - mv a1, t0 - li a7, 165 - ecall - mv t1, a0 - )"); - EXPECT_EQ(getGeneralRegister(6), 0); + // Check fstatat buf matches reference + EXPECT_EQ(getMemoryValue(process_->getHeapStart()), + statbufRef.st_dev); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 8), + statbufRef.st_ino); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 16), + statbufRef.st_mode); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 20), + statbufRef.st_nlink); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 24), + statbufRef.st_uid); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 28), + statbufRef.st_gid); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 32), + statbufRef.st_rdev); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 40), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 48), + statbufRef.st_size); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 56), + statbufRef.st_blksize); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 60), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 64), + statbufRef.st_blocks); +#ifdef __MACH__ + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 72), + statbufRef.st_atimespec.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 80), + statbufRef.st_atimespec.tv_nsec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 88), + statbufRef.st_mtimespec.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 96), + statbufRef.st_mtimespec.tv_nsec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 104), + statbufRef.st_ctimespec.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 112), + statbufRef.st_ctimespec.tv_nsec); +#else + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 72), + statbufRef.st_atim.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 80), + statbufRef.st_atim.tv_nsec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 88), + statbufRef.st_mtim.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 96), + statbufRef.st_mtim.tv_nsec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 104), + statbufRef.st_ctim.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 112), + statbufRef.st_ctim.tv_nsec); #endif + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 116), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 124), 0ull); } 
-TEST_P(Syscall, ftruncate) { - const char filepath[] = SIMENG_RISCV_TEST_ROOT "/data/truncate-test.txt"; +TEST_P(Syscall, fstat) { + const char filepath[] = SIMENG_RISCV_TEST_ROOT "/data/input.txt"; + + // Reserve 256 bytes for fstat struct + initialHeapData_.resize(256 + strlen(filepath) + 1); // Copy filepath to heap - initialHeapData_.resize(strlen(filepath) + 1); - memcpy(initialHeapData_.data(), filepath, strlen(filepath) + 1); + memcpy(initialHeapData_.data() + 256, filepath, strlen(filepath) + 1); RUN_RISCV(R"( # Get heap address @@ -847,44 +854,781 @@ TEST_P(Syscall, ftruncate) { ecall mv t0, a0 - # = openat(AT_FDCWD, filepath, O_WRONLY, S_IRUSR) - mv a1, t0 + # = openat(AT_FDCWD, filepath, O_RDONLY, S_IRUSR) li a0, -100 - li a2, 0x0001 + add a1, t0, 256 + li a2, 0x0000 li a3, 400 li a7, 56 ecall mv t1, a0 - # ftruncate(fd, length) - increase length of file + # fstat(fd=, buf=t0) mv a0, t1 - li a1, 100 - li a7, 46 + mv a1, t0 + li a7, 80 ecall mv t2, a0 - # ftruncate(fd, length) - decrease length of file + # close(fd=) mv a0, t1 - li a1, 46 - li a7, 46 + li a7, 57 + ecall + )"); + // Run fstat syscall to define a reference + int64_t fd = ::openat(AT_FDCWD, filepath, O_RDONLY, S_IRUSR); + struct ::stat statbufRef; + ::fstat(fd, &statbufRef); + ::close(fd); + + // Check fstat returned 0 + EXPECT_EQ(getGeneralRegister(23), 0); + // Check fstat buf matches reference + EXPECT_EQ(getMemoryValue(process_->getHeapStart()), + statbufRef.st_dev); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 8), + statbufRef.st_ino); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 16), + statbufRef.st_mode); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 20), + statbufRef.st_nlink); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 24), + statbufRef.st_uid); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 28), + statbufRef.st_gid); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 32), + statbufRef.st_rdev); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 40), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 48), + statbufRef.st_size); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 56), + statbufRef.st_blksize); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 60), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 64), + statbufRef.st_blocks); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 72), + statbufRef.st_atime); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 80), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 88), + statbufRef.st_mtime); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 96), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 104), + statbufRef.st_ctime); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 112), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 116), 0ull); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 124), 0ull); +} + +TEST_P(Syscall, exit) { + RUN_RISCV(R"( + # exit(1) + li a0, 1 + li a7, 93 + ecall + )"); + // Set reference for stdout + std::string str = + "\n[SimEng:ExceptionHandler] Received exit syscall: terminating " + "with exit code 1"; + EXPECT_EQ(stdout_.substr(0, str.size()), str); +} + +TEST_P(Syscall, exit_group) { + RUN_RISCV(R"( + # exit_group(1) + li a0, 1 + li a7, 94 + ecall + )"); + // Set reference for stdout + std::string str = + "\n[SimEng:ExceptionHandler] Received exit_group syscall: terminating " + "with exit code 1"; + EXPECT_EQ(stdout_.substr(0, str.size()), str); +} + +TEST_P(Syscall, set_tid_address) { 
+ // Reserve 8 bytes for tid + initialHeapData_.resize(8); + RUN_RISCV(R"( + # Get heap address + li a0, 0 + li a7, 214 + ecall + mv t0, a0 + + # set_tid_address(tidptr=t0) + mv a0, t0 + li a7, 96 + ecall + mv t1, a0 + )"); + EXPECT_EQ(getGeneralRegister(6), 0); +} + +// TODO: write futex test +// TODO: write set_robust_list test + +TEST_P(Syscall, clock_gettime) { + // Reserve 32 bytes for time data + initialHeapData_.resize(32); + + RUN_RISCV(R"( + # Get heap address + li a0, 0 + li a7, 214 + ecall + mv t0, a0 + + # Execute loop to elapse time in core + li t3, 10000 + li t4, 1 + sub t3, t3, t4 + bne zero, t3, -4 + + # clock_gettime(clk_id=CLOCK_REALTIME, tp=t0) + li a0, 0 + mv a1, t0 + li a7, 113 + ecall + mv t1, a0 + + # Execute loop to elapse time in core + li t3, 10000 + li t4, 1 + sub t3, t3, t4 + bne zero, t3, -4 + + # clock_gettime(clk_id=CLOCK_MONOTONIC, tp=t0+16) + li a0, 1 + add a1, t0, 16 + li a7, 113 + ecall + mv t2, a0 + )"); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); + // Set time values based on core model in use + uint64_t secondsReal = 0; + uint64_t nanosecondsReal = 0; + uint64_t secondsMono = 0; + uint64_t nanosecondsMono = 0; + // Seconds will be 0 as too much host time would have to elapse in the test + // suite for 1 simulated second to elapse + if (std::get<0>(GetParam()) == EMULATION) { + nanosecondsReal = 8004; + nanosecondsMono = 16007; + } else if (std::get<0>(GetParam()) == INORDER) { + nanosecondsReal = 8006; + nanosecondsMono = 16011; + } else if (std::get<0>(GetParam()) == OUTOFORDER) { + nanosecondsReal = 8010; + nanosecondsMono = 16016; + } + + EXPECT_EQ(getMemoryValue(process_->getHeapStart()), secondsReal); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 8), + nanosecondsReal); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 16), + secondsMono); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 24), + nanosecondsMono); +} + +// TODO: tests only test errored instances of using sched_setaffinity due to +// omitted functionality. Redo test once functionality is implemented +TEST_P(Syscall, sched_setaffinity) { + RUN_RISCV(R"( + # sched_setaffinity(pid=0, cpusetsize=1, mask=0) + li a0, 0 + li a1, 1 + li a2, 0 + li a7, 122 + ecall + mv t0, a0 + + # sched_setaffinity(pid=1, cpusetsize=1, mask=1) + li a0, 1 + li a1, 1 + li a2, 1 + li a7, 122 + ecall + mv t1, a0 + + # sched_setaffinity(pid=0, cpusetsize=0, mask=1) + li a0, 0 + li a1, 0 + li a2, 1 + li a7, 122 + ecall + mv t2, a0 + + # sched_setaffinity(pid=0, cpusetsize=1, mask=1) + li a0, 0 + li a1, 1 + li a2, 1 + li a7, 122 ecall mv t3, a0 + )"); + EXPECT_EQ(getGeneralRegister(5), -EFAULT); + EXPECT_EQ(getGeneralRegister(6), -ESRCH); + EXPECT_EQ(getGeneralRegister(7), -EINVAL); + EXPECT_EQ(getGeneralRegister(28), 0); +} - # close(fd) +// TODO: tests only test errored instances of using sched_getaffinity due to +// omitted functionality. 
Redo test once functionality is implemented +TEST_P(Syscall, sched_getaffinity) { + RUN_RISCV(R"( + # schedGetAffinity(pid=0, cpusetsize=0, mask=0) + li a0, 0 + li a1, 0 + li a2, 0 + li a7, 123 + ecall + mv t0, a0 + + # sched_getaffinity(pid=1, cpusetsize=0, mask=1) + li a0, 1 + li a1, 0 + li a2, 1 + li a7, 123 + ecall + mv t1, a0 + + # sched_getaffinity(pid=0, cpusetsize=0, mask=1) + li a0, 0 + li a1, 0 + li a2, 1 + li a7, 123 + ecall + mv t2, a0 + )"); + EXPECT_EQ(getGeneralRegister(5), -1); + EXPECT_EQ(getGeneralRegister(6), -1); + EXPECT_EQ(getGeneralRegister(7), 1); +} + +// TODO: write tgkill test +// TODO: write rt_sigaction test +// TODO: write rt_sigprocmask test + +TEST_P(Syscall, uname) { + // Reserve 325 bytes for utsname struct + initialHeapData_.resize(325); + + RUN_RISCV(R"( + # Get heap address + li a0, 0 + li a7, 214 + ecall + mv t0, a0 + + # getrusage(buf=t0) + mv a0, t0 + li a7, 160 + ecall + mv t1, a0 + )"); + EXPECT_EQ(getGeneralRegister(6), 0); + + // Check utsname struct in memory + char* data = processMemory_ + process_->getHeapStart(); + const char sysname[] = "Linux"; + for (size_t i = 0; i < strlen(sysname); i++) EXPECT_EQ(data[i], sysname[i]); + + // Add 65 to data pointer for reserved length of each string field in Linux + data += 65; + const char nodename[] = "fedora-riscv"; + for (size_t i = 0; i < strlen(nodename); i++) EXPECT_EQ(data[i], nodename[i]); + + data += 65; + const char release[] = "5.5.0-0.rc5.git0.1.1.riscv64.fc32.riscv64"; + for (size_t i = 0; i < strlen(release); i++) EXPECT_EQ(data[i], release[i]); + + data += 65; + const char version[] = "#1 SMP Mon Jan 6 17:31:22 UTC 2020"; + for (size_t i = 0; i < strlen(version); i++) EXPECT_EQ(data[i], version[i]); + + data += 65; + const char machine[] = "riscv64"; + for (size_t i = 0; i < strlen(machine); i++) EXPECT_EQ(data[i], machine[i]); + + data += 65; + const char domainname[] = "(none)"; + for (size_t i = 0; i < strlen(domainname); i++) + EXPECT_EQ(data[i], domainname[i]); +} + +TEST_P(Syscall, getrusage) { + // Reserve 128 bytes for usage + initialHeapData_.resize(128); + RUN_RISCV(R"( + # Get heap address + li a0, 0 + li a7, 214 + ecall + mv t0, a0 + + # getrusage(who = RUSAGE_SELF, usage) + li a0, 0 + mv a1, t0 + li a7, 165 + ecall + mv t1, a0 + + # getrusage(who = RUSAGE_CHILDREN, usage) + li a0, -1 + mv a1, t0 + li a7, 165 + ecall + mv t2, a0 + )"); + // getrusage rusage struct values changes between inside and outside of + // RUN_RISCV statement hence we cannot reliably test against a known value. 
+ // Thus only test return value + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); + + // MacOS doesn't support the final enum RUSAGE_THREAD +#ifndef __MACH__ + // Reserve 128 bytes for usage + initialHeapData_.resize(128); + RUN_RISCV(R"( + # Get heap address + li a0, 0 + li a7, 214 + ecall + mv t0, a0 + + # getrusage(who = RUSAGE_THREAD, usage) + li a0, 1 + mv a1, t0 + li a7, 165 + ecall + mv t1, a0 + )"); + EXPECT_EQ(getGeneralRegister(6), 0); +#endif +} + +TEST_P(Syscall, gettimeofday) { + // Reserve 64 bytes for time data + initialHeapData_.resize(64); + + RUN_RISCV(R"( + # Get heap address + li a0, 0 + li a7, 214 + ecall + mv t0, a0 + + # Execute loop to elapse time in core + li t3, 10000 + li t4, 1 + sub t3, t3, t4 + bne zero, t3, -4 + + # gettimeofday(tv=t0, tz=null) + mv a0, t0 + li a1, 0 + li a7, 169 + ecall + mv t1, a0 + + # Execute loop to elapse time in core + li t3, 10000 + li t4, 1 + sub t3, t3, t4 + bne zero, t3, -4 + + # gettimeofday(tv=null, tz=t0+16) + li a0, 0 + add a1, t0, 16 + li a7, 169 + ecall + mv t2, a0 + + # Execute loop to elapse time in core + li t3, 10000 + li t4, 1 + sub t3, t3, t4 + bne zero, t3, -4 + + # gettimeofday(tv=t0+32, tz=t0+48) + add a0, t0, 32 + add a1, t0, 48 + li a7, 169 + ecall + mv t3, a0 + )"); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); + EXPECT_EQ(getGeneralRegister(28), 0); + + // Set time values based on core model in use + + // Seconds will be 0 as too much host time would have to elapse in the test + // suite for 1 simulated second to elapse + simeng::kernel::timeval tvLoop0 = {0, 8}; + // tv set to NULL here so no value change will occur + simeng::kernel::timeval tvLoop2 = {0, 24}; + // All tz values are set to 0 given values are the displacement from GMT + simeng::kernel::timeval tzLoop1 = {0, 0}; + simeng::kernel::timeval tzLoop2 = {0, 0}; + + EXPECT_EQ(getMemoryValue(process_->getHeapStart()), tvLoop0.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 8), + tvLoop0.tv_usec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 16), + tzLoop1.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 24), + tzLoop1.tv_usec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 32), + tvLoop2.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 40), + tvLoop2.tv_usec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 48), + tzLoop2.tv_sec); + EXPECT_EQ(getMemoryValue(process_->getHeapStart() + 56), + tzLoop2.tv_usec); +} + +TEST_P(Syscall, gettid) { + RUN_RISCV(R"( + # gettid() + li a7, 178 + ecall + )"); + EXPECT_EQ(getGeneralRegister(10), 0); +} + +TEST_P(Syscall, getpid) { + RUN_RISCV(R"( + # getpid() + li a7, 172 + ecall + )"); + EXPECT_EQ(getGeneralRegister(10), 0); +} + +TEST_P(Syscall, getuid) { + RUN_RISCV(R"( + # getuid() + li a7, 174 + ecall + )"); + EXPECT_EQ(getGeneralRegister(10), 0); +} + +TEST_P(Syscall, geteuid) { + RUN_RISCV(R"( + # geteuid() + li a7, 175 + ecall + )"); + EXPECT_EQ(getGeneralRegister(10), 0); +} + +TEST_P(Syscall, getgid) { + RUN_RISCV(R"( + # getgid() + li a7, 176 + ecall + )"); + EXPECT_EQ(getGeneralRegister(10), 0); +} + +TEST_P(Syscall, getegid) { + RUN_RISCV(R"( + # getegid() + li a7, 177 + ecall + )"); + EXPECT_EQ(getGeneralRegister(10), 0); +} + +// TODO: write sysinfo test +// TODO: write shutdown test + +TEST_P(Syscall, mprotect) { + // Check mprotect returns placeholder value as currently not implemented + RUN_RISCV(R"( + # mprotect(addr=47472, len=4096, prot=1) = 0 + li a0, 47472 + li a1, 4096 + 
li a2, 1 + li a7, 226 + ecall + )"); + EXPECT_EQ(getGeneralRegister(10), 0); +} + +// TODO: write mbind test +// TODO: write prlimit64 test +// TODO: write rseq test + +TEST_P(Syscall, munmap) { + // Test that no errors are given during expected usage + RUN_RISCV(R"( + # mmap(addr=NULL, length=65536, prot=3, flags=34, fd=-1, offset=0) + li a0, 0 + li a1, 65536 + li a2, 3 + li a3, 34 + li a4, -1 + li a5, 0 + li a7, 222 + ecall + mv t0, a0 + + # munmap(addr=mmapStart_, length=65536, prot=3, flags=34, fd=-1, offset=0) + mv a0, t0 + li a1, 65536 + li a7, 215 + ecall + mv t1, a0 + + # munmap(addr=mmapStart_, length=65536, prot=3, flags=34, fd=-1, offset=0) + mv a0, t0 + li a1, 65536 + li a7, 215 + ecall + mv t2, a0 + )"); + EXPECT_EQ(getGeneralRegister(5), process_->getMmapStart()); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); + + // Test that EINVAL error types trigger + RUN_RISCV(R"( + # mmap(addr=NULL, length=1024, prot=3, flags=34, fd=-1, offset=0) + li a0, 0 + li a1, 1024 + li a2, 3 + li a3, 34 + li a4, -1 + li a5, 0 + li a7, 222 + ecall + mv t0, a0 + + # munmap(addr=mmapStart_, length=65536, prot=3, flags=34, fd=-1, offset=0) + mv a0, t0 + li a1, 65536 + li a7, 215 + ecall + mv t1, a0 + + # munmap(addr=mmapStart_, length=65536, prot=3, flags=34, fd=-1, offset=0) + addi t0, t0, 1024 + mv a0, t0 + li a1, 65536 + li a7, 215 + ecall + mv t2, a0 + )"); + EXPECT_EQ(getGeneralRegister(5), process_->getMmapStart() + 1024); + EXPECT_EQ(getGeneralRegister(6), -1); + EXPECT_EQ(getGeneralRegister(7), -1); +} + +TEST_P(Syscall, mmap) { + // Test for 3 consecutive allocations + RUN_RISCV(R"( + # mmap(addr=NULL, length=65536, prot=3, flags=34, fd=-1, offset=0) + li a0, 0 + li a1, 65536 + li a2, 3 + li a3, 34 + li a4, -1 + li a5, 0 + li a7, 222 + ecall + mv t0, a0 + + # mmap(addr=NULL, length=1024, prot=3, flags=34, fd=-1, offset=0) + li a0, 0 + li a1, 1024 + li a2, 3 + li a3, 34 + li a4, -1 + li a5, 0 + li a7, 222 + ecall + mv t1, a0 + + # mmap(addr=NULL, length=16384, prot=3, flags=34, fd=-1, offset=0) + li a0, 0 + li a1, 16384 + li a2, 3 + li a3, 34 + li a4, -1 + li a5, 0 + li a7, 222 + ecall + mv t2, a0 + )"); + EXPECT_EQ(getGeneralRegister(5), process_->getMmapStart()); + EXPECT_EQ(getGeneralRegister(6), process_->getMmapStart() + 65536); + EXPECT_EQ(getGeneralRegister(7), process_->getMmapStart() + 69632); + + // Test for mmap allocation between two previous allocations + RUN_RISCV(R"( + # Setup 3 contiguous allocations + # mmap(addr=NULL, length=1024, prot=3, flags=34, fd=-1, offset=0) + li a0, 0 + li a1, 1024 + li a2, 3 + li a3, 34 + li a4, -1 + li a5, 0 + li a7, 222 + ecall + mv t0, a0 + + # mmap(addr=NULL, length=12288, prot=3, flags=34, fd=-1, offset=0) + li a0, 0 + li a1, 12288 + li a2, 3 + li a3, 34 + li a4, -1 + li a5, 0 + li a7, 222 + ecall + mv t1, a0 + + # mmap(addr=NULL, length=1024, prot=3, flags=34, fd=-1, offset=0) + li a0, 0 + li a1, 1024 + li a2, 3 + li a3, 34 + li a4, -1 + li a5, 0 + li a7, 222 + ecall + mv t2, a0 + + # unmap second allocation to create an empty space between allocations + # munmap(addr=t1, length=12288, prot=3, flags=34, fd=-1, offset=0) mv a0, t1 - li a7, 57 + li a1, 12288 + li a7, 215 + ecall + mv t3, a0 + + # Allocate a region larger than the new empty space + # mmap(addr=NULL, length=16384, prot=3, flags=34, fd=-1, offset=0) + li a0, 0 + li a1, 16384 + li a2, 3 + li a3, 34 + li a4, -1 + li a5, 0 + li a7, 222 + ecall + mv t4, a0 + + # Two allocations whose combined length equals the new empty space + # mmap(addr=NULL, 
length=4096, prot=3, flags=34, fd=-1, offset=0) + li a0, 0 + li a1, 4096 + li a2, 3 + li a3, 34 + li a4, -1 + li a5, 0 + li a7, 222 ecall + mv t5, a0 + + # mmap(addr=NULL, length=8192, prot=3, flags=34, fd=-1, offset=0) + li a0, 0 + li a1, 8192 + li a2, 3 + li a3, 34 + li a4, -1 + li a5, 0 + li a7, 222 + ecall + mv t6, a0 )"); - // Check returned 0 - EXPECT_EQ(getGeneralRegister(7), 0); - EXPECT_EQ(getGeneralRegister(28), 0); + EXPECT_EQ(getGeneralRegister(5), process_->getMmapStart()); + EXPECT_EQ(getGeneralRegister(6), process_->getMmapStart() + 4096); + EXPECT_EQ(getGeneralRegister(7), process_->getMmapStart() + 16384); + EXPECT_EQ(getGeneralRegister(28), 0); + EXPECT_EQ(getGeneralRegister(29), process_->getMmapStart() + 20480); + EXPECT_EQ(getGeneralRegister(30), process_->getMmapStart() + 4096); + EXPECT_EQ(getGeneralRegister(31), process_->getMmapStart() + 8192); +} + +TEST_P(Syscall, getrandom) { + initialHeapData_.resize(24); + memset(initialHeapData_.data(), -1, 16); + + RUN_RISCV(R"( + # Get heap address + li a0, 0 + li a7, 214 + ecall + + # store initial heap address + mv t0, a0 + + # Save 8 random bytes to the heap + # getrandom(buf * = [a], buflen = 8, no flags) + li a1, 8 + li a7, 278 + ecall + + # Save another 8 random bytes to the heap + # getrandom(buf * = [a], buflen = 8, no flags) + addi a0, t0, 8 + li a1, 8 + li a7, 278 + ecall + )"); + + // Check getrandom returned 8 (8 bytes were requested) + EXPECT_EQ(getGeneralRegister(10), 8); + + int heapStart = getGeneralRegister(5); + for (size_t i = 0; i < 8; i++) { + printf("compare %x == %x\n", getMemoryValue(heapStart + i), + getMemoryValue(heapStart + 8 + i)); + } + + // Check that the returned bytes aren't all equal to -1. + // heap was initialised to -1 so check bytes have changed + bool allUnchanged = true; + for (size_t i = 0; i < 16; i++) { + if (getMemoryValue(heapStart + i) != 0xFF) { + allUnchanged = false; + break; + } + } + EXPECT_EQ(allUnchanged, false); + + // Check that the returned bytes from the two syscalls dont all match. 
+ // If they do then the returned bytes surely weren't random + bool allMatch = true; + for (char i = 0; i < 8; i++) { + if (getMemoryValue(heapStart + i) != + getMemoryValue(heapStart + 8 + i)) { + allMatch = false; + break; + } + } + + EXPECT_EQ(allMatch, false); } INSTANTIATE_TEST_SUITE_P( RISCV, Syscall, - ::testing::Values(std::make_tuple(EMULATION, YAML::Load("{}")), - std::make_tuple(INORDER, YAML::Load("{}")), - std::make_tuple(OUTOFORDER, YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}"), + std::make_tuple(INORDER, "{}"), + std::make_tuple(OUTOFORDER, + "{L1-Data-Memory: " + "{Interface-Type: Fixed}}")), paramToString); - } // namespace diff --git a/test/regression/riscv/data/truncate-test.txt b/test/regression/riscv/data/truncate-test.txt index 12b437c152..94adc17b9e 100644 Binary files a/test/regression/riscv/data/truncate-test.txt and b/test/regression/riscv/data/truncate-test.txt differ diff --git a/test/regression/riscv/instructions/arithmetic.cc b/test/regression/riscv/instructions/arithmetic.cc index 30bc4e5fda..703f90133b 100644 --- a/test/regression/riscv/instructions/arithmetic.cc +++ b/test/regression/riscv/instructions/arithmetic.cc @@ -3,6 +3,7 @@ namespace { using InstArithmetic = RISCVRegressionTest; +using namespace simeng::arch::riscv::InstructionGroups; TEST_P(InstArithmetic, sll) { RUN_RISCV(R"( @@ -13,6 +14,9 @@ TEST_P(InstArithmetic, sll) { )"); EXPECT_EQ(getGeneralRegister(30), 48); EXPECT_EQ(getGeneralRegister(31), 192); + + EXPECT_GROUP(R"(sll t5, t4, t3)", INT_SIMPLE_SHIFT); + EXPECT_GROUP(R"(slli t6, t4, 5)", INT_SIMPLE_SHIFT); } TEST_P(InstArithmetic, sllw) { @@ -42,6 +46,9 @@ TEST_P(InstArithmetic, sllw) { EXPECT_EQ(getGeneralRegister(6), 0); EXPECT_EQ(getGeneralRegister(7), 6); // If shamt >= 32 don't change operand as per qemu + + EXPECT_GROUP(R"(sllw t5, t4, t3)", INT_SIMPLE_SHIFT); + EXPECT_GROUP(R"(slliw t1, t4, 31)", INT_SIMPLE_SHIFT); } TEST_P(InstArithmetic, srl) { @@ -53,6 +60,9 @@ TEST_P(InstArithmetic, srl) { )"); EXPECT_EQ(getGeneralRegister(30), 15); EXPECT_EQ(getGeneralRegister(31), 7); + + EXPECT_GROUP(R"(srl t5, t4, t3)", INT_SIMPLE_SHIFT); + EXPECT_GROUP(R"(srli t6, t4, 61)", INT_SIMPLE_SHIFT); } TEST_P(InstArithmetic, srlw) { @@ -70,6 +80,9 @@ TEST_P(InstArithmetic, srlw) { EXPECT_EQ(getGeneralRegister(31), 0b01111111111111111111111111111100); EXPECT_EQ(getGeneralRegister(7), -7); + + EXPECT_GROUP(R"(srlw t1, t4, t3)", INT_SIMPLE_SHIFT); + EXPECT_GROUP(R"(srliw t6, t4, 1)", INT_SIMPLE_SHIFT); } TEST_P(InstArithmetic, sra) { @@ -86,6 +99,9 @@ TEST_P(InstArithmetic, sra) { EXPECT_EQ(getGeneralRegister(31), -2); EXPECT_EQ(getGeneralRegister(6), 1); EXPECT_EQ(getGeneralRegister(7), 2); + + EXPECT_GROUP(R"(sra t5, t4, t3)", INT_SIMPLE_SHIFT); + EXPECT_GROUP(R"(srai t6, t4, 1)", INT_SIMPLE_SHIFT); } TEST_P(InstArithmetic, sraw) { @@ -113,6 +129,9 @@ TEST_P(InstArithmetic, sraw) { )"); EXPECT_EQ(getGeneralRegister(30), -1); EXPECT_EQ(getGeneralRegister(31), 1); + + EXPECT_GROUP(R"(sraw t5, t5, t2)", INT_SIMPLE_SHIFT); + EXPECT_GROUP(R"(sraiw t6, t6, 30)", INT_SIMPLE_SHIFT); } TEST_P(InstArithmetic, add) { @@ -126,6 +145,9 @@ TEST_P(InstArithmetic, add) { EXPECT_EQ(getGeneralRegister(29), 6u); EXPECT_EQ(getGeneralRegister(30), 9u); EXPECT_EQ(getGeneralRegister(0), 0); + + EXPECT_GROUP(R"(add t5, t3, t4)", INT_SIMPLE_ARTH); + EXPECT_GROUP(R"(addi t4, t4, 6)", INT_SIMPLE_ARTH); } TEST_P(InstArithmetic, addw) { @@ -140,6 +162,8 @@ TEST_P(InstArithmetic, addw) { EXPECT_EQ(getGeneralRegister(29), 6u); 
EXPECT_EQ(getGeneralRegister(30), 9u); EXPECT_EQ(getGeneralRegister(31), -4); + + EXPECT_GROUP(R"(addw t5, t3, t4)", INT_SIMPLE_ARTH); } TEST_P(InstArithmetic, addiw) { @@ -154,6 +178,8 @@ TEST_P(InstArithmetic, addiw) { EXPECT_EQ(getGeneralRegister(29), -5); EXPECT_EQ(getGeneralRegister(30), -1342177285); EXPECT_EQ(getGeneralRegister(31), -5); + + EXPECT_GROUP(R"(addiw t5, t3, -5)", INT_SIMPLE_ARTH); } TEST_P(InstArithmetic, sub) { @@ -165,6 +191,8 @@ TEST_P(InstArithmetic, sub) { )"); EXPECT_EQ(getGeneralRegister(30), -3); EXPECT_EQ(getGeneralRegister(31), 3); + + EXPECT_GROUP(R"(sub t6, t4, t3)", INT_SIMPLE_ARTH); } TEST_P(InstArithmetic, subw) { @@ -185,6 +213,8 @@ TEST_P(InstArithmetic, subw) { EXPECT_EQ(getGeneralRegister(28), 0xFFFFFFFFFFFFFFFF); EXPECT_EQ(getGeneralRegister(29), -2); EXPECT_EQ(getGeneralRegister(6), 0x0000000000000001); + + EXPECT_GROUP(R"(subw t1, t3, t4)", INT_SIMPLE_ARTH); } TEST_P(InstArithmetic, lui) { @@ -194,6 +224,8 @@ TEST_P(InstArithmetic, lui) { )"); EXPECT_EQ(getGeneralRegister(28), 4 << 12); EXPECT_EQ(getGeneralRegister(29), -4ull << 12); + + EXPECT_GROUP(R"(lui t3, 4)", INT_SIMPLE_ARTH); } TEST_P(InstArithmetic, auipc) { @@ -203,6 +235,8 @@ TEST_P(InstArithmetic, auipc) { )"); EXPECT_EQ(getGeneralRegister(28), 4 << 12); EXPECT_EQ(getGeneralRegister(29), (-4ull << 12) + 4); + + EXPECT_GROUP(R"(auipc t4, 1048572)", INT_SIMPLE_ARTH); } TEST_P(InstArithmetic, xor) { @@ -224,6 +258,10 @@ TEST_P(InstArithmetic, xor) { not t1, t3 )"); EXPECT_EQ(getGeneralRegister(6), -4); + + EXPECT_GROUP(R"(xor t5, t3, t4)", INT_SIMPLE_LOGICAL); + EXPECT_GROUP(R"(xori t6, t5, 5)", INT_SIMPLE_LOGICAL); + EXPECT_GROUP(R"(not t1, t3)", INT_SIMPLE_LOGICAL); } TEST_P(InstArithmetic, or) { @@ -237,6 +275,9 @@ TEST_P(InstArithmetic, or) { EXPECT_EQ(getGeneralRegister(30), 0b0111); EXPECT_EQ(getGeneralRegister(31), 0b1111); EXPECT_EQ(getGeneralRegister(7), -5); + + EXPECT_GROUP(R"(or t5, t3, t4)", INT_SIMPLE_LOGICAL); + EXPECT_GROUP(R"(ori t6, t5, 9)", INT_SIMPLE_LOGICAL); } TEST_P(InstArithmetic, and) { @@ -250,6 +291,9 @@ TEST_P(InstArithmetic, and) { EXPECT_EQ(getGeneralRegister(30), 0b0001); EXPECT_EQ(getGeneralRegister(31), 0b0001); EXPECT_EQ(getGeneralRegister(7), 1); + + EXPECT_GROUP(R"(and t5, t3, t4)", INT_SIMPLE_LOGICAL); + EXPECT_GROUP(R"(andi t6, t5, 9)", INT_SIMPLE_LOGICAL); } TEST_P(InstArithmetic, slt) { @@ -265,6 +309,9 @@ TEST_P(InstArithmetic, slt) { EXPECT_EQ(getGeneralRegister(31), 0); EXPECT_EQ(getGeneralRegister(6), 0); EXPECT_EQ(getGeneralRegister(7), 1); + + EXPECT_GROUP(R"(slt t6, t4, t3)", INT_SIMPLE_CMP); + EXPECT_GROUP(R"(sltu t1, t3, t4)", INT_SIMPLE_CMP); } TEST_P(InstArithmetic, slti) { @@ -280,6 +327,9 @@ TEST_P(InstArithmetic, slti) { EXPECT_EQ(getGeneralRegister(31), 0); EXPECT_EQ(getGeneralRegister(6), 0); EXPECT_EQ(getGeneralRegister(7), 1); + + EXPECT_GROUP(R"(slti t6, t4, -3)", INT_SIMPLE_CMP); + EXPECT_GROUP(R"(sltiu t1, t3, 5)", INT_SIMPLE_CMP); } TEST_P(InstArithmetic, addiPseudoinstructions) { @@ -295,6 +345,10 @@ TEST_P(InstArithmetic, addiPseudoinstructions) { EXPECT_EQ(getGeneralRegister(0), 0); EXPECT_EQ(getGeneralRegister(28), -5); EXPECT_EQ(getGeneralRegister(29), -5); + + EXPECT_GROUP(R"(nop)", INT_SIMPLE_ARTH); + EXPECT_GROUP(R"(mv t2, t1)", INT_SIMPLE_ARTH); + EXPECT_GROUP(R"(sext.w t4, t3)", INT_SIMPLE_ARTH); } TEST_P(InstArithmetic, subwPseudoinstructions) { @@ -311,6 +365,9 @@ TEST_P(InstArithmetic, subwPseudoinstructions) { EXPECT_EQ(getGeneralRegister(30), 48586817536); EXPECT_EQ(getGeneralRegister(7), 1342177280); 
EXPECT_EQ(getGeneralRegister(31), -1342177280); + + EXPECT_GROUP(R"(neg t4, t3)", INT_SIMPLE_ARTH); + EXPECT_GROUP(R"(negw t6, t5)", INT_SIMPLE_ARTH); } TEST_P(InstArithmetic, setPseudoinstructions) { @@ -342,6 +399,11 @@ TEST_P(InstArithmetic, setPseudoinstructions) { EXPECT_EQ(getGeneralRegister(30), 0); EXPECT_EQ(getGeneralRegister(8), 1); EXPECT_EQ(getGeneralRegister(9), 0); + + EXPECT_GROUP(R"(seqz t3, t1)", INT_SIMPLE_CMP); + EXPECT_GROUP(R"(snez t4, t0)", INT_SIMPLE_CMP); + EXPECT_GROUP(R"(sltz t4, t6)", INT_SIMPLE_CMP); + EXPECT_GROUP(R"(sgtz t5, t0)", INT_SIMPLE_CMP); } TEST_P(InstArithmetic, liPseudoinstruction) { @@ -356,11 +418,12 @@ TEST_P(InstArithmetic, liPseudoinstruction) { EXPECT_EQ(getGeneralRegister(15), 0); EXPECT_EQ(getGeneralRegister(14), 192); EXPECT_EQ(getGeneralRegister(13), -180); + + EXPECT_GROUP(R"(li a5, 0)", INT_SIMPLE_ARTH); } INSTANTIATE_TEST_SUITE_P(RISCV, InstArithmetic, - ::testing::Values(std::make_tuple(EMULATION, - YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}")), paramToString); } // namespace diff --git a/test/regression/riscv/instructions/atomic.cc b/test/regression/riscv/instructions/atomic.cc index 2d9af18ecb..d8b0bc6151 100644 --- a/test/regression/riscv/instructions/atomic.cc +++ b/test/regression/riscv/instructions/atomic.cc @@ -3,6 +3,17 @@ namespace { using InstAtomic = RISCVRegressionTest; +using namespace simeng::arch::riscv::InstructionGroups; + +// Whilst most RISC-V atomic instructions perform read-modify-write operations +// i.e. a load, comparison and store, each is given the group LOAD_INT only. The +// instruction object is tagged with the appropriate identifiers (isLoad, +// isStore and isAtomic) but the group only reflects the first stage of +// execution. This ensures the instruction goes to the correct part of the +// pipeline i.e. the LSQ. But we currently do not model the rest of the atomic +// behaviour precisely as the comparison happens here also. The change of the +// instructions behaviour over its lifetime is currently not reflected in the +// group it is given. 
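To make the tagging described in the comment above concrete, a minimal sketch follows. It is illustrative only: the type and function names are invented and do not appear in SimEng, but it shows the distinction the comment draws between the boolean identifiers, which describe the full read-modify-write behaviour, and the single group value that `EXPECT_GROUP` checks and that steers the instruction into the LSQ.

```cpp
#include <cstdint>

// Hypothetical tagging for an atomic read-modify-write such as amoadd.w.
// Names are placeholders, not SimEng's actual decoder interface.
struct AtomicTags {
  uint16_t group;  // single group value, e.g. LOAD_INT
  bool isLoad;
  bool isStore;
  bool isAtomic;
};

constexpr uint16_t kLoadInt = 0;  // stand-in for InstructionGroups::LOAD_INT

constexpr AtomicTags tagAmoAddW() {
  return AtomicTags{
      kLoadInt,  // group reflects only the first (load) stage of execution
      true,      // isLoad:  reads the addressed word
      true,      // isStore: writes the sum back to the same address
      true       // isAtomic: the read-modify-write must appear indivisible
  };
}

static_assert(tagAmoAddW().group == kLoadInt,
              "EXPECT_GROUP sees LOAD_INT even though the op also stores");
```

Keeping a single group per instruction is a simplification: as the comment notes, only the load stage is reflected, so the later compare/store behaviour of the atomic is not captured by the group and the tests below accordingly expect `LOAD_INT` for the AMO and LR instructions and `STORE_INT` only for SC.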
TEST_P(InstAtomic, lr) { initialHeapData_.resize(16); @@ -20,11 +31,12 @@ TEST_P(InstAtomic, lr) { lr.w t6, (a0) addi a0, a0, 4 lr.w t5, (a0) - )"); EXPECT_EQ(getGeneralRegister(31), 0xFFFFFFFFDEADBEEF); EXPECT_EQ(getGeneralRegister(30), 0x012345678); + EXPECT_GROUP(R"(lr.w t5, (a0))", LOAD_INT); + RUN_RISCV(R"( # Get heap address li a7, 214 @@ -33,11 +45,12 @@ TEST_P(InstAtomic, lr) { lr.w.aq t6, (a0) addi a0, a0, 4 lr.w.aq t5, (a0) - )"); EXPECT_EQ(getGeneralRegister(31), 0xFFFFFFFFDEADBEEF); EXPECT_EQ(getGeneralRegister(30), 0x012345678); + EXPECT_GROUP(R"(lr.w.aq t5, (a0))", LOAD_INT); + RUN_RISCV(R"( # Get heap address li a7, 214 @@ -46,11 +59,12 @@ TEST_P(InstAtomic, lr) { lr.w.aqrl t6, (a0) addi a0, a0, 4 lr.w.aqrl t5, (a0) - )"); EXPECT_EQ(getGeneralRegister(31), 0xFFFFFFFFDEADBEEF); EXPECT_EQ(getGeneralRegister(30), 0x012345678); + EXPECT_GROUP(R"(lr.w.aqrl t5, (a0))", LOAD_INT); + // Software should not set only the RL bit, but this is not guaranteed RUN_RISCV(R"( # Get heap address @@ -60,11 +74,12 @@ TEST_P(InstAtomic, lr) { lr.w.rl t6, (a0) addi a0, a0, 4 lr.w.rl t5, (a0) - )"); EXPECT_EQ(getGeneralRegister(31), 0xFFFFFFFFDEADBEEF); EXPECT_EQ(getGeneralRegister(30), 0x012345678); + EXPECT_GROUP(R"(lr.w.rl t5, (a0))", LOAD_INT); + RUN_RISCV(R"( # Get heap address li a7, 214 @@ -73,11 +88,12 @@ TEST_P(InstAtomic, lr) { lr.d t6, (a0) addi a0, a0, 4 lr.d t5, (a0) - )"); EXPECT_EQ(getGeneralRegister(31), 0x12345678DEADBEEF); EXPECT_EQ(getGeneralRegister(30), 0xFEEBDAED12345678); + EXPECT_GROUP(R"(lr.d t5, (a0))", LOAD_INT); + RUN_RISCV(R"( # Get heap address li a7, 214 @@ -86,11 +102,12 @@ TEST_P(InstAtomic, lr) { lr.d.aq t6, (a0) addi a0, a0, 4 lr.d.aq t5, (a0) - )"); EXPECT_EQ(getGeneralRegister(31), 0x12345678DEADBEEF); EXPECT_EQ(getGeneralRegister(30), 0xFEEBDAED12345678); + EXPECT_GROUP(R"(lr.d.aq t5, (a0))", LOAD_INT); + RUN_RISCV(R"( # Get heap address li a7, 214 @@ -99,11 +116,12 @@ TEST_P(InstAtomic, lr) { lr.d.aqrl t6, (a0) addi a0, a0, 4 lr.d.aqrl t5, (a0) - )"); EXPECT_EQ(getGeneralRegister(31), 0x12345678DEADBEEF); EXPECT_EQ(getGeneralRegister(30), 0xFEEBDAED12345678); + EXPECT_GROUP(R"(lr.d.aqrl t5, (a0))", LOAD_INT); + RUN_RISCV(R"( # Get heap address li a7, 214 @@ -112,10 +130,11 @@ TEST_P(InstAtomic, lr) { lr.d.rl t6, (a0) addi a0, a0, 4 lr.d.rl t5, (a0) - )"); EXPECT_EQ(getGeneralRegister(31), 0x12345678DEADBEEF); EXPECT_EQ(getGeneralRegister(30), 0xFEEBDAED12345678); + + EXPECT_GROUP(R"(lr.d.rl t5, (a0))", LOAD_INT); } TEST_P(InstAtomic, sc_w) { @@ -142,6 +161,8 @@ TEST_P(InstAtomic, sc_w) { EXPECT_EQ(getGeneralRegister(31), 987); EXPECT_EQ(getMemoryValue(heapStart), 987); EXPECT_EQ(getMemoryValue(heapStart + 4), 0x12345678); + + EXPECT_GROUP(R"(sc.w t5, t6, (a0))", STORE_INT); } TEST_P(InstAtomic, sc_w_aq) { @@ -170,6 +191,8 @@ TEST_P(InstAtomic, sc_w_aq) { EXPECT_EQ(getGeneralRegister(31), 987); EXPECT_EQ(getMemoryValue(heapStart), 987); EXPECT_EQ(getMemoryValue(heapStart + 4), 0x12345678); + + EXPECT_GROUP(R"(sc.w.aq t5, t6, (a0))", STORE_INT); } TEST_P(InstAtomic, sc_w_rl) { @@ -196,6 +219,8 @@ TEST_P(InstAtomic, sc_w_rl) { EXPECT_EQ(getGeneralRegister(31), 987); EXPECT_EQ(getMemoryValue(heapStart), 987); EXPECT_EQ(getMemoryValue(heapStart + 4), 0x12345678); + + EXPECT_GROUP(R"(sc.w.rl t5, t6, (a0))", STORE_INT); } TEST_P(InstAtomic, sc_w_aq_rl) { @@ -222,6 +247,8 @@ TEST_P(InstAtomic, sc_w_aq_rl) { EXPECT_EQ(getGeneralRegister(31), 987); EXPECT_EQ(getMemoryValue(heapStart), 987); EXPECT_EQ(getMemoryValue(heapStart + 4), 0x12345678); + + 
EXPECT_GROUP(R"(sc.w.aqrl t5, t6, (a0))", STORE_INT); } TEST_P(InstAtomic, sc_d) { @@ -252,6 +279,8 @@ TEST_P(InstAtomic, sc_d) { EXPECT_EQ(getGeneralRegister(31), 0x12365000000001EF); EXPECT_EQ(getMemoryValue(heapStart), 0x5000000001EFBEEF); EXPECT_EQ(getMemoryValue(heapStart + 8), 0xFEEB1236); + + EXPECT_GROUP(R"(sc.d t5, t6, (a0))", STORE_INT); } TEST_P(InstAtomic, sc_d_aq) { @@ -282,6 +311,8 @@ TEST_P(InstAtomic, sc_d_aq) { EXPECT_EQ(getGeneralRegister(31), 0x12365000000001EF); EXPECT_EQ(getMemoryValue(heapStart), 0x5000000001EFBEEF); EXPECT_EQ(getMemoryValue(heapStart + 8), 0xFEEB1236); + + EXPECT_GROUP(R"(sc.d.aq t5, t6, (a0))", STORE_INT); } TEST_P(InstAtomic, sc_d_rl) { @@ -312,6 +343,8 @@ TEST_P(InstAtomic, sc_d_rl) { EXPECT_EQ(getGeneralRegister(31), 0x12365000000001EF); EXPECT_EQ(getMemoryValue(heapStart), 0x5000000001EFBEEF); EXPECT_EQ(getMemoryValue(heapStart + 8), 0xFEEB1236); + + EXPECT_GROUP(R"(sc.d.rl t5, t6, (a0))", STORE_INT); } TEST_P(InstAtomic, sc_d_aq_rl) { @@ -342,6 +375,8 @@ TEST_P(InstAtomic, sc_d_aq_rl) { EXPECT_EQ(getGeneralRegister(31), 0x12365000000001EF); EXPECT_EQ(getMemoryValue(heapStart), 0x5000000001EFBEEF); EXPECT_EQ(getMemoryValue(heapStart + 8), 0xFEEB1236); + + EXPECT_GROUP(R"(sc.d.aqrl t5, t6, (a0))", STORE_INT); } TEST_P(InstAtomic, amoswap_w) { @@ -380,6 +415,8 @@ TEST_P(InstAtomic, amoswap_w) { EXPECT_EQ(getGeneralRegister(31), 987); EXPECT_EQ(getMemoryValue(heapStart + 8), 0xFEEBDAED); EXPECT_EQ(getMemoryValue(heapStart + 12), 987); + + EXPECT_GROUP(R"(amoswap.w t5, t6, (a0))", LOAD_INT); } TEST_P(InstAtomic, amoswap_w_aq) { @@ -418,6 +455,8 @@ TEST_P(InstAtomic, amoswap_w_aq) { EXPECT_EQ(getGeneralRegister(31), 987); EXPECT_EQ(getMemoryValue(heapStart + 8), 0xFEEBDAED); EXPECT_EQ(getMemoryValue(heapStart + 12), 987); + + EXPECT_GROUP(R"(amoswap.w.aq t5, t6, (a0))", LOAD_INT); } TEST_P(InstAtomic, amoswap_w_rl) { @@ -456,6 +495,8 @@ TEST_P(InstAtomic, amoswap_w_rl) { EXPECT_EQ(getGeneralRegister(31), 987); EXPECT_EQ(getMemoryValue(heapStart + 8), 0xFEEBDAED); EXPECT_EQ(getMemoryValue(heapStart + 12), 987); + + EXPECT_GROUP(R"(amoswap.w.rl t5, t6, (a0))", LOAD_INT); } TEST_P(InstAtomic, amoswap_w_aq_rl) { @@ -494,6 +535,8 @@ TEST_P(InstAtomic, amoswap_w_aq_rl) { EXPECT_EQ(getGeneralRegister(31), 987); EXPECT_EQ(getMemoryValue(heapStart + 8), 0xFEEBDAED); EXPECT_EQ(getMemoryValue(heapStart + 12), 987); + + EXPECT_GROUP(R"(amoswap.w.aqrl t5, t6, (a0))", LOAD_INT); } TEST_P(InstAtomic, amoswap_d) { @@ -539,6 +582,8 @@ TEST_P(InstAtomic, amoswap_d) { EXPECT_EQ(getMemoryValue(heapStart + 12), 0x000003DB); EXPECT_EQ(getMemoryValue(heapStart + 16), 0x80000000); EXPECT_EQ(getMemoryValue(heapStart + 20), 0xFFEEFFEE); + + EXPECT_GROUP(R"(amoswap.d t5, t6, (a0))", LOAD_INT); } TEST_P(InstAtomic, amoswap_d_aq) { @@ -584,6 +629,8 @@ TEST_P(InstAtomic, amoswap_d_aq) { EXPECT_EQ(getMemoryValue(heapStart + 12), 0x000003DB); EXPECT_EQ(getMemoryValue(heapStart + 16), 0x80000000); EXPECT_EQ(getMemoryValue(heapStart + 20), 0xFFEEFFEE); + + EXPECT_GROUP(R"(amoswap.d.aq t5, t6, (a0))", LOAD_INT); } TEST_P(InstAtomic, amoswap_d_rl) { @@ -629,6 +676,8 @@ TEST_P(InstAtomic, amoswap_d_rl) { EXPECT_EQ(getMemoryValue(heapStart + 12), 0x000003DB); EXPECT_EQ(getMemoryValue(heapStart + 16), 0x80000000); EXPECT_EQ(getMemoryValue(heapStart + 20), 0xFFEEFFEE); + + EXPECT_GROUP(R"(amoswap.d.rl t5, t6, (a0))", LOAD_INT); } TEST_P(InstAtomic, amoswap_d_aq_rl) { @@ -674,6 +723,8 @@ TEST_P(InstAtomic, amoswap_d_aq_rl) { EXPECT_EQ(getMemoryValue(heapStart + 12), 0x000003DB); 
EXPECT_EQ(getMemoryValue(heapStart + 16), 0x80000000); EXPECT_EQ(getMemoryValue(heapStart + 20), 0xFFEEFFEE); + + EXPECT_GROUP(R"(amoswap.d.aqrl t5, t6, (a0))", LOAD_INT); } TEST_P(InstAtomic, amoadd_w) { @@ -717,6 +768,8 @@ TEST_P(InstAtomic, amoadd_w) { EXPECT_EQ(getMemoryValue(heapStart + 12), 0x800003DA); // +ve + +ve = -ve as per GDB EXPECT_EQ(getMemoryValue(heapStart + 16), 0x12365478); + + EXPECT_GROUP(R"(amoadd.w t5, t6, (a0))", LOAD_INT); } TEST_P(InstAtomic, amoadd_w_aq) { @@ -760,6 +813,8 @@ TEST_P(InstAtomic, amoadd_w_aq) { EXPECT_EQ(getMemoryValue(heapStart + 12), 0x800003DA); // +ve + +ve = -ve as per GDB EXPECT_EQ(getMemoryValue(heapStart + 16), 0x12365478); + + EXPECT_GROUP(R"(amoadd.w.aq t5, t6, (a0))", LOAD_INT); } TEST_P(InstAtomic, amoadd_w_rl) { @@ -803,6 +858,8 @@ TEST_P(InstAtomic, amoadd_w_rl) { EXPECT_EQ(getMemoryValue(heapStart + 12), 0x800003DA); // +ve + +ve = -ve as per GDB EXPECT_EQ(getMemoryValue(heapStart + 16), 0x12365478); + + EXPECT_GROUP(R"(amoadd.w.rl t5, t6, (a0))", LOAD_INT); } TEST_P(InstAtomic, amoadd_w_aq_rl) { @@ -846,6 +903,8 @@ TEST_P(InstAtomic, amoadd_w_aq_rl) { EXPECT_EQ(getMemoryValue(heapStart + 12), 0x800003DA); // +ve + +ve = -ve as per GDB EXPECT_EQ(getMemoryValue(heapStart + 16), 0x12365478); + + EXPECT_GROUP(R"(amoadd.w.aqrl t5, t6, (a0))", LOAD_INT); } // TODO add aq rl tests for all instructions below, omitted as currently @@ -891,6 +950,8 @@ TEST_P(InstAtomic, amoadd_d) { 0x80000000000003DA); // +ve + +ve = -ve as per GDB EXPECT_EQ(getMemoryValue(heapStart + 20), 0x12365478); // +ve + +ve = -ve as per GDB + + EXPECT_GROUP(R"(amoadd.d t5, t6, (a0))", LOAD_INT); } TEST_P(InstAtomic, amoand_w) { @@ -919,6 +980,8 @@ TEST_P(InstAtomic, amoand_w) { EXPECT_EQ(getGeneralRegister(5), 0x5555555555555555); EXPECT_EQ(getGeneralRegister(6), 0xFFFFFFFFB3333333); EXPECT_EQ(getMemoryValue(heapStart), 0x1234567811111111); // 0b0001 + + EXPECT_GROUP(R"(amoand.w t1, t0, (a0))", LOAD_INT); } TEST_P(InstAtomic, amoand_d) { @@ -948,6 +1011,8 @@ TEST_P(InstAtomic, amoand_d) { EXPECT_EQ(getGeneralRegister(6), 0x3333333333333333); EXPECT_EQ(getMemoryValue(heapStart), 0x1111111111111111); EXPECT_EQ(getMemoryValue(heapStart + 8), 0xFEEBDAED); + + EXPECT_GROUP(R"(amoand.d t1, t0, (a0))", LOAD_INT); } TEST_P(InstAtomic, amoor_w) { @@ -976,6 +1041,8 @@ TEST_P(InstAtomic, amoor_w) { EXPECT_EQ(getGeneralRegister(5), 0x5555555555555555); EXPECT_EQ(getGeneralRegister(6), 0xFFFFFFFFB3333333); EXPECT_EQ(getMemoryValue(heapStart), 0x12345678F7777777); // 0b0111 + + EXPECT_GROUP(R"(amoor.w t1, t0, (a0))", LOAD_INT); } TEST_P(InstAtomic, amoor_d) { @@ -1005,6 +1072,8 @@ TEST_P(InstAtomic, amoor_d) { EXPECT_EQ(getGeneralRegister(6), 0x3333333333333333); EXPECT_EQ(getMemoryValue(heapStart), 0x7777777777777777); EXPECT_EQ(getMemoryValue(heapStart + 8), 0xFEEBDAED); + + EXPECT_GROUP(R"(amoor.d t1, t0, (a0))", LOAD_INT); } TEST_P(InstAtomic, amoxor_w) { @@ -1033,6 +1102,8 @@ TEST_P(InstAtomic, amoxor_w) { EXPECT_EQ(getGeneralRegister(5), 0x5555555555555555); EXPECT_EQ(getGeneralRegister(6), 0xFFFFFFFFB3333333); EXPECT_EQ(getMemoryValue(heapStart), 0x12345678E6666666); // 0b0110 + + EXPECT_GROUP(R"(amoxor.w t1, t0, (a0))", LOAD_INT); } TEST_P(InstAtomic, amoxor_d) { @@ -1062,6 +1133,8 @@ TEST_P(InstAtomic, amoxor_d) { EXPECT_EQ(getGeneralRegister(6), 0x3333333333333333); EXPECT_EQ(getMemoryValue(heapStart), 0x6666666666666666); EXPECT_EQ(getMemoryValue(heapStart + 8), 0xFEEBDAED); + + EXPECT_GROUP(R"(amoxor.d t1, t0, (a0))", LOAD_INT); } TEST_P(InstAtomic, amomin_w) { @@ 
-1141,6 +1214,8 @@ TEST_P(InstAtomic, amomin_w) { 0xF000000055555555); // (large +ve word), -ve double EXPECT_EQ(getGeneralRegister(6), 0x0000000003333333); // small +ve EXPECT_EQ(getMemoryValue(heapStart), 0x1234567803333333); + + EXPECT_GROUP(R"(amomin.w t1, t0, (a0))", LOAD_INT); } TEST_P(InstAtomic, amomin_d) { @@ -1180,6 +1255,8 @@ TEST_P(InstAtomic, amomin_d) { EXPECT_EQ(getGeneralRegister(30), 0x0034567899999999); // small +ve EXPECT_EQ(getGeneralRegister(31), 0x12345678FEEBDAED); // large +ve EXPECT_EQ(getMemoryValue(heapStart + 8), 0x0034567899999999); + + EXPECT_GROUP(R"(amomin.d t6, t5, (a0))", LOAD_INT); } TEST_P(InstAtomic, amominu_w) { @@ -1260,6 +1337,8 @@ TEST_P(InstAtomic, amominu_w) { 0xF000000055555555); // (large +ve word), -ve double EXPECT_EQ(getGeneralRegister(6), 0x0000000003333333); // small +ve EXPECT_EQ(getMemoryValue(heapStart), 0x1234567803333333); + + EXPECT_GROUP(R"(amominu.w t1, t0, (a0))", LOAD_INT); } TEST_P(InstAtomic, amominu_d) { @@ -1300,6 +1379,8 @@ TEST_P(InstAtomic, amominu_d) { EXPECT_EQ(getGeneralRegister(30), 0x0034567899999999); // small +ve EXPECT_EQ(getGeneralRegister(31), 0x12345678FEEBDAED); // large +ve EXPECT_EQ(getMemoryValue(heapStart + 8), 0x0034567899999999); + + EXPECT_GROUP(R"(amominu.d t6, t5, (a0))", LOAD_INT); } TEST_P(InstAtomic, amomax_w) { @@ -1379,6 +1460,8 @@ TEST_P(InstAtomic, amomax_w) { 0xF000000055555555); // (large +ve word), -ve double EXPECT_EQ(getGeneralRegister(6), 0x0000000003333333); // small +ve EXPECT_EQ(getMemoryValue(heapStart), 0x1234567855555555); + + EXPECT_GROUP(R"(amomax.w t1, t0, (a0))", LOAD_INT); } TEST_P(InstAtomic, amomax_d) { @@ -1418,6 +1501,8 @@ TEST_P(InstAtomic, amomax_d) { EXPECT_EQ(getGeneralRegister(30), 0x0034567899999999); // small +ve EXPECT_EQ(getGeneralRegister(31), 0x12345678FEEBDAED); // large +ve EXPECT_EQ(getMemoryValue(heapStart + 8), 0x12345678FEEBDAED); + + EXPECT_GROUP(R"(amomax.d t6, t5, (a0))", LOAD_INT); } TEST_P(InstAtomic, amomaxu_w) { @@ -1498,6 +1583,8 @@ TEST_P(InstAtomic, amomaxu_w) { 0xF000000055555555); // (large +ve word), -ve double EXPECT_EQ(getGeneralRegister(6), 0x0000000003333333); // small +ve EXPECT_EQ(getMemoryValue(heapStart), 0x1234567855555555); + + EXPECT_GROUP(R"(amomaxu.w t1, t0, (a0))", LOAD_INT); } TEST_P(InstAtomic, amomaxu_d) { @@ -1537,11 +1624,12 @@ TEST_P(InstAtomic, amomaxu_d) { EXPECT_EQ(getGeneralRegister(30), 0x0034567899999999); EXPECT_EQ(getGeneralRegister(31), 0x12345678FEEBDAED); EXPECT_EQ(getMemoryValue(heapStart + 8), 0x12345678FEEBDAED); + + EXPECT_GROUP(R"(amomaxu.d t6, t5, (a0))", LOAD_INT); } INSTANTIATE_TEST_SUITE_P(RISCV, InstAtomic, - ::testing::Values(std::make_tuple(EMULATION, - YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}")), paramToString); } // namespace diff --git a/test/regression/riscv/instructions/branch.cc b/test/regression/riscv/instructions/branch.cc index 547795f629..a31effb676 100644 --- a/test/regression/riscv/instructions/branch.cc +++ b/test/regression/riscv/instructions/branch.cc @@ -3,6 +3,7 @@ namespace { using InstBranch = RISCVRegressionTest; +using namespace simeng::arch::riscv::InstructionGroups; TEST_P(InstBranch, BEQ) { RUN_RISCV(R"( @@ -23,6 +24,9 @@ TEST_P(InstBranch, BEQ) { EXPECT_EQ(getGeneralRegister(29), 7); EXPECT_EQ(getGeneralRegister(28), 5); EXPECT_EQ(getGeneralRegister(7), 0); + + EXPECT_GROUP(R"(beq zero, t4, 8)", BRANCH); + EXPECT_GROUP(R"(beqz s0, -8)", BRANCH); } TEST_P(InstBranch, BNE) { @@ -41,6 +45,9 @@ TEST_P(InstBranch, BNE) { EXPECT_EQ(getGeneralRegister(31), 
7); EXPECT_EQ(getGeneralRegister(29), 19); EXPECT_EQ(getGeneralRegister(28), 17); + + EXPECT_GROUP(R"(bne t0, t1, 8)", BRANCH); + EXPECT_GROUP(R"(bnez t0, 4)", BRANCH); } TEST_P(InstBranch, BLT) { @@ -63,6 +70,9 @@ TEST_P(InstBranch, BLT) { EXPECT_EQ(getGeneralRegister(28), 0); EXPECT_EQ(getGeneralRegister(7), 17); + EXPECT_GROUP(R"(blt t0, t1, 8)", BRANCH); + EXPECT_GROUP(R"(bltz t4, 8)", BRANCH); + RUN_RISCV(R"( addi t0, t0, -5 addi t1, t1, 5 @@ -79,6 +89,8 @@ TEST_P(InstBranch, BLT) { EXPECT_EQ(getGeneralRegister(29), 13); EXPECT_EQ(getGeneralRegister(7), 12); + EXPECT_GROUP(R"(bgtz t1, 8)", BRANCH); + RUN_RISCV(R"( addi t0, t0, -5 addi t1, t1, 5 @@ -94,6 +106,8 @@ TEST_P(InstBranch, BLT) { EXPECT_EQ(getGeneralRegister(31), 18); EXPECT_EQ(getGeneralRegister(29), 13); EXPECT_EQ(getGeneralRegister(7), 12); + + EXPECT_GROUP(R"(bgt t1, t0, 8 )", BRANCH); } TEST_P(InstBranch, BLTU) { @@ -112,6 +126,9 @@ TEST_P(InstBranch, BLTU) { EXPECT_EQ(getGeneralRegister(31), 7); EXPECT_EQ(getGeneralRegister(29), 16); EXPECT_EQ(getGeneralRegister(28), 15); + + EXPECT_GROUP(R"(bltu t1, t0, 8)", BRANCH); + EXPECT_GROUP(R"(bgtu t1, t0, 8)", BRANCH); } TEST_P(InstBranch, BGE) { @@ -130,6 +147,8 @@ TEST_P(InstBranch, BGE) { EXPECT_EQ(getGeneralRegister(31), 0); EXPECT_EQ(getGeneralRegister(29), 0); + EXPECT_GROUP(R"(bge t1, t0, 8)", BRANCH); + RUN_RISCV(R"( addi t0, t0, -5 addi t1, t1, 5 @@ -146,6 +165,8 @@ TEST_P(InstBranch, BGE) { EXPECT_EQ(getGeneralRegister(29), 0); EXPECT_EQ(getGeneralRegister(7), 12); + EXPECT_GROUP(R"(blez t1, 8)", BRANCH); + RUN_RISCV(R"( addi t0, t0, -5 addi t1, t1, 5 @@ -162,6 +183,8 @@ TEST_P(InstBranch, BGE) { EXPECT_EQ(getGeneralRegister(29), 0); EXPECT_EQ(getGeneralRegister(7), 12); + EXPECT_GROUP(R"(bgez t3, 8)", BRANCH); + RUN_RISCV(R"( addi t0, t0, -5 addi t1, t1, 5 @@ -177,6 +200,8 @@ TEST_P(InstBranch, BGE) { EXPECT_EQ(getGeneralRegister(31), 18); EXPECT_EQ(getGeneralRegister(29), 0); EXPECT_EQ(getGeneralRegister(7), 12); + + EXPECT_GROUP(R"(ble t3, t3, 8)", BRANCH); } TEST_P(InstBranch, BGEU) { @@ -198,6 +223,8 @@ TEST_P(InstBranch, BGEU) { EXPECT_EQ(getGeneralRegister(28), 14); EXPECT_EQ(getGeneralRegister(7), 11); + EXPECT_GROUP(R"(bgeu t0, t1, 8)", BRANCH); + RUN_RISCV(R"( addi t0, t0, -5 addi t1, t1, 5 @@ -213,11 +240,12 @@ TEST_P(InstBranch, BGEU) { EXPECT_EQ(getGeneralRegister(31), 0); EXPECT_EQ(getGeneralRegister(29), 0); EXPECT_EQ(getGeneralRegister(7), 12); + + EXPECT_GROUP(R"(bleu t3, t3, 8)", BRANCH); } INSTANTIATE_TEST_SUITE_P(RISCV, InstBranch, - ::testing::Values(std::make_tuple(EMULATION, - YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}")), paramToString); } // namespace diff --git a/test/regression/riscv/instructions/compressed.cc b/test/regression/riscv/instructions/compressed.cc new file mode 100644 index 0000000000..a7387e09b4 --- /dev/null +++ b/test/regression/riscv/instructions/compressed.cc @@ -0,0 +1,804 @@ +#include "RISCVRegressionTest.hh" + +namespace { + +using InstCompressed = RISCVRegressionTest; +using namespace simeng::arch::riscv::InstructionGroups; + +TEST_P(InstCompressed, lwsp) { + // Load word from mem[stack pointer + imm] + initialHeapData_.resize(16); + uint32_t* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 0xDEADBEEF; + heap[1] = 0x12345678; + heap[2] = 0xFEEBDAED; + heap[3] = 0x87654321; + + RUN_RISCV_COMP(R"( + li a7, 214 + ecall + + li x2, 0 + add x2, x2, a0 + c.lwsp t6, 0(x2) + c.lwsp t4, 4(x2) + )"); + EXPECT_EQ(getGeneralRegister(31), 0xFFFFFFFFDEADBEEF); + 
EXPECT_EQ(getGeneralRegister(29), 0x0000000012345678); + + EXPECT_GROUP_COMP(R"(c.lwsp t4, 4(x2))", LOAD_INT); +} + +TEST_P(InstCompressed, ldsp) { + // Load double word from mem[stack pointer + imm] + initialHeapData_.resize(16); + uint32_t* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 0xDEADBEEF; + heap[1] = 0x12345678; + heap[2] = 0xFEEBDAED; + heap[3] = 0x87654321; + + RUN_RISCV_COMP(R"( + li a7, 214 + ecall + + li x2, 0 + add x2, x2, a0 + c.ldsp t6, 0(x2) + addi x2, x2, -4 + c.ldsp t4, 8(x2) + )"); + EXPECT_EQ(getGeneralRegister(31), 0x12345678DEADBEEF); + EXPECT_EQ(getGeneralRegister(29), 0xFEEBDAED12345678); + + EXPECT_GROUP_COMP(R"(c.ldsp t4, 8(x2))", LOAD_INT); +} + +TEST_P(InstCompressed, fldsp) { + // Load double precision float from mem[stack pointer + imm] + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 1.0; + heap[1] = 123.456; + heap[2] = -0.00032; + heap[3] = 123456; + + RUN_RISCV_COMP(R"( + # Get heap address + li a7, 214 + ecall + + li x2, 0 + add x2, x2, a0 + c.fldsp ft0, 0(x2) + c.fldsp ft1, 8(x2) + c.fldsp ft2, 16(x2) + c.fldsp ft3, 24(x2) + )"); + + EXPECT_EQ(getFPRegister(0), 1.0); + EXPECT_EQ(getFPRegister(1), 123.456); + EXPECT_EQ(getFPRegister(2), -0.00032); + EXPECT_EQ(getFPRegister(3), 123456); + + EXPECT_GROUP_COMP(R"(c.fldsp ft3, 24(x2))", LOAD_FLOAT); +} + +TEST_P(InstCompressed, swsp) { + // Store word at mem[stack pointer + imm] + RUN_RISCV_COMP(R"( + li t6, 0xAA + c.swsp t6, 0(sp) + + addi t6, t6, 0xAA # 0xAA + 0xAA = 154 + slli t6, t6, 16 + addi t6, t6, 0xAA # 0x15400AA + c.swsp t6, 4(sp) + )"); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer()), + 0x000000AA); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer()), + 0x15400AA000000AA); + + EXPECT_GROUP_COMP(R"(c.swsp t6, 4(sp))", STORE_INT); +} + +TEST_P(InstCompressed, sdsp) { + // Store double word at mem[stack pointer + imm] + RUN_RISCV_COMP(R"( + li t6, 0xAA + c.sdsp t6, 0(sp) + + addi t6, t6, 0xAA # 0xAA + 0xAA = 154 + slli t6, t6, 16 + addi t6, t6, 0xAA # 0x15400AA + c.sdsp t6, 8(sp) + )"); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer()), + 0x00000000000000AA); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() + 8), + 0x00000000015400AA); + + EXPECT_GROUP_COMP(R"(c.sdsp t6, 8(sp))", STORE_INT); +} + +TEST_P(InstCompressed, fsdsp) { + // Store double precision float at mem[stack pointer + imm] + RUN_RISCV_COMP(R"( + li t6, 0xAA + fmv.d.x f8, t6 + c.fsdsp f8, 0(sp) + + addi t6, t6, 0xAA # 0xAA + 0xAA = 154 + slli t6, t6, 16 + addi t6, t6, 0xAA # 0x15400AA + fmv.d.x f8, t6 + c.fsdsp f8, 8(sp) + )"); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer()), + 0x00000000000000AA); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() + 8), + 0x00000000015400AA); + + EXPECT_GROUP_COMP(R"(c.fsdsp f8, 8(sp))", STORE_FLOAT); +} + +TEST_P(InstCompressed, lw) { + // Compressed load word + initialHeapData_.resize(16); + uint32_t* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 0xDEADBEEF; + heap[1] = 0x12345678; + heap[2] = 0xFEEBDAED; + heap[3] = 0x87654321; + + RUN_RISCV_COMP(R"( + li a7, 214 + ecall + + add x8, x8, a0 + c.lw x15, 0(x8) + c.lw x13, 4(x8) + )"); + EXPECT_EQ(getGeneralRegister(15), 0xFFFFFFFFDEADBEEF); + EXPECT_EQ(getGeneralRegister(13), 0x0000000012345678); + + EXPECT_GROUP_COMP(R"(c.lw x13, 4(x8))", LOAD_INT); +} + +TEST_P(InstCompressed, ld) { + // Compressed store word + initialHeapData_.resize(16); + uint32_t* heap = 
reinterpret_cast(initialHeapData_.data()); + heap[0] = 0xDEADBEEF; + heap[1] = 0x12345678; + heap[2] = 0xFEEBDAED; + heap[3] = 0x87654321; + + RUN_RISCV_COMP(R"( + li a7, 214 + ecall + + add x8, x8, a0 + c.ld x15, 0(x8) + addi x8, x8, -4 + c.ld x13, 8(x8) + )"); + EXPECT_EQ(getGeneralRegister(15), 0x12345678DEADBEEF); + EXPECT_EQ(getGeneralRegister(13), 0xFEEBDAED12345678); + + EXPECT_GROUP_COMP(R"(c.ld x13, 8(x8))", LOAD_INT); +} + +TEST_P(InstCompressed, fld) { + // Compressed load double precision float + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 1.0; + heap[1] = 123.456; + heap[2] = -0.00032; + heap[3] = 123456; + + RUN_RISCV_COMP(R"( + # Get heap address + li a7, 214 + ecall + + c.fld f8, 0(a0) + c.fld f9, 8(a0) + c.fld f10, 16(a0) + c.fld f11, 24(a0) + )"); + + EXPECT_EQ(getFPRegister(8), 1.0); + EXPECT_EQ(getFPRegister(9), 123.456); + EXPECT_EQ(getFPRegister(10), -0.00032); + EXPECT_EQ(getFPRegister(11), 123456); + + EXPECT_GROUP_COMP(R"(c.fld f11, 24(a0))", LOAD_FLOAT); +} + +TEST_P(InstCompressed, sw) { + // Compressed store word + initialHeapData_.resize(16); + uint32_t* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 0x12345678; + heap[1] = 0xDEADBEEF; + heap[2] = 0x87654321; + + RUN_RISCV_COMP(R"( + # Get heap address + li a7, 214 + ecall + + li x8, 0xAA + c.sw x8, 0(a0) + + addi x8, x8, 0xAA # 0xAA + 0xAA = 154 + slli x8, x8, 16 + addi x8, x8, 0xAA # 0x15400AA + c.sw x8, 4(a0) + )"); + + EXPECT_EQ(getGeneralRegister(10), 32); + EXPECT_EQ(getMemoryValue(32), 0x015400AA000000AA); + EXPECT_EQ(getMemoryValue(36), 0x87654321015400AA); + + EXPECT_GROUP_COMP(R"(c.sw x8, 4(a0))", STORE_INT); +} + +TEST_P(InstCompressed, sd) { + // Compressed store double word + initialHeapData_.resize(16); + uint32_t* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 0x12345678; + heap[1] = 0xDEADBEEF; + heap[2] = 0x87654321; + + RUN_RISCV_COMP(R"( + # Get heap address + li a7, 214 + ecall + + li x8, 0xAA + c.sd x8, 0(a0) + + addi x8, x8, 0xAA # 0xAA + 0xAA = 154 + slli x8, x8, 16 + addi x8, x8, 0xAA # 0x15400AA + c.sd x8, 8(a0) + )"); + + EXPECT_EQ(getGeneralRegister(10), 32); + EXPECT_EQ(getMemoryValue(32), 0x00000000000000AA); + EXPECT_EQ(getMemoryValue(40), 0x00000000015400AA); + + EXPECT_GROUP_COMP(R"(c.sd x8, 8(a0))", STORE_INT); +} + +TEST_P(InstCompressed, fsd) { + // Compressed store double precision float + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 1.0; + heap[1] = 123.456; + heap[2] = -0.00032; + heap[3] = 123456; + + RUN_RISCV_COMP(R"( + # Get heap address + li a7, 214 + ecall + + fld fa0, 0(a0) + fld fa1, 8(a0) + fld fa2, 16(a0) + fld fa3, 24(a0) + + c.fsd fa3, 0(a0) + c.fsd fa2, 8(a0) + c.fsd fa1, 16(a0) + c.fsd fa0, 24(a0) + )"); + + EXPECT_EQ(getFPRegister(10), 1.0); + EXPECT_EQ(getFPRegister(11), 123.456); + EXPECT_EQ(getFPRegister(12), -0.00032); + EXPECT_EQ(getFPRegister(13), 123456); + + EXPECT_EQ(getGeneralRegister(10), 32); + + EXPECT_EQ(getMemoryValue(32), 123456); + EXPECT_EQ(getMemoryValue(40), -0.00032); + EXPECT_EQ(getMemoryValue(48), 123.456); + EXPECT_EQ(getMemoryValue(56), 1.0); + + EXPECT_GROUP_COMP(R"(c.fsd fa3, 0(a0))", STORE_FLOAT); +} + +TEST_P(InstCompressed, j) { + // Compressed jump + // Labels needed as LLVM eagerly uses compressed instructions e.g. 
addi -> + // c.addi causing manual jump offsets to become seemingly misaligned with the + // values used in the tests + RUN_RISCV_COMP(R"( + c.j jump #c.j 0xc + jumpa: + addi t6, t6, 10 + jal t1, jumpc #jal t1, 0xc + jump: + addi t5, t5, 5 + jal jumpa #jal -0xc + jumpc: + addi t4, t4, 3 + )"); + EXPECT_EQ(getGeneralRegister(30), 5); + EXPECT_EQ(getGeneralRegister(31), 10); + EXPECT_EQ(getGeneralRegister(29), 3); + EXPECT_EQ(getGeneralRegister(6), 8); + EXPECT_EQ(getGeneralRegister(1), 14); + EXPECT_EQ(getGeneralRegister(0), 0); + + EXPECT_GROUP_COMP(R"(c.j jump)", BRANCH); +} + +TEST_P(InstCompressed, jr) { + // Compressed jump to address in register + RUN_RISCV_COMP(R"( + c.addi x9, 8 + c.jr x9 + c.addi x8, 4 + c.j end + c.addi x8, 5 + end: + )"); + EXPECT_EQ(getGeneralRegister(8), 5); + + EXPECT_GROUP_COMP(R"(c.jr x9)", BRANCH); +} + +TEST_P(InstCompressed, jalr) { + // Compressed jump to address in rs1, save pc+2 in link register + RUN_RISCV_COMP(R"( + li x8, 12 + c.jalr x8 + mv t0, ra + addi t6, t6, 10 + li x8, 20 + c.jalr x8 + mv t1, ra + addi t5, t5, 5 + li x8, 4 + c.jalr x8 + mv t2, ra + addi t4, t4, 3 + )"); + EXPECT_EQ(getGeneralRegister(30), 5); + EXPECT_EQ(getGeneralRegister(31), 10); + EXPECT_EQ(getGeneralRegister(29), 3); + EXPECT_EQ(getGeneralRegister(5), 20); + EXPECT_EQ(getGeneralRegister(6), 4); + EXPECT_EQ(getGeneralRegister(7), 12); + EXPECT_EQ(getGeneralRegister(1), 12); + + EXPECT_GROUP_COMP(R"(c.jalr x8)", BRANCH); +} + +TEST_P(InstCompressed, beqz) { + // Compressed branch if rs1 equal to zero + RUN_RISCV_COMP(R"( + addi x8, x8, 2 + c.beqz x8, b1 + addi x10, x10, 10 + li x9, 0 + c.beqz x9, b2 + j b3 + b1: + addi x10, x10, 5 + b2: + addi x11, x11, 10 + j b4 + b3: + addi x11, x11, 5 + b4: + )"); + EXPECT_EQ(getGeneralRegister(10), 10); + EXPECT_EQ(getGeneralRegister(11), 10); + + EXPECT_GROUP_COMP(R"(c.beqz x9, b2)", BRANCH); +} + +TEST_P(InstCompressed, bnez) { + // Compressed branch if rs1 not equal to zero + RUN_RISCV_COMP(R"( + addi x8, x8, 0 + c.bnez x8, b1 + addi x10, x10, 10 + li x9, 2 + c.bnez x9, b2 + j b3 + b1: + addi x10, x10, 5 + b2: + addi x11, x11, 10 + j b4 + b3: + addi x11, x11, 5 + b4: + )"); + + EXPECT_GROUP_COMP(R"(c.bnez x9, b2)", BRANCH); +} + +TEST_P(InstCompressed, li) { + // Compressed load immediate + RUN_RISCV_COMP(R"( + addi a5, a5, 12 + c.li a5, 0 + addi a4, a4, 12 + c.li a4, -32 + addi a3, a3, 12 + c.li a3, 31 + )"); + EXPECT_EQ(getGeneralRegister(15), 0); + EXPECT_EQ(getGeneralRegister(14), -32); + EXPECT_EQ(getGeneralRegister(13), 31); + + EXPECT_GROUP_COMP(R"(c.li a3, 31)", INT_SIMPLE_ARTH); +} + +TEST_P(InstCompressed, lui) { + // Compressed load immediate into bits 17-12, clear bottom 12 and sign extend + // high bits + RUN_RISCV_COMP(R"( + c.lui t3, 4 + c.lui t4, 0xFFFFC + )"); + EXPECT_EQ(getGeneralRegister(28), 4 << 12); + EXPECT_EQ(getGeneralRegister(29), -4ull << 12); + + EXPECT_GROUP_COMP(R"(c.lui t4, 0xFFFFC)", INT_SIMPLE_ARTH); +} + +TEST_P(InstCompressed, addi) { + // Compressed add immediate + RUN_RISCV_COMP(R"( + c.addi t3, 3 + c.addi t4, 6 + c.addi t3, 30 + c.addi zero, 16 + )"); + EXPECT_EQ(getGeneralRegister(29), 6u); + EXPECT_EQ(getGeneralRegister(28), 33u); + EXPECT_EQ(getGeneralRegister(0), 0); + + EXPECT_GROUP_COMP(R"(c.addi zero, 16)", INT_SIMPLE_ARTH); +} + +TEST_P(InstCompressed, addiw) { + // Compressed add immediate. 
Produces 32 bit result and sign extends + RUN_RISCV_COMP(R"( + addi t3, t3, 91 + slli t3, t3, 28 + addiw t5, t3, -5 + addiw t6, t2, -5 + )"); + EXPECT_EQ(getGeneralRegister(28), 24427626496); + EXPECT_EQ(getGeneralRegister(30), -1342177285); + EXPECT_EQ(getGeneralRegister(31), -5); + + EXPECT_GROUP_COMP(R"(addiw t6, t2, -5)", INT_SIMPLE_ARTH); +} + +TEST_P(InstCompressed, addi16sp) { + // Add immediate (multiple of 16) to stack pointer + RUN_RISCV_COMP(R"( + mv x8, sp + c.addi16sp x2, 16 + mv x9, x2 + )"); + EXPECT_EQ(getGeneralRegister(8), + process_->getInitialStackPointer()); + EXPECT_EQ(getGeneralRegister(9), + process_->getInitialStackPointer() + 16); + + EXPECT_GROUP_COMP(R"(mv x9, x2)", INT_SIMPLE_ARTH); +} + +TEST_P(InstCompressed, addi4spn) { + // Add immediate to stack pointer + RUN_RISCV_COMP(R"( + c.addi4spn x8, x2, 4 + c.addi4spn x9, x2, 12 + )"); + EXPECT_EQ(getGeneralRegister(8), + process_->getInitialStackPointer() + 4); + EXPECT_EQ(getGeneralRegister(9), + process_->getInitialStackPointer() + 12); + + EXPECT_GROUP_COMP(R"(c.addi4spn x9, x2, 12)", INT_SIMPLE_ARTH); +} + +TEST_P(InstCompressed, slli) { + // Compressed shift left logical by immediate. rs1 = rd + RUN_RISCV_COMP(R"( + addi t4, t4, 6 + c.slli t4, 5 + )"); + EXPECT_EQ(getGeneralRegister(29), 192); + + EXPECT_GROUP_COMP(R"(c.slli t4, 5)", INT_SIMPLE_SHIFT); +} + +TEST_P(InstCompressed, srli) { + // Compressed shift right logical by immediate. rs1 = rd + RUN_RISCV_COMP(R"( + addi x8, x8, -4 + c.srli x8, 61 + )"); + EXPECT_EQ(getGeneralRegister(8), 7); + + EXPECT_GROUP_COMP(R"(c.srli x8, 61)", INT_SIMPLE_SHIFT); +} + +TEST_P(InstCompressed, srai) { + // Compressed shift right arithmetic by immediate. rs1 = rd + RUN_RISCV_COMP(R"( + addi x8, x8, -4 + add t0, t0, x8 + c.srai x8, 1 + addi x9, t0, 8 + c.srai x9, 1 + )"); + EXPECT_EQ(getGeneralRegister(8), -2); + EXPECT_EQ(getGeneralRegister(9), 2); + + EXPECT_GROUP_COMP(R"(c.srai x9, 1)", INT_SIMPLE_SHIFT); +} + +TEST_P(InstCompressed, andi) { + // Compressed AND with sign extended immediate. rs1 = rd + RUN_RISCV_COMP(R"( + addi x9, x9, 3 + addi t4, t4, 5 + and x8, x9, t4 + c.andi x8, 9 + c.andi x9, -7 + )"); + EXPECT_EQ(getGeneralRegister(8), 0b0001); + EXPECT_EQ(getGeneralRegister(9), 1); + + EXPECT_GROUP_COMP(R"(c.andi x9, -7)", INT_SIMPLE_LOGICAL); +} + +TEST_P(InstCompressed, mv) { + // Compressed move + RUN_RISCV_COMP(R"( + addi x8, x8, 3 + addi x9, x9, 6 + c.mv x8, x9 + )"); + EXPECT_EQ(getGeneralRegister(8), 6u); + EXPECT_EQ(getGeneralRegister(9), 6u); + + EXPECT_GROUP_COMP(R"(c.mv x8, x9)", INT_SIMPLE_ARTH); +} + +TEST_P(InstCompressed, add) { + // Compressed add. rs1 = rd + RUN_RISCV_COMP(R"( + addi x8, x8, 3 + addi x9, x9, 6 + c.add x8, x9 + )"); + EXPECT_EQ(getGeneralRegister(8), 9u); + EXPECT_EQ(getGeneralRegister(9), 6u); + + EXPECT_GROUP_COMP(R"(c.add x8, x9)", INT_SIMPLE_ARTH); +} + +TEST_P(InstCompressed, and) { + // Compressed AND. rs1 = rd + RUN_RISCV_COMP(R"( + addi x8, x8, 3 + addi x9, x9, 5 + c.and x8, x9 + )"); + EXPECT_EQ(getGeneralRegister(8), 0b0001); + + EXPECT_GROUP_COMP(R"(c.and x8, x9)", INT_SIMPLE_LOGICAL); +} + +TEST_P(InstCompressed, or) { + // Compressed OR. rs1 = rd + RUN_RISCV_COMP(R"( + addi x8, x8, 3 + addi x9, x9, 5 + c.or x8, x9 + )"); + EXPECT_EQ(getGeneralRegister(8), 0b0111); + + EXPECT_GROUP_COMP(R"(c.or x8, x9)", INT_SIMPLE_LOGICAL); +} + +TEST_P(InstCompressed, xor) { + // Compressed XOR. 
rs1 = rd + RUN_RISCV_COMP(R"( + addi x8, x8, 3 + addi x9, x9, 5 + c.xor x8, x9 + )"); + EXPECT_EQ(getGeneralRegister(8), 0b0110); + + EXPECT_GROUP_COMP(R"(c.xor x8, x9)", INT_SIMPLE_LOGICAL); +} + +TEST_P(InstCompressed, sub) { + // Compressed subtract. rs1 = rd + RUN_RISCV_COMP(R"( + addi x8, x8, 3 + addi x9, x9, 6 + mv x10, x8 + c.sub x8, x9 + c.sub x9, x10 + )"); + EXPECT_EQ(getGeneralRegister(8), -3); + EXPECT_EQ(getGeneralRegister(9), 3); + + EXPECT_GROUP_COMP(R"(c.sub x9, x10)", INT_SIMPLE_ARTH); +} + +TEST_P(InstCompressed, addw) { + // Compressed add word. Adds rd and rs2 then sign extends lower 32 bits. rs1 = + // rd + RUN_RISCV_COMP(R"( + addi x9, x9, -7 + addi x8, x8, 3 + mv x11, x8 + addi x10, x10, 6 + c.addw x8, x10 + c.addw x9, x11 + )"); + EXPECT_EQ(getGeneralRegister(8), 9u); + EXPECT_EQ(getGeneralRegister(9), -4); + + EXPECT_GROUP_COMP(R"(c.addw x9, x11)", INT_SIMPLE_ARTH); +} + +TEST_P(InstCompressed, subw) { + // Compressed subtract word. Subtracts rs2 from rd then sign extends lower 32 + // bits. rs1 = rd + RUN_RISCV_COMP(R"( + addi x9, x9, 3 + addi x10, x10, 6 + mv x11, x10 + mv x12, x9 + c.subw x9, x10 + c.subw x10, x12 + + li x12, -1 + addi x11, x11, -8 + c.subw x12, x11 + )"); + EXPECT_EQ(getGeneralRegister(9), -3); + EXPECT_EQ(getGeneralRegister(10), 3); + + EXPECT_EQ(getGeneralRegister(11), -2); + EXPECT_EQ(getGeneralRegister(12), 0x0000000000000001); + + EXPECT_GROUP_COMP(R"(c.subw x12, x11)", INT_SIMPLE_ARTH); +} + +TEST_P(InstCompressed, nop) { + // Ensure that a nop doesn't change the state of the processor + // Load a register and check initial architectural state + RUN_RISCV_COMP(R"( + li x8, 1234 + )"); + EXPECT_EQ(getGeneralRegister(0), 0); + EXPECT_EQ(getGeneralRegister(1), 0); + EXPECT_EQ(getGeneralRegister(2), + process_->getInitialStackPointer()); + EXPECT_EQ(getGeneralRegister(3), 0); + EXPECT_EQ(getGeneralRegister(4), 0); + EXPECT_EQ(getGeneralRegister(5), 0); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); + EXPECT_EQ(getGeneralRegister(8), 1234); + EXPECT_EQ(getGeneralRegister(9), 0); + EXPECT_EQ(getGeneralRegister(10), 0); + EXPECT_EQ(getGeneralRegister(11), 0); + EXPECT_EQ(getGeneralRegister(12), 0); + EXPECT_EQ(getGeneralRegister(13), 0); + EXPECT_EQ(getGeneralRegister(14), 0); + EXPECT_EQ(getGeneralRegister(15), 0); + EXPECT_EQ(getGeneralRegister(16), 0); + EXPECT_EQ(getGeneralRegister(17), 0); + EXPECT_EQ(getGeneralRegister(18), 0); + EXPECT_EQ(getGeneralRegister(19), 0); + EXPECT_EQ(getGeneralRegister(20), 0); + EXPECT_EQ(getGeneralRegister(21), 0); + EXPECT_EQ(getGeneralRegister(22), 0); + EXPECT_EQ(getGeneralRegister(23), 0); + EXPECT_EQ(getGeneralRegister(24), 0); + EXPECT_EQ(getGeneralRegister(25), 0); + EXPECT_EQ(getGeneralRegister(26), 0); + EXPECT_EQ(getGeneralRegister(27), 0); + EXPECT_EQ(getGeneralRegister(28), 0); + EXPECT_EQ(getGeneralRegister(29), 0); + EXPECT_EQ(getGeneralRegister(30), 0); + EXPECT_EQ(getGeneralRegister(31), 0); + EXPECT_EQ(numTicks_, 2); // 1 insn + 1 for unimplemented final insn + + numTicks_ = 0; + + // Run some no operations + RUN_RISCV_COMP(R"( + c.nop + c.nop + c.nop + c.nop + c.nop + )"); + + // Ensure state hasn't changed except the number of ticks + EXPECT_EQ(getGeneralRegister(0), 0); + EXPECT_EQ(getGeneralRegister(1), 0); + EXPECT_EQ(getGeneralRegister(2), + process_->getInitialStackPointer()); + EXPECT_EQ(getGeneralRegister(3), 0); + EXPECT_EQ(getGeneralRegister(4), 0); + EXPECT_EQ(getGeneralRegister(5), 0); + EXPECT_EQ(getGeneralRegister(6), 0); + 
EXPECT_EQ(getGeneralRegister(7), 0); + EXPECT_EQ(getGeneralRegister(8), 0); + EXPECT_EQ(getGeneralRegister(9), 0); + EXPECT_EQ(getGeneralRegister(10), 0); + EXPECT_EQ(getGeneralRegister(11), 0); + EXPECT_EQ(getGeneralRegister(12), 0); + EXPECT_EQ(getGeneralRegister(13), 0); + EXPECT_EQ(getGeneralRegister(14), 0); + EXPECT_EQ(getGeneralRegister(15), 0); + EXPECT_EQ(getGeneralRegister(16), 0); + EXPECT_EQ(getGeneralRegister(17), 0); + EXPECT_EQ(getGeneralRegister(18), 0); + EXPECT_EQ(getGeneralRegister(19), 0); + EXPECT_EQ(getGeneralRegister(20), 0); + EXPECT_EQ(getGeneralRegister(21), 0); + EXPECT_EQ(getGeneralRegister(22), 0); + EXPECT_EQ(getGeneralRegister(23), 0); + EXPECT_EQ(getGeneralRegister(24), 0); + EXPECT_EQ(getGeneralRegister(25), 0); + EXPECT_EQ(getGeneralRegister(26), 0); + EXPECT_EQ(getGeneralRegister(27), 0); + EXPECT_EQ(getGeneralRegister(28), 0); + EXPECT_EQ(getGeneralRegister(29), 0); + EXPECT_EQ(getGeneralRegister(30), 0); + EXPECT_EQ(getGeneralRegister(31), 0); + EXPECT_EQ(numTicks_, 6); // 5 insns + 1 for unimplemented final insn + + EXPECT_GROUP_COMP(R"(c.nop)", INT_SIMPLE_ARTH); +} + +TEST_P(InstCompressed, ebreak) { + // Currently not implemented so ensure this produces an exception + + RUN_RISCV_COMP(R"( + c.ebreak + )"); + + const char err1[] = + "\n[SimEng:ExceptionHandler] Encountered execution not-yet-implemented " + "exception\n[SimEng:ExceptionHandler] Generated by instruction: " + "\n[SimEng:ExceptionHandler] 0x0000000000000000: 02 90 c.ebreak"; + EXPECT_EQ(stdout_.substr(0, sizeof(err1) - 1), err1); + + EXPECT_GROUP_COMP(R"(c.ebreak)", INT_SIMPLE_ARTH); +} + +INSTANTIATE_TEST_SUITE_P( + RISCV, InstCompressed, + ::testing::Values(std::make_tuple(EMULATION, "{Core: {Compressed: True}}")), + paramToString); + +} // namespace diff --git a/test/regression/riscv/instructions/float.cc b/test/regression/riscv/instructions/float.cc new file mode 100644 index 0000000000..d5c910d53b --- /dev/null +++ b/test/regression/riscv/instructions/float.cc @@ -0,0 +1,3003 @@ +#include "RISCVRegressionTest.hh" + +namespace { + +using InstFloat = RISCVRegressionTest; +using namespace simeng::arch::riscv::InstructionGroups; + +static constexpr uint64_t boxedPositiveNan = 0xffffffff7fc00000; + +// All test verified with qemu + +TEST_P(InstFloat, FLD) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 1.0; + heap[1] = 123.456; + heap[2] = -0.00032; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld ft0, 0(a0) + fld ft1, 8(a0) + fld ft2, 16(a0) + fld ft3, 24(a0) + )"); + + EXPECT_EQ(getFPRegister(0), 1.0); + EXPECT_EQ(getFPRegister(1), 123.456); + EXPECT_EQ(getFPRegister(2), -0.00032); + EXPECT_EQ(getFPRegister(3), 123456); + + EXPECT_GROUP(R"(fld ft3, 24(a0))", LOAD_FLOAT); +} + +TEST_P(InstFloat, FLW) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 1.0; + heap[1] = 123.456; + heap[2] = -0.00032; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw ft0, 0(a0) + flw ft1, 4(a0) + flw ft2, 8(a0) + flw ft3, 12(a0) + )"); + + // Check bit values to avoid discrepancies with rounding + + EXPECT_EQ(getFPRegister(0), 0x3f800000); + EXPECT_EQ(getFPRegister(1), 0x42f6e979); + EXPECT_EQ(getFPRegister(2), 0xb9a7c5ac); + EXPECT_EQ(getFPRegister(3), 0x47f12000); + + EXPECT_EQ(getFPRegister(0), (float)1.0); + EXPECT_EQ(getFPRegister(1), (float)123.456); + EXPECT_EQ(getFPRegister(2), (float)-0.00032); + 
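The bit-pattern checks that follow rely on the RISC-V NaN-boxing convention: a single-precision value held in a 64-bit FP register is only valid when its upper 32 bits are all ones, and an incorrectly boxed input is treated as the canonical quiet NaN (0x7fc00000). A minimal sketch of that rule, using illustrative helper names rather than SimEng's own API:

```cpp
#include <cstdint>
#include <cstring>

// Box a binary32 value into a 64-bit FP register image (upper 32 bits set).
uint64_t nanBox(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return 0xffffffff00000000ull | bits;
}

// Unbox a register image; an incorrectly boxed value yields the canonical qNaN.
uint32_t unboxOrCanonicalNan(uint64_t reg) {
  if ((reg >> 32) != 0xffffffffull) return 0x7fc00000u;  // canonical quiet NaN
  return static_cast<uint32_t>(reg);
}
```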
EXPECT_EQ(getFPRegister(3), (float)123456); + + // Check bit values as NaNs comparison results in false even if equivalent + + EXPECT_EQ(getFPRegister(0), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(1), 0xffffffff42f6e979); + EXPECT_EQ(getFPRegister(2), 0xffffffffb9a7c5ac); + EXPECT_EQ(getFPRegister(3), 0xffffffff47f12000); + + EXPECT_GROUP(R"( flw ft3, 12(a0))", LOAD_FLOAT); +} + +TEST_P(InstFloat, FSD) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 1.0; + heap[1] = 123.456; + heap[2] = -0.00032; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld ft0, 0(a0) + fld ft1, 8(a0) + fld ft2, 16(a0) + fld ft3, 24(a0) + + fsd ft3, 0(a0) + fsd ft2, 8(a0) + fsd ft1, 16(a0) + fsd ft0, 24(a0) + )"); + + EXPECT_EQ(getFPRegister(0), 1.0); + EXPECT_EQ(getFPRegister(1), 123.456); + EXPECT_EQ(getFPRegister(2), -0.00032); + EXPECT_EQ(getFPRegister(3), 123456); + + EXPECT_EQ(getGeneralRegister(10), 64); + + EXPECT_EQ(getMemoryValue(64), 123456); + EXPECT_EQ(getMemoryValue(72), -0.00032); + EXPECT_EQ(getMemoryValue(80), 123.456); + EXPECT_EQ(getMemoryValue(88), 1.0); + + EXPECT_GROUP(R"(fsd ft0, 24(a0))", STORE_FLOAT); +} + +TEST_P(InstFloat, FSW) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 1.0; + heap[1] = 123.456; + heap[2] = -0.00032; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld ft0, 0(a0) + fld ft1, 4(a0) + flw ft2, 8(a0) + flw ft3, 12(a0) + + fsw ft3, 0(a0) + fsw ft2, 4(a0) + fsw ft1, 8(a0) + fsw ft0, 12(a0) + )"); + + EXPECT_EQ(getFPRegister(0), 0x42f6e9793f800000); + EXPECT_EQ(getFPRegister(1), 0xb9a7c5ac42f6e979); + EXPECT_EQ(getFPRegister(2), 0xffffffffb9a7c5ac); + EXPECT_EQ(getFPRegister(3), 0xffffffff47f12000); + EXPECT_EQ(getFPRegister(2), (float)-0.00032); + EXPECT_EQ(getFPRegister(3), (float)123456); + + EXPECT_EQ(getGeneralRegister(10), 64); + + EXPECT_EQ(getMemoryValue(64), (float)123456); + EXPECT_EQ(getMemoryValue(68), (float)-0.00032); + EXPECT_EQ(getMemoryValue(72), (float)123.456); + EXPECT_EQ(getMemoryValue(76), (float)1.0); + + EXPECT_GROUP(R"(fsw ft0, 12(a0))", STORE_FLOAT); +} + +TEST_P(InstFloat, FDIV_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + + fdiv.d fa6, fa5, fa3 + fdiv.d ft0, fa5, fa4 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(15), (double)999.212341); + EXPECT_EQ(getFPRegister(16), (double)999.212341 / (double)4.52432537); + EXPECT_EQ(getFPRegister(0), (double)999.212341 / (double)-3.78900003); + + EXPECT_GROUP(R"(fdiv.d ft0, fa5, fa4)", FLOAT_DIV_OR_SQRT); +} + +TEST_P(InstFloat, FDIV_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + + fdiv.s fa6, fa5, fa3 + fdiv.s ft0, fa5, fa4 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(15), (float)999.212341); + EXPECT_EQ(getFPRegister(16), 
(float)999.212341 / (float)4.52432537); + EXPECT_EQ(getFPRegister(0), (float)999.212341 / (float)-3.78900003); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffff3f800000; // Correct NaN boxing of no. 1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fdiv.s fa5, fa3, fa3 # 1 / 1 = 1 + fdiv.s fa6, fa4, fa3 # Incorrect NaN box should be caught by fdiv and + # canonical NaN used as input, NaN / 1 = NaN + )"); + + EXPECT_EQ(getFPRegister(13), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getFPRegister(15), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + + EXPECT_GROUP(R"(fdiv.s ft0, fa5, fa4)", FLOAT_DIV_OR_SQRT); +} + +TEST_P(InstFloat, FMUL_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + + fmul.d fa6, fa5, fa3 + fmul.d ft0, fa5, fa4 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(15), (double)999.212341); + EXPECT_EQ(getFPRegister(16), (double)999.212341 * (double)4.52432537); + EXPECT_EQ(getFPRegister(0), (double)999.212341 * (double)-3.78900003); + + EXPECT_GROUP(R"(fmul.d ft0, fa5, fa4)", FLOAT_MUL); +} + +TEST_P(InstFloat, FMUL_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + + fmul.s fa6, fa5, fa3 + fmul.s ft0, fa5, fa4 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(15), (float)999.212341); + EXPECT_EQ(getFPRegister(16), (float)999.212341 * (float)4.52432537); + EXPECT_EQ(getFPRegister(0), (float)999.212341 * (float)-3.78900003); + EXPECT_EQ(getFPRegister(0), 0xFFFFFFFFC56CA040); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffff3f800000; // Correct NaN boxing of no. 1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 
1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fmul.s fa5, fa3, fa3 # 1 * 1 = 1 + fmul.s fa6, fa4, fa3 # Incorrect NaN box should be caught by fmul and + # canonical NaN used as input, NaN * 1 = NaN + )"); + + EXPECT_EQ(getFPRegister(13), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getFPRegister(15), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + + EXPECT_GROUP(R"(fmul.s ft0, fa5, fa4)", FLOAT_MUL); +} + +TEST_P(InstFloat, FCVT_D_L) { + RUN_RISCV(R"( + li t0, 123 + li t1, -1 + + fcvt.d.l ft0, t0 + fcvt.d.l ft1, t1 + )"); + + EXPECT_EQ(getGeneralRegister(5), 123); + EXPECT_EQ(getGeneralRegister(6), -1); + + EXPECT_EQ(getFPRegister(0), (double)123); + EXPECT_EQ(getFPRegister(0), 0x405EC00000000000); + EXPECT_EQ(getFPRegister(1), (double)-1); + EXPECT_EQ(getFPRegister(1), 0xBFF0000000000000); + + EXPECT_GROUP(R"(fcvt.d.l ft1, t1)", FLOAT_SIMPLE_CVT); +} + +TEST_P(InstFloat, FCVT_D_W) { + RUN_RISCV(R"( + li t0, 23456 + li t1, -1 + li t2, 0xFFFFFFFF0FFFFFFF + + fcvt.d.w ft0, t0 + fcvt.d.w ft1, t1 + fcvt.d.w ft2, t2 + )"); + + EXPECT_EQ(getGeneralRegister(5), 23456); + EXPECT_EQ(getGeneralRegister(6), -1); + EXPECT_EQ(getGeneralRegister(7), -4026531841); + + EXPECT_EQ(getFPRegister(0), (double)23456); + EXPECT_EQ(getFPRegister(0), 0x40D6E80000000000); + EXPECT_EQ(getFPRegister(1), (double)-1); + EXPECT_EQ(getFPRegister(1), 0xBFF0000000000000); + EXPECT_EQ(getFPRegister(2), (double)268435455); + EXPECT_EQ(getFPRegister(2), 0x41AFFFFFFE000000); + + EXPECT_GROUP(R"(fcvt.d.w ft1, t1)", FLOAT_SIMPLE_CVT); +} + +TEST_P(InstFloat, FCVT_S_L) { + RUN_RISCV(R"( + li t0, 23456 + li t1, -1 + li t2, 0xFFFFFFFF0FFFFFFF + + fcvt.s.l ft0, t0 + fcvt.s.l ft1, t1 + fcvt.s.l ft2, t2 + )"); + + EXPECT_EQ(getGeneralRegister(5), 23456); + EXPECT_EQ(getGeneralRegister(6), -1); + EXPECT_EQ(getGeneralRegister(7), -4026531841); + + EXPECT_EQ(getFPRegister(0), (float)23456); + EXPECT_EQ(getFPRegister(0), 0xFFFFFFFF46b74000); + EXPECT_EQ(getFPRegister(1), (float)-1); + EXPECT_EQ(getFPRegister(1), 0xFFFFFFFFbf800000); + EXPECT_EQ(getFPRegister(2), (float)-4026531841); + EXPECT_EQ(getFPRegister(2), 0xFFFFFFFFCF700000); + + EXPECT_GROUP(R"(fcvt.s.l ft1, t1)", FLOAT_SIMPLE_CVT); +} + +TEST_P(InstFloat, FCVT_S_W) { + RUN_RISCV(R"( + li t0, 23456 + li t1, -1 + li t2, 0xFFFFFFFF0FFFFFFF + + fcvt.s.w ft0, t0 + fcvt.s.w ft1, t1 + fcvt.s.w ft2, t2 + )"); + + EXPECT_EQ(getGeneralRegister(5), 23456); + EXPECT_EQ(getGeneralRegister(6), -1); + EXPECT_EQ(getGeneralRegister(7), -4026531841); + + EXPECT_EQ(getFPRegister(0), (float)23456); + EXPECT_EQ(getFPRegister(0), 0xFFFFFFFF46b74000); + EXPECT_EQ(getFPRegister(1), (float)-1); + EXPECT_EQ(getFPRegister(1), 0xFFFFFFFFbf800000); + EXPECT_EQ(getFPRegister(2), (float)268435455); + EXPECT_EQ(getFPRegister(2), 0xFFFFFFFF4d800000); + + EXPECT_GROUP(R"(fcvt.s.w ft1, t1)", FLOAT_SIMPLE_CVT); +} + +TEST_P(InstFloat, FCVT_W_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + # Set rounding mode to nearest ties to even + li a1, 0 + fsrm a1 + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + fld fa6, 24(a0) + + fcvt.w.d t0, fa3 # should convert to 5 + fcvt.w.d t3, fa3, rtz # should convert to 4 + fcvt.w.d t1, fa4 # should convert to -4 + fcvt.w.d 
t4, fa4, rtz # should convert to -3 + fcvt.w.d t2, fa6 #Nan converts to 0x7fffffff in integer reg + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(16), 0x7FF8000000000000); + + EXPECT_EQ(getGeneralRegister(5), 0x5); + EXPECT_EQ(getGeneralRegister(28), 0x4); + EXPECT_EQ(getGeneralRegister(6), 0xFFFFFFFFFFFFFFFC); + EXPECT_EQ(getGeneralRegister(29), 0xFFFFFFFFFFFFFFFD); + EXPECT_EQ(getGeneralRegister(7), 0x000000007FFFFFFF); + + EXPECT_GROUP(R"(fcvt.w.d t0, fa3)", FLOAT_SIMPLE_CVT); +} + +TEST_P(InstFloat, FCVT_W_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + flw fa6, 12(a0) + + fcvt.w.s t0, fa3 # should convert to 5 + fcvt.w.s t3, fa3, rtz # should convert to 4 + fcvt.w.s t1, fa4 # should convert to -4 + fcvt.w.s t4, fa4, rtz # should convert to -3 + fcvt.w.s t2, fa6 #Nan converts to 0x7fffffff in integer reg + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + + EXPECT_EQ(getGeneralRegister(5), 0x5); + EXPECT_EQ(getGeneralRegister(28), 0x4); + EXPECT_EQ(getGeneralRegister(6), 0xFFFFFFFFFFFFFFFC); + EXPECT_EQ(getGeneralRegister(29), 0xFFFFFFFFFFFFFFFD); + EXPECT_EQ(getGeneralRegister(7), 0x000000007FFFFFFF); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffff3f800000; // Correct NaN boxing of no. 1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 
1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fcvt.w.s t0, fa3 # fcvt 1 = 1 + fcvt.w.s t1, fa4 # Incorrect NaN box should be caught by fcvt and + # canonical NaN used as input, fcvt NaN = 2^31 − 1 +)"); + + EXPECT_EQ(getFPRegister(13), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getGeneralRegister(5), 0x0000000000000001); + EXPECT_EQ(getGeneralRegister(6), + pow(2, 31) - 1); // Expected result from spec + EXPECT_EQ(getGeneralRegister(6), 0x000000007fffffff); + + EXPECT_GROUP(R"(fcvt.w.s t0, fa3)", FLOAT_SIMPLE_CVT); +} + +TEST_P(InstFloat, FCVT_L_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + fld fa6, 24(a0) + + fcvt.l.d t0, fa3 # should convert to 5 + fcvt.l.d t3, fa3, rtz # should convert to 4 + fcvt.l.d t1, fa4 # should convert to -4 + fcvt.l.d t4, fa4, rtz # should convert to -3 + fcvt.l.d t2, fa6 #Nan converts to 0x7fffffff in integer reg + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(16), 0x7FF8000000000000); + + EXPECT_EQ(getGeneralRegister(5), 0x5); + EXPECT_EQ(getGeneralRegister(28), 0x4); + EXPECT_EQ(getGeneralRegister(6), 0xFFFFFFFFFFFFFFFC); + EXPECT_EQ(getGeneralRegister(29), 0xFFFFFFFFFFFFFFFD); + EXPECT_EQ(getGeneralRegister(7), 0x7FFFFFFFFFFFFFFF); + + EXPECT_GROUP(R"(fcvt.l.d t0, fa3)", FLOAT_SIMPLE_CVT); +} + +TEST_P(InstFloat, FCVT_L_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + flw fa6, 12(a0) + + fcvt.l.s t0, fa3 # should convert to 5 + fcvt.l.s t3, fa3, rtz # should convert to 4 + fcvt.l.s t1, fa4 # should convert to -4 + fcvt.l.s t4, fa4, rtz # should convert to -3 + fcvt.l.s t2, fa6 #Nan converts to 0x7fffffff in integer reg + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + + EXPECT_EQ(getGeneralRegister(5), 0x5); + EXPECT_EQ(getGeneralRegister(28), 0x4); + EXPECT_EQ(getGeneralRegister(6), 0xFFFFFFFFFFFFFFFC); + EXPECT_EQ(getGeneralRegister(29), 0xFFFFFFFFFFFFFFFD); + EXPECT_EQ(getGeneralRegister(7), 0x7FFFFFFFFFFFFFFF); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffff3f800000; // Correct NaN boxing of no. 1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 
1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fcvt.l.s t0, fa3 # fcvt 1 = 1 + fcvt.l.s t1, fa4 # Incorrect NaN box should be caught by fcvt and + # canonical NaN used as input, fcvt NaN = 2^31 − 1 +)"); + + EXPECT_EQ(getFPRegister(13), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getGeneralRegister(5), 0x0000000000000001); + EXPECT_EQ(getGeneralRegister(6), + (uint64_t)pow(2, 63) - 1); // Expected result from spec + + EXPECT_GROUP(R"(fcvt.l.s t0, fa3)", FLOAT_SIMPLE_CVT); +} + +TEST_P(InstFloat, FCVT_LU_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 1.8446744073709552e+19; // 2^64 - 1 + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + fld fa6, 24(a0) + + fcvt.lu.d t0, fa3 # should convert to 5 + fcvt.lu.d t3, fa3, rtz # should convert to 4 + fcvt.lu.d t1, fa4 # should convert to 0 + fcvt.lu.d t4, fa4, rtz # should convert to 0 + fcvt.lu.d t2, fa6 #Nan converts to 0x7fffffff in integer reg + fcvt.lu.d t5, fa5 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(15), 0x43F0000000000000); + EXPECT_EQ(getFPRegister(16), 0x7FF8000000000000); + + EXPECT_EQ(getGeneralRegister(5), 0x5); + EXPECT_EQ(getGeneralRegister(28), 0x4); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(29), 0); + EXPECT_EQ(getGeneralRegister(7), 0xFFFFFFFFFFFFFFFF); + EXPECT_EQ(getGeneralRegister(30), 0xFFFFFFFFFFFFFFFF); + + EXPECT_GROUP(R"(fcvt.lu.d t0, fa3)", FLOAT_SIMPLE_CVT); +} + +TEST_P(InstFloat, FCVT_WU_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 1.8446744073709552e+19; // 2^64 - 1 + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + fld fa6, 24(a0) + + fcvt.wu.d t0, fa3 # should convert to 5 + fcvt.wu.d t3, fa3, rtz # should convert to 4 + fcvt.wu.d t1, fa4 # should convert to 0 + fcvt.wu.d t4, fa4, rtz # should convert to 0 + fcvt.wu.d t2, fa6 #Nan converts to 0xffffffffffffffff in integer reg + fcvt.wu.d t5, fa5 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(15), 0x43F0000000000000); + EXPECT_EQ(getFPRegister(16), 0x7FF8000000000000); + + EXPECT_EQ(getGeneralRegister(5), 0x5); + EXPECT_EQ(getGeneralRegister(28), 0x4); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(29), 0); + EXPECT_EQ(getGeneralRegister(7), 0xFFFFFFFFFFFFFFFF); + EXPECT_EQ(getGeneralRegister(30), 0xFFFFFFFFFFFFFFFF); + + EXPECT_GROUP(R"(fcvt.wu.d t0, fa3)", FLOAT_SIMPLE_CVT); +} + +TEST_P(InstFloat, FCVT_LU_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 1.8446744073709552e+19; // 2^64 - 1 + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + flw fa6, 12(a0) + + fcvt.lu.s t0, fa3 # should convert to 5 + fcvt.lu.s t3, fa3, rtz # should convert to 4 + fcvt.lu.s t1, fa4 # should convert to 0 + fcvt.lu.s t4, fa4, rtz # 
should convert to 0 + fcvt.lu.s t2, fa6 #Nan converts to 0x7fffffff in integer reg + fcvt.lu.s t5, fa5 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(15), 0xFFFFFFFF5F800000); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + + EXPECT_EQ(getGeneralRegister(5), 0x5); + EXPECT_EQ(getGeneralRegister(28), 0x4); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(29), 0); + EXPECT_EQ(getGeneralRegister(7), 0xFFFFFFFFFFFFFFFF); + EXPECT_EQ(getGeneralRegister(30), 0xFFFFFFFFFFFFFFFF); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffff3f800000; // Correct NaN boxing of no. 1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fcvt.lu.s t0, fa3 # fcvt 1 = 1 + fcvt.lu.s t1, fa4 # Incorrect NaN box should be caught by fcvt and + # canonical NaN used as input, fcvt NaN = 2^32 − 1 +)"); + + EXPECT_EQ(getFPRegister(13), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getGeneralRegister(5), 0x0000000000000001); + EXPECT_EQ(getGeneralRegister(6), + 0xFFFFFFFFFFFFFFFF); // Expected result from spec + + EXPECT_GROUP(R"(fcvt.lu.s t0, fa3)", FLOAT_SIMPLE_CVT); +} + +TEST_P(InstFloat, FCVT_WU_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 1.8446744073709552e+19; // 2^64 - 1 + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + flw fa6, 12(a0) + + fcvt.wu.s t0, fa3 # should convert to 5 + fcvt.wu.s t3, fa3, rtz # should convert to 4 + fcvt.wu.s t1, fa4 # should convert to 0 + fcvt.wu.s t4, fa4, rtz # should convert to 0 + fcvt.wu.s t2, fa6 #Nan converts to 0x7fffffff in integer reg + fcvt.wu.s t5, fa5 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(15), 0xFFFFFFFF5F800000); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + + EXPECT_EQ(getGeneralRegister(5), 0x5); + EXPECT_EQ(getGeneralRegister(28), 0x4); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(29), 0); + EXPECT_EQ(getGeneralRegister(7), 0xFFFFFFFFFFFFFFFF); + EXPECT_EQ(getGeneralRegister(30), 0xFFFFFFFFFFFFFFFF); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffff3f800000; // Correct NaN boxing of no. 1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 
1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fcvt.wu.s t0, fa3 # fcvt 1 = 1 + fcvt.wu.s t1, fa4 # Incorrect NaN box should be caught by fcvt and + # canonical NaN used as input, fcvt NaN = 2^32 − 1 +)"); + + EXPECT_EQ(getFPRegister(13), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getGeneralRegister(5), 0x0000000000000001); + EXPECT_EQ(getGeneralRegister(6), + pow(2, 32) - 1); // Expected result from spec + EXPECT_EQ(getGeneralRegister(6), 0xFFFFFFFFFFFFFFFF); + + EXPECT_GROUP(R"(fcvt.wu.s t0, fa3)", FLOAT_SIMPLE_CVT); +} + +TEST_P(InstFloat, FCVT_D_WU) { + RUN_RISCV(R"( + li t0, 23456 + li t1, -1 + li t2, 0xFFFFFFFF0FFFFFFF + + fcvt.d.wu ft0, t0 + fcvt.d.wu ft1, t1 + fcvt.d.wu ft2, t2 + )"); + + EXPECT_EQ(getGeneralRegister(5), 23456); + EXPECT_EQ(getGeneralRegister(6), -1); + EXPECT_EQ(getGeneralRegister(7), -4026531841); + + EXPECT_EQ(getFPRegister(0), (double)23456); + EXPECT_EQ(getFPRegister(0), 0x40D6E80000000000); + EXPECT_EQ(getFPRegister(1), (double)4294967295); + EXPECT_EQ(getFPRegister(1), 0x41EFFFFFFFE00000); + EXPECT_EQ(getFPRegister(2), (double)268435455); + EXPECT_EQ(getFPRegister(2), 0x41AFFFFFFE000000); + + EXPECT_GROUP(R"(fcvt.d.wu ft0, t0)", FLOAT_SIMPLE_CVT); +} + +TEST_P(InstFloat, FCVT_S_WU) { + RUN_RISCV(R"( + li t0, 23456 + li t1, -1 + li t2, 0xFFFFFFFF0FFFFFFF + + fcvt.s.wu ft0, t0 + fcvt.s.wu ft1, t1 + fcvt.s.wu ft2, t2 + )"); + + EXPECT_EQ(getGeneralRegister(5), 23456); + EXPECT_EQ(getGeneralRegister(6), -1); + EXPECT_EQ(getGeneralRegister(7), -4026531841); + + EXPECT_EQ(getFPRegister(0), (float)23456); + EXPECT_EQ(getFPRegister(0), 0xFFFFFFFF46b74000); + EXPECT_EQ(getFPRegister(1), (float)4294967295); + EXPECT_EQ(getFPRegister(1), 0xFFFFFFFF4F800000); + EXPECT_EQ(getFPRegister(2), (float)268435456); + EXPECT_EQ(getFPRegister(2), 0xFFFFFFFF4D800000); + + EXPECT_GROUP(R"(fcvt.s.wu ft0, t0)", FLOAT_SIMPLE_CVT); +} + +TEST_P(InstFloat, FCVT_D_LU) { + RUN_RISCV(R"( + li t0, 23456 + li t1, -1 + li t2, 0xFFFFFFFF0FFFFFFF + + fcvt.d.lu ft0, t0 + fcvt.d.lu ft1, t1 + fcvt.d.lu ft2, t2 + )"); + + EXPECT_EQ(getGeneralRegister(5), 23456); + EXPECT_EQ(getGeneralRegister(6), -1); + EXPECT_EQ(getGeneralRegister(7), -4026531841); + + EXPECT_EQ(getFPRegister(0), (double)23456); + EXPECT_EQ(getFPRegister(0), 0x40D6E80000000000); + EXPECT_EQ(getFPRegister(1), (double)1.8446744073709551616e+19); + EXPECT_EQ(getFPRegister(1), 0x43F0000000000000); + EXPECT_EQ(getFPRegister(2), (double)1.8446744069683019776e+19); + EXPECT_EQ(getFPRegister(2), 0x43EFFFFFFFE20000); + + EXPECT_GROUP(R"(fcvt.d.lu ft0, t0)", FLOAT_SIMPLE_CVT); +} + +TEST_P(InstFloat, FCVT_S_LU) { + RUN_RISCV(R"( + li t0, 23456 + li t1, -1 + li t2, 0xFFFFFFFF0FFFFFFF + + fcvt.s.lu ft0, t0 + fcvt.s.lu ft1, t1 + fcvt.s.lu ft2, t2 + )"); + + EXPECT_EQ(getGeneralRegister(5), 23456); + EXPECT_EQ(getGeneralRegister(6), -1); + EXPECT_EQ(getGeneralRegister(7), -4026531841); + + EXPECT_EQ(getFPRegister(0), (float)23456); + EXPECT_EQ(getFPRegister(0), 0xFFFFFFFF46b74000); + EXPECT_EQ(getFPRegister(1), (float)1.84467440737e+19); + EXPECT_EQ(getFPRegister(1), 0xFFFFFFFF5F800000); + EXPECT_EQ(getFPRegister(2), (float)1.84467440737e+19); + EXPECT_EQ(getFPRegister(2), 0xFFFFFFFF5F800000); + + EXPECT_GROUP(R"(fcvt.s.lu ft0, t0)", FLOAT_SIMPLE_CVT); +} + +TEST_P(InstFloat, FMADD_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + 
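The fused multiply-add tests below (fmadd/fmsub/fnmsub/fnmadd) follow the RISC-V convention that the "n" forms negate the product rather than the addend, which is why fnmsub, counter-intuitively, adds rs3 to the negated product. A reference sketch of the four forms in terms of plain std::fma (an illustration, not SimEng's execution logic):

```cpp
#include <cmath>

// Reference semantics of the four RISC-V fused multiply-add instructions.
double fmadd(double rs1, double rs2, double rs3)  { return std::fma(rs1, rs2, rs3); }    // (rs1*rs2)+rs3
double fmsub(double rs1, double rs2, double rs3)  { return std::fma(rs1, rs2, -rs3); }   // (rs1*rs2)-rs3
double fnmsub(double rs1, double rs2, double rs3) { return std::fma(-rs1, rs2, rs3); }   // -(rs1*rs2)+rs3
double fnmadd(double rs1, double rs2, double rs3) { return std::fma(-rs1, rs2, -rs3); }  // -(rs1*rs2)-rs3
```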
heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + + fmadd.d fa6, fa3, fa5, fa4 + fmadd.d fa7, fa5, fa4, fa3 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(15), (double)999.212341); + EXPECT_EQ(getFPRegister(16), (4.52432537 * 999.212341) + -3.78900003); + EXPECT_EQ(getFPRegister(17), (999.212341 * -3.78900003) + 4.52432537); + + EXPECT_GROUP(R"(fmadd.d fa6, fa3, fa5, fa4)", FLOAT_MUL); +} + +TEST_P(InstFloat, FMADD_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + + fmadd.s fa6, fa5, fa4, fa3 # (999.212341 * -3.78900003) + 4.52432537 + fmadd.s fa7, fa4, fa3, fa5 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(15), (float)999.212341); + EXPECT_EQ(getFPRegister(16), + ((float)999.212341 * (float)-3.78900003) + (float)4.52432537); + EXPECT_EQ(getFPRegister(16), (float)-3781.49121); + EXPECT_EQ(getFPRegister(16), 0xFFFFFFFFC56C57DC); + EXPECT_EQ(getFPRegister(17), 0xFFFFFFFF44758476); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffff3f800000; // Correct NaN boxing of no. 1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fmadd.s fa5, fa3, fa3, fa3 # (1 * 1) + 1 = 2 + fmadd.s fa6, fa4, fa3, fa3 # Incorrect NaN box should be caught by fmadd and + # canonical NaN used as input, (NaN * 1) + 1 = NaN + fmadd.s fa7, fa3, fa4, fa3 + fmadd.s fs2, fa3, fa3, fa4 + )"); + + EXPECT_EQ(getFPRegister(13), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getFPRegister(15), 0xffffffff40000000); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + EXPECT_EQ(getFPRegister(17), boxedPositiveNan); + EXPECT_EQ(getFPRegister(18), boxedPositiveNan); + + EXPECT_GROUP(R"(fmadd.s fa7, fa4, fa3, fa5)", FLOAT_MUL); +} + +TEST_P(InstFloat, FNMSUB_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + # Counter intuitively sums with the product + fnmsub.d fa6, fa5, fa4, fa3 # -(999.212341 * -3.78900003) + 4.52432537 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(15), (double)999.212341); + EXPECT_EQ(getFPRegister(16), + -(999.212341 * -3.78900003) + 4.52432537); + + EXPECT_GROUP(R"(fnmsub.d fa6, fa5, fa4, fa3)", FLOAT_MUL); +} + +TEST_P(InstFloat, FNMSUB_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + # Counter intuitively sums with the product + fnmsub.s 
fa6, fa5, fa4, fa3 # -(999.212341 * -3.78900003) + 4.52432537 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(15), (float)999.212341); + EXPECT_EQ(getFPRegister(16), + -((float)999.212341 * (float)-3.78900003) + (float)4.52432537); + EXPECT_EQ(getFPRegister(16), 0xFFFFFFFF456CE8A4); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffff3f800000; // Correct NaN boxing of no. 1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fnmsub.s fa5, fa3, fa3, fa3 # -(1 * 1) + 1 = 0 + fnmsub.s fa6, fa4, fa3, fa3 # Incorrect NaN box should be caught by fmadd and + # canonical NaN used as input, -(NaN * 1) + 1 = NaN + fnmsub.s fa7, fa3, fa4, fa3 + fnmsub.s fs2, fa3, fa3, fa4 + )"); + + EXPECT_EQ(getFPRegister(13), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getFPRegister(15), 0xffffffff00000000); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + EXPECT_EQ(getFPRegister(17), boxedPositiveNan); + EXPECT_EQ(getFPRegister(18), boxedPositiveNan); + + EXPECT_GROUP(R"(fnmsub.s fa6, fa5, fa4, fa3)", FLOAT_MUL); +} + +TEST_P(InstFloat, FMSUB_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + + fmsub.s fa6, fa5, fa4, fa3 # (999.212341 * -3.78900003) - 4.52432537 + fmsub.s fa7, fa4, fa3, fa5 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(15), (float)999.212341); + EXPECT_EQ(getFPRegister(16), + ((float)999.212341 * (float)-3.78900003) - (float)4.52432537); + EXPECT_EQ(getFPRegister(16), (float)-3790.54004); + EXPECT_EQ(getFPRegister(16), 0xFFFFFFFFC56CE8A4); + EXPECT_EQ(getFPRegister(17), 0xFFFFFFFFC47E16B8); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffff3f800000; // Correct NaN boxing of no. 1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 
1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fmsub.s fa5, fa3, fa3, fa3 # (1 * 1) - 1 = 0 + fmsub.s fa6, fa4, fa3, fa3 # Incorrect NaN box should be caught by fmadd and + # canonical NaN used as input, (NaN * 1) - 1 = NaN + fmsub.s fa7, fa3, fa4, fa3 + fmsub.s fs2, fa3, fa3, fa4 + )"); + + EXPECT_EQ(getFPRegister(13), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getFPRegister(15), 0xffffffff00000000); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + EXPECT_EQ(getFPRegister(17), boxedPositiveNan); + EXPECT_EQ(getFPRegister(18), boxedPositiveNan); + + EXPECT_GROUP(R"(fmsub.s fa7, fa4, fa3, fa5)", FLOAT_MUL); +} + +TEST_P(InstFloat, FMSUB_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + + fmsub.d fa6, fa5, fa4, fa3 # (999.212341 * -3.78900003) - 4.52432537 + fmsub.d fa7, fa4, fa3, fa5 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(15), (double)999.212341); + EXPECT_EQ(getFPRegister(16), + ((double)999.212341 * (double)-3.78900003) - (double)4.52432537); + EXPECT_EQ(getFPRegister(16), + (double)-3790.5399153953703716979362070560455322265625); + EXPECT_EQ(getFPRegister(16), 0xC0AD9D146FCA6B72); + EXPECT_EQ(getFPRegister(17), 0xC08FC2D70F769B06); + + EXPECT_GROUP(R"(fmsub.d fa7, fa4, fa3, fa5)", FLOAT_MUL); +} + +TEST_P(InstFloat, FNMADD_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + + fnmadd.s fa6, fa5, fa4, fa3 # -(999.212341 * -3.78900003) - 4.52432537 + fnmadd.s fa7, fa4, fa3, fa5 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(15), (float)999.212341); + EXPECT_EQ(getFPRegister(16), + -((float)999.212341 * (float)-3.78900003) - (float)4.52432537); + EXPECT_EQ(getFPRegister(16), (float)3781.4912646554); + EXPECT_EQ(getFPRegister(16), 0xFFFFFFFF456c57dc); + EXPECT_EQ(getFPRegister(17), 0xFFFFFFFFc4758476); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffff3f800000; // Correct NaN boxing of no. 1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 
1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fnmadd.s fa5, fa3, fa3, fa3 # -(1 * 1) - 1 = -2 + fnmadd.s fa6, fa4, fa3, fa3 # Incorrect NaN box should be caught by fmadd and + # canonical NaN used as input, -(NaN * 1) - 1 = NaN + fnmadd.s fa7, fa3, fa4, fa3 + fnmadd.s fs2, fa3, fa3, fa4 + )"); + + EXPECT_EQ(getFPRegister(13), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getFPRegister(15), 0xffffffffc0000000); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + EXPECT_EQ(getFPRegister(17), boxedPositiveNan); + EXPECT_EQ(getFPRegister(18), boxedPositiveNan); + + EXPECT_GROUP(R"(fnmadd.s fa7, fa4, fa3, fa5)", FLOAT_MUL); +} + +TEST_P(InstFloat, FNMADD_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + + fnmadd.d fa6, fa5, fa4, fa3 # -(999.212341 * -3.78900003) - 4.52432537 + fnmadd.d fa7, fa4, fa3, fa5 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(15), (double)999.212341); + EXPECT_EQ(getFPRegister(16), + -((double)999.212341 * (double)-3.78900003) - (double)4.52432537); + EXPECT_EQ(getFPRegister(16), + (double)3781.4912646553702870733104646205902099609375); + EXPECT_EQ(getFPRegister(16), 0x40AD8AFB870A78FE); + EXPECT_EQ(getFPRegister(17), 0xC08EB08EB0368E94); + + EXPECT_GROUP(R"(fnmadd.d fa6, fa5, fa4, fa3)", FLOAT_MUL); +} + +TEST_P(InstFloat, FCVT_D_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + + fcvt.d.s ft0, fa3 + fcvt.d.s ft1, fa4 + fcvt.d.s ft2, fa5 + )"); + + // Floats should be NaN boxed within 64 bit floating point registers + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(13), 0xffffffff4090c746); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(14), 0xffffffffc0727efa); + EXPECT_EQ(getFPRegister(15), (float)999.212341); + EXPECT_EQ(getFPRegister(15), 0xffffffff4479cd97); + + // Must cast to float then to double to account for representation errors. + // Can't directly cast to double + EXPECT_EQ(getFPRegister(0), (double)(float)4.52432537); + EXPECT_EQ(getFPRegister(0), 0x401218E8C0000000); + EXPECT_EQ(getFPRegister(1), (double)(float)-3.78900003); + EXPECT_EQ(getFPRegister(1), 0xC00E4FDF40000000); + EXPECT_EQ(getFPRegister(2), (double)(float)999.212341); + EXPECT_EQ(getFPRegister(2), 0x408F39B2E0000000); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffff3f800000; // Correct NaN boxing of no. 1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 
1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fcvt.d.s ft0, fa3 # fcvt 1 = 1 + fcvt.d.s ft1, fa4 # Incorrect NaN box should be caught by fcvt and + # canonical NaN used as input, fcvt NaN = 2^32 − 1 +)"); + + EXPECT_EQ(getFPRegister(13), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getFPRegister(0), 0x3FF0000000000000); + EXPECT_EQ(getFPRegister(1), 0x7FF8000000000000); + + EXPECT_GROUP(R"(fcvt.d.s ft0, fa3)", FLOAT_SIMPLE_CVT); +} + +TEST_P(InstFloat, FCVT_S_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + + fcvt.s.d ft0, fa3 + fcvt.s.d ft1, fa4 + fcvt.s.d ft2, fa5 + )"); + + // Floats should be NaN boxed within 64 bit floating point registers + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(13), 0x401218E8BFF273D0); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(14), 0xC00E4FDF3F6B24E7); + EXPECT_EQ(getFPRegister(15), (double)999.212341); + EXPECT_EQ(getFPRegister(15), 0x408F39B2DFD694CD); + + EXPECT_EQ(getFPRegister(0), (float)4.52432537); + EXPECT_EQ(getFPRegister(0), 0xFFFFFFFF4090c746); + EXPECT_EQ(getFPRegister(1), (float)-3.78900003); + EXPECT_EQ(getFPRegister(1), 0xFFFFFFFFc0727efa); + EXPECT_EQ(getFPRegister(2), (float)999.212341); + EXPECT_EQ(getFPRegister(2), 0xFFFFFFFF4479cd97); + + EXPECT_GROUP(R"(fcvt.s.d ft0, fa3)", FLOAT_SIMPLE_CVT); +} + +TEST_P(InstFloat, FSGNJ_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + + fsgnj.d fa6, fa4, fa5 + fsgnj.d fa7, fa4, fa4 + fsgnj.d ft0, fa5, fa4 + fsgnj.d ft1, fa5, fa5 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(15), (double)999.212341); + EXPECT_EQ(getFPRegister(16), (double)3.78900003); + EXPECT_EQ(getFPRegister(17), (double)-3.78900003); + EXPECT_EQ(getFPRegister(0), (double)-999.212341); + EXPECT_EQ(getFPRegister(1), (double)999.212341); + + // Pseudoinstructions fmv.d + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa4, 16(a0) + + fmv.d ft2, fa4 + fmv.d ft3, fa3 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(2), (double)-3.78900003); + EXPECT_EQ(getFPRegister(3), (double)4.52432537); + + EXPECT_GROUP(R"(fsgnj.d fa6, fa4, fa5)", FLOAT_SIMPLE_LOGICAL); + EXPECT_GROUP(R"(fmv.d ft2, fa4)", FLOAT_SIMPLE_LOGICAL); +} + +TEST_P(InstFloat, FSGNJ_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + + fsgnj.s fa6, fa4, fa5 + fsgnj.s fa7, fa4, fa4 + fsgnj.s ft0, fa5, fa4 + fsgnj.s ft1, fa5, fa5 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), 
(float)-3.78900003); + EXPECT_EQ(getFPRegister(15), (float)999.212341); + EXPECT_EQ(getFPRegister(16), (float)3.78900003); + EXPECT_EQ(getFPRegister(17), (float)-3.78900003); + EXPECT_EQ(getFPRegister(0), (float)-999.212341); + EXPECT_EQ(getFPRegister(1), (float)999.212341); + + // Pseudoinstructions fmv.s + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa4, 8(a0) + + fmv.s ft2, fa4 + fmv.s ft3, fa3 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(2), (float)-3.78900003); + EXPECT_EQ(getFPRegister(2), 0xFFFFFFFFc0727efa); + EXPECT_EQ(getFPRegister(3), (float)4.52432537); + EXPECT_EQ(getFPRegister(3), 0xFFFFFFFF4090c746); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffffbf800000; // Correct NaN boxing of no. -1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fsgnj.s ft0, fa3, fa3 # fsgnj -1, -1 = -1 + fsgnj.s ft1, fa4, fa3 # Incorrect NaN box should be caught by fsgnj and + # canonical NaN used as input, fsgnj +NaN, -1 = -NaN + fsgnj.s ft2, fa3, fa4 # Not commutative, fsgnj -1, +NaN = 1 +)"); + + EXPECT_EQ(getFPRegister(13), 0xffffffffbf800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getFPRegister(0), 0xffffffffbf800000); + EXPECT_EQ(getFPRegister(1), 0xffffffffffc00000); + EXPECT_EQ(getFPRegister(2), 0xffffffff3f800000); + + EXPECT_GROUP(R"(fsgnj.s fa6, fa4, fa5)", FLOAT_SIMPLE_LOGICAL); + EXPECT_GROUP(R"(fmv.s ft2, fa4)", FLOAT_SIMPLE_LOGICAL); +} + +TEST_P(InstFloat, FSGNJX_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + + fsgnjx.d fa6, fa4, fa5 + fsgnjx.d fa7, fa4, fa4 + fsgnjx.d ft0, fa5, fa4 + fsgnjx.d ft1, fa5, fa5 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(15), (double)999.212341); + EXPECT_EQ(getFPRegister(16), (double)-3.78900003); + EXPECT_EQ(getFPRegister(17), (double)3.78900003); + EXPECT_EQ(getFPRegister(0), (double)-999.212341); + EXPECT_EQ(getFPRegister(1), (double)999.212341); + + // Pseudoinstructions fabs.d + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa4, 16(a0) + + fabs.d ft2, fa4 + fabs.d ft3, fa3 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(2), (double)3.78900003); + EXPECT_EQ(getFPRegister(3), (double)4.52432537); + + EXPECT_GROUP(R"(fsgnjx.d fa6, fa4, fa5)", FLOAT_SIMPLE_LOGICAL); + EXPECT_GROUP(R"(fabs.d ft2, fa4)", FLOAT_SIMPLE_LOGICAL); +} + +TEST_P(InstFloat, FSGNJX_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + + fsgnjx.s fa6, fa4, fa5 + fsgnjx.s fa7, fa4, fa4 + fsgnjx.s ft0, fa5, fa4 + fsgnjx.s ft1, fa5, fa5 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); 
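The sign-injection checks in this group follow the RISC-V rule that fsgnj/fsgnjn/fsgnjx keep rs1's magnitude and take the sign from rs2 (copied, negated, or XORed with rs1's own sign), which is also why fmv.s, fneg.s and fabs.s are pseudoinstructions of these three. A small illustrative model (helper names are assumptions, not SimEng code):

```cpp
#include <cstdint>
#include <cstring>

uint32_t signOf(float f) {
  uint32_t b;
  std::memcpy(&b, &f, sizeof(b));
  return b & 0x80000000u;
}

float withSign(float rs1, uint32_t sign) {
  uint32_t b;
  std::memcpy(&b, &rs1, sizeof(b));
  b = (b & 0x7fffffffu) | sign;
  float out;
  std::memcpy(&out, &b, sizeof(out));
  return out;
}

float fsgnj_s(float rs1, float rs2)  { return withSign(rs1, signOf(rs2)); }                // fmv.s  rd,rs == fsgnj.s  rd,rs,rs
float fsgnjn_s(float rs1, float rs2) { return withSign(rs1, signOf(rs2) ^ 0x80000000u); }  // fneg.s rd,rs == fsgnjn.s rd,rs,rs
float fsgnjx_s(float rs1, float rs2) { return withSign(rs1, signOf(rs1) ^ signOf(rs2)); }  // fabs.s rd,rs == fsgnjx.s rd,rs,rs
```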
+ EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(15), (float)999.212341); + EXPECT_EQ(getFPRegister(16), (float)-3.78900003); + EXPECT_EQ(getFPRegister(17), (float)3.78900003); + EXPECT_EQ(getFPRegister(0), (float)-999.212341); + EXPECT_EQ(getFPRegister(1), (float)999.212341); + + // Pseudoinstructions fabs.s + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa4, 8(a0) + + fabs.s ft2, fa4 + fabs.s ft3, fa3 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(2), (float)3.78900003); + EXPECT_EQ(getFPRegister(3), (float)4.52432537); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffffbf800000; // Correct NaN boxing of no. -1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fsgnjx.s ft0, fa3, fa3 # fsgnjx -1, -1 = 1 + fsgnjx.s ft1, fa4, fa3 # Incorrect NaN box should be caught by fsgnjx and + # canonical NaN used as input, fsgnjx +NaN, -1 = -NaN + fsgnjx.s ft2, fa3, fa4 # Not commutative, fsgnjx -1, +NaN = -1 +)"); + + EXPECT_EQ(getFPRegister(13), 0xffffffffbf800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getFPRegister(0), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(1), 0xffffffffffc00000); + EXPECT_EQ(getFPRegister(2), 0xffffffffbf800000); + + EXPECT_GROUP(R"(fsgnjx.s fa6, fa4, fa5)", FLOAT_SIMPLE_LOGICAL); + EXPECT_GROUP(R"(fabs.s ft2, fa4)", FLOAT_SIMPLE_LOGICAL); +} + +TEST_P(InstFloat, FSGNJN_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + + fsgnjn.d fa6, fa4, fa5 + fsgnjn.d fa7, fa4, fa4 + fsgnjn.d ft0, fa5, fa4 + fsgnjn.d ft1, fa5, fa5 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(15), (double)999.212341); + + EXPECT_EQ(getFPRegister(16), (double)-3.78900003); + EXPECT_EQ(getFPRegister(17), (double)3.78900003); + EXPECT_EQ(getFPRegister(0), (double)999.212341); + EXPECT_EQ(getFPRegister(1), (double)-999.212341); + + // Pseudoinstructions fneg.d + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa4, 16(a0) + + fneg.d ft2, fa4 + fneg.d ft3, fa3 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(2), (double)3.78900003); + EXPECT_EQ(getFPRegister(3), (double)-4.52432537); + + EXPECT_GROUP(R"(fsgnjn.d fa6, fa4, fa5)", FLOAT_SIMPLE_LOGICAL); + EXPECT_GROUP(R"(fneg.d ft2, fa4)", FLOAT_SIMPLE_LOGICAL); +} + +TEST_P(InstFloat, FSGNJN_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + + fsgnjn.s fa6, fa4, fa5 + fsgnjn.s fa7, fa4, fa4 + fsgnjn.s ft0, fa5, fa4 + fsgnjn.s ft1, fa5, fa5 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + 
EXPECT_EQ(getFPRegister(15), (float)999.212341); + + EXPECT_EQ(getFPRegister(16), (float)-3.78900003); + EXPECT_EQ(getFPRegister(17), (float)3.78900003); + EXPECT_EQ(getFPRegister(0), (float)999.212341); + EXPECT_EQ(getFPRegister(1), (float)-999.212341); + + // Pseudoinstructions fneg.s + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa4, 8(a0) + + fneg.s ft2, fa4 + fneg.s ft3, fa3 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(2), (float)3.78900003); + EXPECT_EQ(getFPRegister(3), (float)-4.52432537); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffffbf800000; // Correct NaN boxing of no. -1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fsgnjn.s ft0, fa3, fa3 # fsgnjn -1, -1 = 1 + fsgnjn.s ft1, fa4, fa3 # Incorrect NaN box should be caught by fsgnjn and + # canonical NaN used as input, fsgnjn +NaN, -1 = NaN + fsgnjn.s ft2, fa3, fa4 # Not commutative, fsgnjn -1, +NaN = -1 +)"); + + EXPECT_EQ(getFPRegister(13), 0xffffffffbf800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getFPRegister(0), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(1), boxedPositiveNan); + EXPECT_EQ(getFPRegister(2), 0xffffffffbf800000); + + EXPECT_GROUP(R"(fsgnjn.s fa6, fa4, fa5)", FLOAT_SIMPLE_LOGICAL); + EXPECT_GROUP(R"(fneg.s ft2, fa4)", FLOAT_SIMPLE_LOGICAL); +} + +TEST_P(InstFloat, FADD_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + + fadd.s ft0, fa4, fa3 + fadd.s ft1, fa5, fa4 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(15), (float)999.212341); + + EXPECT_EQ(getFPRegister(0), (float)0.73532534); + EXPECT_EQ(getFPRegister(0), 0xFFFFFFFF3f3c3e48); + EXPECT_EQ(getFPRegister(1), (float)995.423341); + EXPECT_EQ(getFPRegister(1), 0xFFFFFFFF4478db18); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffff3f800000; // Correct NaN boxing of no. 1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 
1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fadd.s fa5, fa3, fa3 # 1 + 1 = 1 + fadd.s fa6, fa4, fa3 # Incorrect NaN box should be caught by fadd and + # canonical NaN used as input, NaN + 1 = NaN + )"); + + EXPECT_EQ(getFPRegister(13), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getFPRegister(15), 0xffffffff40000000); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + + EXPECT_GROUP(R"(fadd.s ft0, fa4, fa3)", FLOAT_SIMPLE_ARTH); +} + +TEST_P(InstFloat, FADD_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 1.0; + heap[1] = 123.456; + heap[2] = -0.00032; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld ft0, 0(a0) + fld ft1, 8(a0) + fld ft2, 16(a0) + fld ft3, 24(a0) + + fadd.d ft4, ft0, ft1 + fadd.d ft5, ft1, ft2 + )"); + + EXPECT_EQ(getFPRegister(4), 124.456); + EXPECT_EQ(getFPRegister(5), 123.456 - 0.00032); + + EXPECT_GROUP(R"(fadd.d ft4, ft0, ft1)", FLOAT_SIMPLE_ARTH); +} + +TEST_P(InstFloat, FSUB_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + + fsub.d ft0, fa4, fa3 + fsub.d ft1, fa3, fa4 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(15), (double)999.212341); + + EXPECT_EQ(getFPRegister(0), (double)-8.3133254); + EXPECT_EQ(getFPRegister(1), (double)8.3133254); + + EXPECT_GROUP(R"(fsub.d ft0, fa4, fa3)", FLOAT_SIMPLE_ARTH); +} + +TEST_P(InstFloat, FSUB_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + + fsub.s ft0, fa4, fa3 + fsub.s ft1, fa3, fa4 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(15), (float)999.212341); + + EXPECT_EQ(getFPRegister(0), (float)-3.78900003 - (float)4.52432537); + EXPECT_EQ(getFPRegister(0), 0xFFFFFFFFc1050362); + EXPECT_EQ(getFPRegister(1), (float)4.52432537 - (float)-3.78900003); + EXPECT_EQ(getFPRegister(1), 0xFFFFFFFF41050362); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffff3f800000; // Correct NaN boxing of no. 1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 
1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fsub.s fa5, fa3, fa3 # 1 - 1 = 0 + fsub.s fa6, fa4, fa3 # Incorrect NaN box should be caught by fsub and + # canonical NaN used as input, NaN - 1 = NaN + )"); + + EXPECT_EQ(getFPRegister(13), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getFPRegister(15), 0xffffffff00000000); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + + EXPECT_GROUP(R"(fsub.s ft0, fa4, fa3)", FLOAT_SIMPLE_ARTH); +} + +TEST_P(InstFloat, FSQRT_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + + fsqrt.d ft0, fa5 # TODO set CSR = 0b1 inexact + fsqrt.d ft1, fa4 # TODO set CSR = 0b10001 invalid op & inexact + fdiv.d fa3, fa3, fa5 # 0.00452789199 < 0 + fsqrt.d ft2, fa3 + )"); + + EXPECT_EQ(getFPRegister(13), 4.52432537 / 999.212341); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(15), (double)999.212341); + + EXPECT_EQ(getFPRegister(0), (double)31.6103201660470389811052882578); + EXPECT_EQ(getFPRegister(0), 0x403F9C3DF14142E6); + EXPECT_EQ(getFPRegister(1), 0x7FF8000000000000); // NaN + EXPECT_EQ(getFPRegister(2), (double)0.067289611417595679432324118352); + EXPECT_EQ(getFPRegister(2), 0x3FB139E458662CD6); + + EXPECT_GROUP(R"(fsqrt.d ft0, fa5)", FLOAT_DIV_OR_SQRT); +} + +TEST_P(InstFloat, FSQRT_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + + fsqrt.s ft0, fa5 # TODO set CSR = 0b1 inexact + fsqrt.s ft1, fa4 # TODO set CSR = 0b10001 invalid op & inexact + fdiv.s fa3, fa3, fa5 # 0.00452789199 < 0 + fsqrt.s ft2, fa3 + )"); + + EXPECT_EQ(getFPRegister(13), (float)0.00452789199); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(15), (float)999.212341); + + EXPECT_EQ(getFPRegister(0), (float)31.610321); + EXPECT_EQ(getFPRegister(0), 0xFFFFFFFF41FCE1F0); + EXPECT_EQ(getFPRegister(1), boxedPositiveNan); // NaN + EXPECT_EQ(getFPRegister(2), (float)0.0672896132); + EXPECT_EQ(getFPRegister(2), 0xFFFFFFFF3D89CF23); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffff3f800000; // Correct NaN boxing of no. 1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 
1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fsqrt.s fa5, fa3 # 1^0.5 = 1 + fsqrt.s fa6, fa4 # Incorrect NaN box should be caught by fsqrt and + # canonical NaN used as input, NaN^0.5 = NaN + )"); + + EXPECT_EQ(getFPRegister(13), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getFPRegister(15), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + + EXPECT_GROUP(R"(fsqrt.s ft0, fa5)", FLOAT_DIV_OR_SQRT); +} + +TEST_P(InstFloat, FMV_X_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa4, 16(a0) + + fmv.x.d t0, fa3 + fmv.x.d t1, fa4 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(13), 0x401218E8BFF273D0); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(14), 0xC00E4FDF3F6B24E7); + + EXPECT_EQ(getGeneralRegister(5), (double)4.52432537); + EXPECT_EQ(getGeneralRegister(5), 0x401218E8BFF273D0); + EXPECT_EQ(getGeneralRegister(6), (double)-3.78900003); + EXPECT_EQ(getGeneralRegister(6), 0xC00E4FDF3F6B24E7); + + EXPECT_GROUP(R"(fmv.x.d t0, fa3)", FLOAT_SIMPLE_ARTH); +} + +TEST_P(InstFloat, FMV_X_W) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa4, 8(a0) + + fmv.x.w t0, fa3 + fmv.x.w t1, fa4 + )"); + + // Floats should be NaN boxed within 64 bit floating point registers + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(13), 0xffffffff4090c746); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(14), 0xffffffffc0727efa); + + // "float" should be sign extended when moved to integer register + EXPECT_EQ(getGeneralRegister(5), (float)4.52432537); + EXPECT_EQ(getGeneralRegister(5), 0x000000004090c746); + EXPECT_EQ(getGeneralRegister(6), (float)-3.78900003); + EXPECT_EQ(getGeneralRegister(6), 0xffffffffc0727efa); + + EXPECT_GROUP(R"(fmv.x.w t0, fa3)", FLOAT_SIMPLE_ARTH); +} + +TEST_P(InstFloat, FMV_D_X) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa4, 16(a0) + + fmv.x.d t0, fa3 + fmv.x.d t1, fa4 + + fmv.d.x fa4, t0 + fmv.d.x fa3, t1 + )"); + + EXPECT_EQ(getFPRegister(14), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), 0x401218E8BFF273D0); + EXPECT_EQ(getFPRegister(13), (double)-3.78900003); + EXPECT_EQ(getFPRegister(13), 0xC00E4FDF3F6B24E7); + + EXPECT_EQ(getGeneralRegister(5), (double)4.52432537); + EXPECT_EQ(getGeneralRegister(5), 0x401218E8BFF273D0); + EXPECT_EQ(getGeneralRegister(6), (double)-3.78900003); + EXPECT_EQ(getGeneralRegister(6), 0xC00E4FDF3F6B24E7); + + EXPECT_GROUP(R"(fmv.d.x fa4, t0)", FLOAT_SIMPLE_ARTH); +} + +TEST_P(InstFloat, FMV_W_X) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = 123456; + + RUN_RISCV(R"( + # Get heap address + li 
a7, 214 + ecall + + flw fa3, 0(a0) + flw fa4, 8(a0) + + fmv.x.w t0, fa3 + fmv.x.w t1, fa4 + + fmv.w.x fa4, t0 + fmv.w.x fa3, t1 + )"); + + EXPECT_EQ(getFPRegister(14), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), 0xFFFFFFFF4090c746); + EXPECT_EQ(getFPRegister(13), (float)-3.78900003); + EXPECT_EQ(getFPRegister(13), 0xffffffffc0727efa); + + EXPECT_EQ(getGeneralRegister(5), (float)4.52432537); + EXPECT_EQ(getGeneralRegister(5), 0x000000004090c746); + EXPECT_EQ(getGeneralRegister(6), (float)-3.78900003); + EXPECT_EQ(getGeneralRegister(6), 0xffffffffc0727efa); + + EXPECT_GROUP(R"(fmv.w.x fa4, t0)", FLOAT_SIMPLE_ARTH); +} + +TEST_P(InstFloat, FEQ_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + fld fa6, 24(a0) + + feq.d t0, fa3, fa3 #equal set t0 + feq.d t1, fa3, fa4 #unequal don't set t1 + feq.d t2, fa6, fa4 #one NaN don't set t2 + feq.d t3, fa6, fa6 #both NaN don't set t3 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(16), 0x7FF8000000000000); + EXPECT_EQ(getGeneralRegister(5), 1); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); + EXPECT_EQ(getGeneralRegister(28), 0); + + EXPECT_GROUP(R"(feq.d t0, fa3, fa3)", FLOAT_SIMPLE_CMP); +} + +TEST_P(InstFloat, FEQ_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + flw fa6, 12(a0) + + feq.s t0, fa3, fa3 #equal set t0 + feq.s t1, fa3, fa4 #unequal don't set t1 + feq.s t2, fa6, fa4 #one NaN don't set t2 + feq.s t3, fa6, fa6 #both NaN don't set t3 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + EXPECT_EQ(getGeneralRegister(5), 1); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); + EXPECT_EQ(getGeneralRegister(28), 0); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffffbf800000; // Correct NaN boxing of no. -1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 
1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + feq.s t0, fa3, fa3 # feq -1, -1 = 1 + feq.s t1, fa4, fa3 # Incorrect NaN box should be caught by feq and + # canonical NaN used as input, feq NaN, 1 = 0 +)"); + + EXPECT_EQ(getFPRegister(13), 0xffffffffbf800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getGeneralRegister(5), 0x1); + EXPECT_EQ(getGeneralRegister(6), 0x0); + + EXPECT_GROUP(R"(feq.s t0, fa3, fa3)", FLOAT_SIMPLE_CMP); +} + +TEST_P(InstFloat, FLT_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + fld fa6, 24(a0) + + flt.d t0, fa3, fa3 #equal don't set t0 + flt.d t1, fa3, fa4 #fa3 < fa4 don't set t1 + flt.d t4, fa4, fa3 #fa4 < fa3 set t4 + flt.d t2, fa6, fa4 #one NaN don't set t2 + flt.d t3, fa6, fa6 #both NaN don't set t3 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(16), 0x7FF8000000000000); + EXPECT_EQ(getGeneralRegister(5), 0); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); + EXPECT_EQ(getGeneralRegister(28), 0); + EXPECT_EQ(getGeneralRegister(29), 1); + + EXPECT_GROUP(R"(flt.d t0, fa3, fa3)", FLOAT_SIMPLE_CMP); +} + +TEST_P(InstFloat, FLT_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + flw fa6, 12(a0) + + flt.s t0, fa3, fa3 #equal don't set t0 + flt.s t1, fa3, fa4 #fa3 < fa4 don't set t1 + flt.s t4, fa4, fa3 #fa4 < fa3 set t4 + flt.s t2, fa6, fa4 #one NaN don't set t2 + flt.s t3, fa6, fa6 #both NaN don't set t3 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + EXPECT_EQ(getGeneralRegister(5), 0); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); + EXPECT_EQ(getGeneralRegister(28), 0); + EXPECT_EQ(getGeneralRegister(29), 1); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffffbf800000; // Correct NaN boxing of no. -1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 
1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + flt.s t0, fa3, fa3 # flt -1, -1 = 0 + flt.s t1, fa4, fa3 # Incorrect NaN box should be caught by flt and + # canonical NaN used as input, flt NaN, 1 = 0 +)"); + + EXPECT_EQ(getFPRegister(13), 0xffffffffbf800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getGeneralRegister(5), 0x0); + EXPECT_EQ(getGeneralRegister(6), 0x0); + + EXPECT_GROUP(R"(flt.s t0, fa3, fa3)", FLOAT_SIMPLE_CMP); +} + +TEST_P(InstFloat, FLE_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + fld fa6, 24(a0) + + fle.d t0, fa3, fa3 #equal set t0 + fle.d t1, fa3, fa4 #fa3 <=/ fa4 don't set t1 + fle.d t4, fa4, fa3 #fa4 < fa3 set t4 + fle.d t2, fa6, fa4 #one NaN don't set t2 + fle.d t3, fa6, fa6 #both NaN don't set t3 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(16), 0x7FF8000000000000); + EXPECT_EQ(getGeneralRegister(5), 1); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); + EXPECT_EQ(getGeneralRegister(28), 0); + EXPECT_EQ(getGeneralRegister(29), 1); + + EXPECT_GROUP(R"(fle.d t0, fa3, fa3)", FLOAT_SIMPLE_CMP); +} + +TEST_P(InstFloat, FLE_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + flw fa6, 12(a0) + + fle.s t0, fa3, fa3 #equal set t0 + fle.s t1, fa3, fa4 #fa3 <=/ fa4 don't set t1 + fle.s t4, fa4, fa3 #fa4 < fa3 set t4 + fle.s t2, fa6, fa4 #one NaN don't set t2 + fle.s t3, fa6, fa6 #both NaN don't set t3 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + EXPECT_EQ(getGeneralRegister(5), 1); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); + EXPECT_EQ(getGeneralRegister(28), 0); + EXPECT_EQ(getGeneralRegister(29), 1); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffffbf800000; // Correct NaN boxing of no. -1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 
1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fle.s t0, fa3, fa3 # fle -1, -1 = 1 + fle.s t1, fa4, fa3 # Incorrect NaN box should be caught by fle and + # canonical NaN used as input, fle NaN, 1 = 0 +)"); + + EXPECT_EQ(getFPRegister(13), 0xffffffffbf800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getGeneralRegister(5), 0x1); + EXPECT_EQ(getGeneralRegister(6), 0x0); + + EXPECT_GROUP(R"(fle.s t0, fa3, fa3)", FLOAT_SIMPLE_CMP); +} + +TEST_P(InstFloat, FMIN_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + fld fa6, 24(a0) + + fmin.d fa0, fa3, fa4 + fmin.d fa1, fa3, fa6 # min(n, NaN) = n + fmin.d ft0, fa6, fa6 # min(NaN, NaN) = NaN + + fcvt.d.l ft1, zero + fneg.d ft2, ft1 + + fmin.d ft3, ft1, ft2 # min(+0, -0) = -0 + fmin.d ft4, ft2, ft1 # min(-0, +0) = -0 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(16), 0x7FF8000000000000); + + EXPECT_EQ(getFPRegister(10), (double)-3.78900003); + EXPECT_EQ(getFPRegister(11), (double)4.52432537); + EXPECT_EQ(getFPRegister(0), 0x7FF8000000000000); + EXPECT_EQ(getFPRegister(3), + (double)-0); // Doesn't check for sign so below test needed + EXPECT_EQ(getFPRegister(3), 0x8000000000000000); + EXPECT_EQ(getFPRegister(4), (double)-0); + EXPECT_EQ(getFPRegister(4), 0x8000000000000000); + + EXPECT_GROUP(R"(fmin.d fa0, fa3, fa4)", FLOAT_SIMPLE_CMP); +} + +TEST_P(InstFloat, FMIN_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa4, 8(a0) + flw fa6, 12(a0) + + fmin.s fa0, fa3, fa4 + fmin.s fa1, fa3, fa6 # min(n, NaN) = n + fmin.s ft0, fa6, fa6 # min(NaN, NaN) = NaN + + fcvt.s.w ft1, zero + fneg.s ft2, ft1 + + fmin.s ft3, ft1, ft2 # min(+0, -0) = -0 # fminf picks the later of the two options in both cases. Check our implementation fixes this to pick -0 instead + fmin.s ft4, ft2, ft1 # min(-0, +0) = -0 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + + EXPECT_EQ(getFPRegister(10), (float)-3.78900003); + EXPECT_EQ(getFPRegister(11), (float)4.52432537); + EXPECT_EQ(getFPRegister(0), boxedPositiveNan); + + EXPECT_EQ(getFPRegister(1), 0xffffffff00000000); + EXPECT_EQ(getFPRegister(2), 0xffffffff80000000); + EXPECT_EQ(getFPRegister(3), 0xffffffff80000000); + EXPECT_EQ(getFPRegister(4), 0xffffffff80000000); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffff3f800000; // Correct NaN boxing of no. 1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 
1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fmin.s fa5, fa3, fa3 # fmin(1, 1) = 1 + fmin.s fa6, fa4, fa4 # Incorrect NaN box should be caught by fmin and + # canonical NaN used as input, fmin(NaN, NaN) = NaN + )"); + + EXPECT_EQ(getFPRegister(13), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getFPRegister(15), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + + EXPECT_GROUP(R"(fmin.s fa0, fa3, fa4)", FLOAT_SIMPLE_CMP); +} + +TEST_P(InstFloat, FMAX_D) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) + fld fa5, 8(a0) + fld fa4, 16(a0) + fld fa6, 24(a0) + + fmax.d fa0, fa3, fa4 + fmax.d fa1, fa3, fa6 # max(n, NaN) = n + fmax.d ft0, fa6, fa6 # max(NaN, NaN) = NaN + + fcvt.d.l ft1, zero + fneg.d ft2, ft1 + + fmax.d ft3, ft1, ft2 # max(+0, -0) = 0 + fmax.d ft4, ft1, ft1 # max(-0, +0) = 0 + )"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)-3.78900003); + EXPECT_EQ(getFPRegister(16), 0x7FF8000000000000); + + EXPECT_EQ(getFPRegister(10), (double)4.52432537); + EXPECT_EQ(getFPRegister(11), (double)4.52432537); + EXPECT_EQ(getFPRegister(0), 0x7FF8000000000000); + EXPECT_EQ(getFPRegister(3), (double)0); + EXPECT_EQ(getFPRegister(3), 0x0000000000000000); + EXPECT_EQ(getFPRegister(4), 0x0000000000000000); + + EXPECT_GROUP(R"(fmax.d fa0, fa3, fa4)", FLOAT_SIMPLE_CMP); +} + +TEST_P(InstFloat, FMAX_S) { + initialHeapData_.resize(32); + float* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 999.212341; + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + flw fa3, 0(a0) + flw fa5, 4(a0) + flw fa4, 8(a0) + flw fa6, 12(a0) + + fmax.s fa0, fa3, fa4 + fmax.s fa1, fa3, fa6 # max(n, NaN) = n + fmax.s ft0, fa6, fa6 # max(NaN, NaN) = NaN + + fcvt.s.w ft1, zero + fneg.s ft2, ft1 + + fmax.s ft3, ft1, ft2 # max(+0, -0) = +0 + fmax.s ft4, ft2, ft1 # max(-0, +0) = +0 + )"); + + EXPECT_EQ(getFPRegister(13), (float)4.52432537); + EXPECT_EQ(getFPRegister(14), (float)-3.78900003); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + + EXPECT_EQ(getFPRegister(10), 0xffffffff4090c746); + EXPECT_EQ(getFPRegister(11), 0xffffffff4090c746); + EXPECT_EQ(getFPRegister(0), boxedPositiveNan); + + EXPECT_EQ(getFPRegister(1), 0xffffffff00000000); + EXPECT_EQ(getFPRegister(2), 0xffffffff80000000); + EXPECT_EQ(getFPRegister(3), 0xffffffff00000000); + EXPECT_EQ(getFPRegister(4), 0xffffffff00000000); + + initialHeapData_.resize(32); + uint64_t* intHeap = reinterpret_cast(initialHeapData_.data()); + intHeap[0] = 0xffffffff3f800000; // Correct NaN boxing of no. 1 + intHeap[1] = 0xfff7ffff3f800000; // Incorrect NaN boxing of no. 
1 + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # Correct + fld fa4, 8(a0) # Incorrect + + fmax.s fa5, fa3, fa3 # fmax(1, 1) = 1 + fmax.s fa6, fa4, fa4 # Incorrect NaN box should be caught by fmax and + # canonical NaN used as input, fmax(NaN, NaN) = NaN + )"); + + EXPECT_EQ(getFPRegister(13), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(14), 0xfff7ffff3f800000); + + EXPECT_EQ(getFPRegister(15), 0xffffffff3f800000); + EXPECT_EQ(getFPRegister(16), boxedPositiveNan); + + EXPECT_GROUP(R"(fmax.s fa0, fa3, fa4)", FLOAT_SIMPLE_CMP); +} + +TEST_P(InstFloat, RoundToNearest) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 3.5; + heap[1] = 2.5; + heap[2] = -3.5; + heap[3] = -2.5; + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + # Set rounding mode to RISC-V "nearest ties to even" + li a1, 0 + fsrm a1 + + fld fa3, 0(a0) # 3.5 + fld fa4, 8(a0) # 2.5 + fld fa5, 16(a0) # -3.5 + fld fa6, 24(a0) # -2.5 + + # Test for how CPP handles ties + fcvt.w.d t0, fa3 # should convert to 4 + fcvt.w.d t1, fa4 # should convert to 2 + fcvt.w.d t2, fa5 # should convert to -4 + fcvt.w.d t3, fa6 # should convert to -2 +)"); + + EXPECT_EQ(getFPRegister(13), (double)3.5); + EXPECT_EQ(getFPRegister(14), (double)2.5); + EXPECT_EQ(getFPRegister(15), (double)-3.5); + EXPECT_EQ(getFPRegister(16), (double)-2.5); + + // Test for CPP tie handling. Below case test for ties to even + EXPECT_EQ(getGeneralRegister(5), 0x4); + EXPECT_EQ(getGeneralRegister(6), 0x2); + EXPECT_EQ(getGeneralRegister(7), 0xFFFFFFFFFFFFFFFC); + EXPECT_EQ(getGeneralRegister(28), 0xFFFFFFFFFFFFFFFE); +} + +TEST_P(InstFloat, StaticRoundingMode) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 2.5; + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + # Set rounding mode to "nearest ties to even" + li a1, 0 + fsrm a1 + + fld fa3, 0(a0) # 4.52432537 + fld fa4, 8(a0) # 2.5 + fld fa5, 16(a0) # -3.78900003 + fld fa6, 24(a0) # nan + + # Should obey dynamic rounding mode in CSR + fcvt.w.d t0, fa3 # should convert to 5 + fcvt.w.d t1, fa4 # should convert to 2 + fcvt.w.d t2, fa5 # should convert to -4 + + #towards zero + fcvt.w.d t3, fa3, rtz # should convert to 4 + fcvt.w.d t4, fa4, rtz # should convert to 2 + fcvt.w.d t5, fa5, rtz # should convert to -3 + + #towards -inf + fcvt.w.d t6, fa3, rdn # should convert to 4 + fcvt.w.d a0, fa4, rdn # should convert to 2 + fcvt.w.d a1, fa5, rdn # should convert to -4 + + #towards +inf + fcvt.w.d a2, fa3, rup # should convert to 5 + fcvt.w.d a3, fa4, rup # should convert to 3 + fcvt.w.d a4, fa5, rup # should convert to -3 + + #to nearest ties to maximum magnitude + fcvt.w.d a5, fa3, rmm # should convert to 5 + fcvt.w.d a6, fa4, rmm # should convert to 3 + fcvt.w.d a7, fa5, rmm # should convert to -4 + + + # Set rounding mode to "round down" + li s2, 2 + fsrm s2 + + # Should obey dynamic rounding mode in CSR + fcvt.w.d s2, fa3 # should convert to 4 + fcvt.w.d s3, fa4 # should convert to 2 + fcvt.w.d s4, fa5 # should convert to -4 + + #to nearest ties to even + fcvt.w.d s5, fa3, rne # should convert to 5 + fcvt.w.d s6, fa4, rne # should convert to 2 + fcvt.w.d s7, fa5, rne # should convert to -4 +)"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)2.5); + EXPECT_EQ(getFPRegister(15), (double)-3.78900003); + EXPECT_EQ(getFPRegister(16), 
0x7FF8000000000000); + + // Dynamic + EXPECT_EQ(getGeneralRegister(5), 0x5); + EXPECT_EQ(getGeneralRegister(6), 0x2); + EXPECT_EQ(getGeneralRegister(7), 0xFFFFFFFFFFFFFFFC); + + // RTZ + EXPECT_EQ(getGeneralRegister(28), 0x4); + EXPECT_EQ(getGeneralRegister(29), 0x2); + EXPECT_EQ(getGeneralRegister(30), 0xFFFFFFFFFFFFFFFD); + + // RDN + EXPECT_EQ(getGeneralRegister(31), 0x4); + EXPECT_EQ(getGeneralRegister(10), 0x2); + EXPECT_EQ(getGeneralRegister(11), 0xFFFFFFFFFFFFFFFC); + + // RUP + EXPECT_EQ(getGeneralRegister(12), 0x5); + EXPECT_EQ(getGeneralRegister(13), 0x3); + EXPECT_EQ(getGeneralRegister(14), 0xFFFFFFFFFFFFFFFD); + + // RMM + EXPECT_EQ(getGeneralRegister(15), 0x5); + // This test won't pass as CPP doesn't provide functionality to tie to max + // magnitude + // EXPECT_EQ(getGeneralRegister(16), 0x3); + EXPECT_EQ(getGeneralRegister(17), 0xFFFFFFFFFFFFFFFC); + + // Dynamic change to RDN + EXPECT_EQ(getGeneralRegister(18), 0x4); + EXPECT_EQ(getGeneralRegister(19), 0x2); + EXPECT_EQ(getGeneralRegister(20), 0xFFFFFFFFFFFFFFFC); + + // Dynamic + EXPECT_EQ(getGeneralRegister(21), 0x5); + EXPECT_EQ(getGeneralRegister(22), 0x2); + EXPECT_EQ(getGeneralRegister(23), 0xFFFFFFFFFFFFFFFC); +} + +TEST_P(InstFloat, DynamicRoundingMode) { + initialHeapData_.resize(32); + double* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 4.52432537; + heap[1] = 2.5; + heap[2] = -3.78900003; + heap[3] = std::nan("0"); + + RUN_RISCV(R"( + # Get heap address + li a7, 214 + ecall + + fld fa3, 0(a0) # 4.52432537 + fld fa4, 8(a0) # 2.5 + fld fa5, 16(a0) # -3.78900003 + fld fa6, 24(a0) # nan + + # Set rounding mode to RNE + li a1, 0 + fsrm a1 + + #nearest ties to even + fcvt.w.d t0, fa3 # should convert to 5 + fcvt.w.d t1, fa4 # should convert to 2 + fcvt.w.d t2, fa5 # should convert to -4 + + + # Set rounding mode to RTZ + li a1, 1 + fsrm a1 + + #towards zero + fcvt.w.d t3, fa3 # should convert to 4 + fcvt.w.d t4, fa4 # should convert to 2 + fcvt.w.d t5, fa5 # should convert to -3 + + + # Set rounding mode to RDN + li a1, 2 + fsrm a1 + + #towards -inf + fcvt.w.d t6, fa3 # should convert to 4 + fcvt.w.d a0, fa4 # should convert to 2 + fcvt.w.d s7, fa5 # should convert to -4 + + + # Set rounding mode to RUP + li a1, 3 + fsrm a1 + + #towards +inf + fcvt.w.d a2, fa3 # should convert to 5 + fcvt.w.d a3, fa4 # should convert to 3 + fcvt.w.d a4, fa5 # should convert to -3 + + + # Set rounding mode to RMM + li a1, 4 + fsrm a1 + + #to nearest ties to maximum magnitude + fcvt.w.d a5, fa3, rmm # should convert to 5 + fcvt.w.d a6, fa4, rmm # should convert to 3 + fcvt.w.d a7, fa5, rmm # should convert to -4 + +)"); + + EXPECT_EQ(getFPRegister(13), (double)4.52432537); + EXPECT_EQ(getFPRegister(14), (double)2.5); + EXPECT_EQ(getFPRegister(15), (double)-3.78900003); + EXPECT_EQ(getFPRegister(16), 0x7FF8000000000000); + + // RNE + EXPECT_EQ(getGeneralRegister(5), 0x5); + EXPECT_EQ(getGeneralRegister(6), 0x2); + EXPECT_EQ(getGeneralRegister(7), 0xFFFFFFFFFFFFFFFC); + + // RTZ + EXPECT_EQ(getGeneralRegister(28), 0x4); + EXPECT_EQ(getGeneralRegister(29), 0x2); + EXPECT_EQ(getGeneralRegister(30), 0xFFFFFFFFFFFFFFFD); + + // RDN + EXPECT_EQ(getGeneralRegister(31), 0x4); + EXPECT_EQ(getGeneralRegister(10), 0x2); + EXPECT_EQ(getGeneralRegister(23), 0xFFFFFFFFFFFFFFFC); + + // RUP + EXPECT_EQ(getGeneralRegister(12), 0x5); + EXPECT_EQ(getGeneralRegister(13), 0x3); + EXPECT_EQ(getGeneralRegister(14), 0xFFFFFFFFFFFFFFFD); + + // RMM + EXPECT_EQ(getGeneralRegister(15), 0x5); + // This test won't pass as CPP doesn't provide 
functionality to tie to max + // magnitude + // EXPECT_EQ(getGeneralRegister(16), 0x3); + EXPECT_EQ(getGeneralRegister(17), 0xFFFFFFFFFFFFFFFC); +} + +INSTANTIATE_TEST_SUITE_P(RISCV, InstFloat, + ::testing::Values(std::make_tuple(EMULATION, "{}")), + paramToString); + +} // namespace \ No newline at end of file diff --git a/test/regression/riscv/instructions/jump.cc b/test/regression/riscv/instructions/jump.cc index 5aca3b453c..5f3c59520b 100644 --- a/test/regression/riscv/instructions/jump.cc +++ b/test/regression/riscv/instructions/jump.cc @@ -3,9 +3,11 @@ namespace { using InstJump = RISCVRegressionTest; +using namespace simeng::arch::riscv::InstructionGroups; TEST_P(InstJump, jalr) { RUN_RISCV(R"( + li t1, 4 jalr t0, t1, 12 addi t6, t6, 10 jalr ra, t1, 20 @@ -16,8 +18,10 @@ TEST_P(InstJump, jalr) { EXPECT_EQ(getGeneralRegister(30), 5); EXPECT_EQ(getGeneralRegister(31), 10); EXPECT_EQ(getGeneralRegister(29), 3); - EXPECT_EQ(getGeneralRegister(1), 12); - EXPECT_EQ(getGeneralRegister(5), 4); + EXPECT_EQ(getGeneralRegister(1), 16); + EXPECT_EQ(getGeneralRegister(5), 8); + + EXPECT_GROUP(R"(jalr ra, t1, 4)", BRANCH); } TEST_P(InstJump, jalrAlias) { @@ -30,6 +34,8 @@ TEST_P(InstJump, jalrAlias) { EXPECT_EQ(getGeneralRegister(31), 3); EXPECT_EQ(getGeneralRegister(1), 8); + EXPECT_GROUP(R"(jalr t0)", BRANCH); + RUN_RISCV(R"( addi ra, ra, 12 ret # jalr zero, ra, 0 @@ -40,6 +46,8 @@ TEST_P(InstJump, jalrAlias) { EXPECT_EQ(getGeneralRegister(1), 12); EXPECT_EQ(getGeneralRegister(0), 0); + EXPECT_GROUP(R"(ret)", BRANCH); + RUN_RISCV(R"( addi t0, t0, 12 jr t0 # jalr zero, t0, 0 @@ -49,6 +57,8 @@ TEST_P(InstJump, jalrAlias) { EXPECT_EQ(getGeneralRegister(31), 3); EXPECT_EQ(getGeneralRegister(1), 0); EXPECT_EQ(getGeneralRegister(0), 0); + + EXPECT_GROUP(R"(jr t0)", BRANCH); } TEST_P(InstJump, jal) { @@ -65,6 +75,8 @@ TEST_P(InstJump, jal) { EXPECT_EQ(getGeneralRegister(29), 3); EXPECT_EQ(getGeneralRegister(1), 12); EXPECT_EQ(getGeneralRegister(5), 4); + + EXPECT_GROUP(R"(jal ra, 12)", BRANCH); } TEST_P(InstJump, jalAlias) { @@ -83,11 +95,13 @@ TEST_P(InstJump, jalAlias) { EXPECT_EQ(getGeneralRegister(5), 0); EXPECT_EQ(getGeneralRegister(1), 20); EXPECT_EQ(getGeneralRegister(0), 0); + + EXPECT_GROUP(R"(j 12)", BRANCH); + EXPECT_GROUP(R"(jal -12)", BRANCH); } INSTANTIATE_TEST_SUITE_P(RISCV, InstJump, - ::testing::Values(std::make_tuple(EMULATION, - YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}")), paramToString); } // namespace diff --git a/test/regression/riscv/instructions/load.cc b/test/regression/riscv/instructions/load.cc index 01fb8e095f..0e2f919cd7 100644 --- a/test/regression/riscv/instructions/load.cc +++ b/test/regression/riscv/instructions/load.cc @@ -3,6 +3,7 @@ namespace { using InstLoad = RISCVRegressionTest; +using namespace simeng::arch::riscv::InstructionGroups; TEST_P(InstLoad, lb) { initialHeapData_.resize(16); @@ -28,6 +29,8 @@ TEST_P(InstLoad, lb) { EXPECT_EQ(getGeneralRegister(28), 0x0000000000000012); EXPECT_EQ(getGeneralRegister(7), 0xFFFFFFFFFFFFFFAD); + EXPECT_GROUP(R"(lb t2, -2(t5))", LOAD_INT); + // Load byte unsigned RUN_RISCV(R"( li a7, 214 @@ -44,6 +47,8 @@ TEST_P(InstLoad, lb) { EXPECT_EQ(getGeneralRegister(29), 0x0000000000000078); EXPECT_EQ(getGeneralRegister(28), 0x0000000000000012); EXPECT_EQ(getGeneralRegister(7), 0x00000000000000AD); + + EXPECT_GROUP(R"(lbu t2, -2(t5))", LOAD_INT); } TEST_P(InstLoad, lh) { @@ -70,6 +75,8 @@ TEST_P(InstLoad, lh) { EXPECT_EQ(getGeneralRegister(28), 0xFFFFFFFFFFFFED12); EXPECT_EQ(getGeneralRegister(7), 
0xFFFFFFFFFFFFDEAD); + EXPECT_GROUP(R"(lh t2, -2(t5))", LOAD_INT); + // Load half word unsigned RUN_RISCV(R"( li a7, 214 @@ -86,6 +93,8 @@ TEST_P(InstLoad, lh) { EXPECT_EQ(getGeneralRegister(29), 0x0000000000005678); EXPECT_EQ(getGeneralRegister(28), 0x000000000000ED12); EXPECT_EQ(getGeneralRegister(7), 0x000000000000DEAD); + + EXPECT_GROUP(R"(lhu t2, -2(t5))", LOAD_INT); } TEST_P(InstLoad, lw) { @@ -112,6 +121,8 @@ TEST_P(InstLoad, lw) { EXPECT_EQ(getGeneralRegister(28), 0xFFFFFFFFEBDAED12); EXPECT_EQ(getGeneralRegister(7), 0x000000005678DEAD); + EXPECT_GROUP(R"(lw t2, -2(t5))", LOAD_INT); + RUN_RISCV(R"( li a7, 214 ecall @@ -127,6 +138,8 @@ TEST_P(InstLoad, lw) { EXPECT_EQ(getGeneralRegister(29), 0x0000000012345678); EXPECT_EQ(getGeneralRegister(28), 0x00000000EBDAED12); EXPECT_EQ(getGeneralRegister(7), 0x000000005678DEAD); + + EXPECT_GROUP(R"(lwu t2, -2(t5))", LOAD_INT); } TEST_P(InstLoad, ld) { @@ -152,11 +165,12 @@ TEST_P(InstLoad, ld) { EXPECT_EQ(getGeneralRegister(29), 0xFEEBDAED12345678); EXPECT_EQ(getGeneralRegister(28), 0x654321FEEBDAED12); EXPECT_EQ(getGeneralRegister(7), 0xDAED12345678DEAD); + + EXPECT_GROUP(R"(ld t2, -2(t5))", LOAD_INT); } INSTANTIATE_TEST_SUITE_P(RISCV, InstLoad, - ::testing::Values(std::make_tuple(EMULATION, - YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}")), paramToString); } // namespace diff --git a/test/regression/riscv/instructions/multiplyDivide.cc b/test/regression/riscv/instructions/multiplyDivide.cc index cea3006adc..9894ec485a 100644 --- a/test/regression/riscv/instructions/multiplyDivide.cc +++ b/test/regression/riscv/instructions/multiplyDivide.cc @@ -3,6 +3,7 @@ namespace { using InstMulDiv = RISCVRegressionTest; +using namespace simeng::arch::riscv::InstructionGroups; TEST_P(InstMulDiv, mul) { initialHeapData_.resize(16); @@ -34,6 +35,8 @@ TEST_P(InstMulDiv, mul) { EXPECT_EQ(getGeneralRegister(18), 0x80000000); EXPECT_EQ(getGeneralRegister(19), 0x8000000000000000); // 2^31 * 2^32 = 2^63 (NO overflow) + + EXPECT_GROUP(R"(mul s3, s2, t2)", INT_MUL); } // TODO NYI, tests should fail @@ -54,6 +57,8 @@ TEST_P(InstMulDiv, mul) { // EXPECT_EQ(getGeneralRegister(31), -1); // EXPECT_EQ(getGeneralRegister(29), 0); // EXPECT_EQ(getGeneralRegister(28), 1); +// +// EXPECT_GROUP(R"()", INT_MUL); //} TEST_P(InstMulDiv, mulhu) { @@ -71,6 +76,8 @@ TEST_P(InstMulDiv, mulhu) { )"); EXPECT_EQ(getGeneralRegister(31), -1); EXPECT_EQ(getGeneralRegister(29), 0xFFFFFFFFFFFFFFFE); + + EXPECT_GROUP(R"(mulhu t4, t6, t6)", INT_MUL); } // TODO NYI, tests should fail @@ -89,6 +96,8 @@ TEST_P(InstMulDiv, mulhu) { // )"); // EXPECT_EQ(getGeneralRegister(31), -1); // EXPECT_EQ(getGeneralRegister(29), -1); +// +// EXPECT_GROUP(R"()", INT_MUL); //} TEST_P(InstMulDiv, mulw) { @@ -106,12 +115,13 @@ TEST_P(InstMulDiv, mulw) { li t4, 6 slli t3, t5, 30 mulw t2, t4, t3 - )"); EXPECT_EQ(getGeneralRegister(31), -1); EXPECT_EQ(getGeneralRegister(30), 1); EXPECT_EQ(getGeneralRegister(28), 1 << 30); EXPECT_EQ(getGeneralRegister(7), 0xFFFFFFFF80000000); + + EXPECT_GROUP(R"(mulw t2, t4, t3)", INT_MUL); } TEST_P(InstMulDiv, div) { @@ -136,7 +146,6 @@ TEST_P(InstMulDiv, div) { div t2, s2, s3 ld t1, 8(a0) div s4, t1, t6 - )"); EXPECT_EQ(getGeneralRegister(31), -1); EXPECT_EQ(getGeneralRegister(30), 1); //-1/-1 = 1 @@ -147,6 +156,8 @@ TEST_P(InstMulDiv, div) { EXPECT_EQ(getGeneralRegister(6), 0x8000000000000000); EXPECT_EQ(getGeneralRegister(20), 0x8000000000000000); // division overflow + + EXPECT_GROUP(R"(div s4, t1, t6)", INT_DIV_OR_SQRT); } TEST_P(InstMulDiv, divw) { 
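The div and rem expectations above rely on the RISC-V M-extension's trap-free edge cases rather than native C++ behaviour: division never raises an exception, and the signed-overflow case INT64_MIN / -1 keeps the quotient at INT64_MIN with a remainder of zero. A minimal sketch of those rules follows; the helper names are illustrative only and are not taken from the SimEng sources.

#include <cstdint>
#include <limits>

// Signed RV64 division, per the M-extension rules (never traps).
int64_t rv64Div(int64_t rs1, int64_t rs2) {
  if (rs2 == 0) return -1;  // divide-by-zero: quotient is all ones
  if (rs1 == std::numeric_limits<int64_t>::min() && rs2 == -1)
    return rs1;  // signed overflow: quotient stays at INT64_MIN (0x8000000000000000)
  return rs1 / rs2;
}

// Signed RV64 remainder, per the M-extension rules (never traps).
int64_t rv64Rem(int64_t rs1, int64_t rs2) {
  if (rs2 == 0) return rs1;  // divide-by-zero: remainder is the dividend
  if (rs1 == std::numeric_limits<int64_t>::min() && rs2 == -1)
    return 0;  // signed overflow: remainder is zero
  return rs1 % rs2;
}

For example, rv64Div(INT64_MIN, -1) == INT64_MIN matches the "division overflow" expectation of 0x8000000000000000 in the div test above, and rv64Rem(INT64_MIN, -1) == 0 matches the corresponding rem expectation.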
@@ -181,6 +192,8 @@ TEST_P(InstMulDiv, divw) { EXPECT_EQ(getGeneralRegister(6), 0xFFFFFFFF80000000); EXPECT_EQ(getGeneralRegister(20), 0xFFFFFFFF80000000); // division overflow + + EXPECT_GROUP(R"(divw s4, t1, t6)", INT_DIV_OR_SQRT); } TEST_P(InstMulDiv, divu) { @@ -216,6 +229,8 @@ TEST_P(InstMulDiv, divu) { EXPECT_EQ(getGeneralRegister(7), 8); // 16/2 = 8 EXPECT_EQ(getGeneralRegister(6), 0x8000000000000000); EXPECT_EQ(getGeneralRegister(20), 0); // big / max pos = 0 + + EXPECT_GROUP(R"(divu s4, t1, t6)", INT_DIV_OR_SQRT); } TEST_P(InstMulDiv, divuw) { @@ -251,6 +266,8 @@ TEST_P(InstMulDiv, divuw) { EXPECT_EQ(getGeneralRegister(7), 8); // 16/2 = 8 EXPECT_EQ(getGeneralRegister(6), 0xFFFFFFFF80000000); EXPECT_EQ(getGeneralRegister(20), 0); // // big pos / max pos = 0 + + EXPECT_GROUP(R"(divuw s4, t1, t6)", INT_DIV_OR_SQRT); } TEST_P(InstMulDiv, rem) { @@ -287,6 +304,8 @@ TEST_P(InstMulDiv, rem) { EXPECT_EQ(getGeneralRegister(7), -2); // -16/-7 = -2 EXPECT_EQ(getGeneralRegister(6), 0x8000000000000000); EXPECT_EQ(getGeneralRegister(20), 0); // max pos/-1 = 0 + + EXPECT_GROUP(R"(rem s4, t1, t6)", INT_DIV_OR_SQRT); } TEST_P(InstMulDiv, remw) { @@ -327,6 +346,8 @@ TEST_P(InstMulDiv, remw) { EXPECT_EQ(getGeneralRegister(7), -2); // -16/-7 = 2 EXPECT_EQ(getGeneralRegister(6), 0xFFFFFFFF80000000); EXPECT_EQ(getGeneralRegister(20), 0); // big pos/max pos = 0 + + EXPECT_GROUP(R"(remw s4, t1, t6)", INT_DIV_OR_SQRT); } TEST_P(InstMulDiv, remu) { @@ -364,6 +385,8 @@ TEST_P(InstMulDiv, remu) { EXPECT_EQ(getGeneralRegister(6), 0x8000000000000000); EXPECT_EQ(getGeneralRegister(20), 0x8000000000000000); // big pos/max pos = big pos + + EXPECT_GROUP(R"(remu s4, t1, t6)", INT_DIV_OR_SQRT); } TEST_P(InstMulDiv, remuw) { @@ -405,11 +428,12 @@ TEST_P(InstMulDiv, remuw) { EXPECT_EQ(getGeneralRegister(6), 0xFFFFFFFF80000000); EXPECT_EQ(getGeneralRegister(20), 0xFFFFFFFF80000000); // big pos/max pos = 0 + + EXPECT_GROUP(R"(remuw s4, t1, t6)", INT_DIV_OR_SQRT); } INSTANTIATE_TEST_SUITE_P(RISCV, InstMulDiv, - ::testing::Values(std::make_tuple(EMULATION, - YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}")), paramToString); } // namespace diff --git a/test/regression/riscv/instructions/store.cc b/test/regression/riscv/instructions/store.cc index d03f7c4585..ea2260481b 100644 --- a/test/regression/riscv/instructions/store.cc +++ b/test/regression/riscv/instructions/store.cc @@ -3,6 +3,7 @@ namespace { using InstStore = RISCVRegressionTest; +using namespace simeng::arch::riscv::InstructionGroups; TEST_P(InstStore, sb) { initialHeapData_.resize(16); @@ -23,6 +24,8 @@ TEST_P(InstStore, sb) { EXPECT_EQ(getGeneralRegister(10), 32); EXPECT_EQ(getMemoryValue(33), 0x0012AA56); EXPECT_EQ(getMemoryValue(37), 0x00005400); + + EXPECT_GROUP(R"(sb t6, 6(a0))", STORE_INT); } TEST_P(InstStore, sh) { @@ -49,6 +52,8 @@ TEST_P(InstStore, sh) { EXPECT_EQ(getMemoryValue(64), 0x1200AA78); EXPECT_EQ(getMemoryValue(69), 0x00015400); EXPECT_EQ(getMemoryValue(73), 0x0054AA00); + + EXPECT_GROUP(R"(sh t6, 10(a0))", STORE_INT); } TEST_P(InstStore, sw) { @@ -78,7 +83,10 @@ TEST_P(InstStore, sw) { EXPECT_EQ(getGeneralRegister(10), 64); EXPECT_EQ(getMemoryValue(64), 0xAAADBE000000AA78); EXPECT_EQ(getMemoryValue(69), 0x0087015400AAADBE); - EXPECT_EQ(getMemoryValue(process_->getStackPointer()), 0x5400AA00); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer()), + 0x5400AA00); + + EXPECT_GROUP(R"(sw t6, 0(sp))", STORE_INT); } TEST_P(InstStore, sd) { @@ -107,13 +115,14 @@ TEST_P(InstStore, sd) { 
EXPECT_EQ(getGeneralRegister(10), 68); EXPECT_EQ(getMemoryValue(64), 0x0154000000AA5678); EXPECT_EQ(getMemoryValue(68), 0x8765000001540000); - EXPECT_EQ(getMemoryValue(process_->getStackPointer() + 4), + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() + 4), 0x000154000000AA01); + + EXPECT_GROUP(R"(sd t6, 4(sp))", STORE_INT); } INSTANTIATE_TEST_SUITE_P(RISCV, InstStore, - ::testing::Values(std::make_tuple(EMULATION, - YAML::Load("{}"))), + ::testing::Values(std::make_tuple(EMULATION, "{}")), paramToString); } // namespace diff --git a/test/unit/ArchitecturalRegisterFileSetTest.cc b/test/unit/ArchitecturalRegisterFileSetTest.cc new file mode 100644 index 0000000000..1529ef1cea --- /dev/null +++ b/test/unit/ArchitecturalRegisterFileSetTest.cc @@ -0,0 +1,41 @@ +#include "gtest/gtest.h" +#include "simeng/ArchitecturalRegisterFileSet.hh" + +namespace simeng { +namespace pipeline { + +class ArchitecturalRegisterFileSetTest : public ::testing::Test { + public: + ArchitecturalRegisterFileSetTest() + : physRegFileSet(regFileStruct), archRegFileSet(physRegFileSet) {} + + protected: + const std::vector regFileStruct = { + {8, 10}, {24, 15}, {256, 31}}; + + RegisterFileSet physRegFileSet; + + ArchitecturalRegisterFileSet archRegFileSet; +}; + +// Ensure we can read and write values to the architectural register file +TEST_F(ArchitecturalRegisterFileSetTest, readWrite) { + for (uint8_t i = 0; i < regFileStruct.size(); i++) { + const uint16_t regSize = regFileStruct[i].bytes; + const uint16_t maxRegTag = regFileStruct[i].quantity - 1; + const Register r0 = {i, 0}; + const Register rMax = {i, maxRegTag}; + + EXPECT_EQ(archRegFileSet.get(r0), RegisterValue(0, regSize)); + EXPECT_EQ(archRegFileSet.get(rMax), RegisterValue(0, regSize)); + + archRegFileSet.set(r0, RegisterValue(20, regSize)); + archRegFileSet.set(rMax, RegisterValue(40, regSize)); + + EXPECT_EQ(archRegFileSet.get(r0), RegisterValue(20, regSize)); + EXPECT_EQ(archRegFileSet.get(rMax), RegisterValue(40, regSize)); + } +} + +} // namespace pipeline +} // namespace simeng \ No newline at end of file diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index a3e400aad2..2826cd0030 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -1,28 +1,51 @@ set(TEST_SOURCES + aarch64/ArchInfoTest.cc + aarch64/ArchitectureTest.cc + aarch64/AuxiliaryFunctionsTest.cc + aarch64/ExceptionHandlerTest.cc + aarch64/InstructionTest.cc + aarch64/OperandContainerTest.cc + riscv/ArchInfoTest.cc + riscv/ArchitectureTest.cc + riscv/ExceptionHandlerTest.cc + riscv/InstructionTest.cc pipeline/A64FXPortAllocatorTest.cc pipeline/BalancedPortAllocatorTest.cc - pipeline/ExecuteUnitTest.cc pipeline/DecodeUnitTest.cc + pipeline/DispatchIssueUnitTest.cc pipeline/ExecuteUnitTest.cc pipeline/FetchUnitTest.cc pipeline/LoadStoreQueueTest.cc + pipeline/M1PortAllocatorTest.cc + pipeline/MappedRegisterFileSetTest.cc pipeline/PipelineBufferTest.cc pipeline/RegisterAliasTableTest.cc + pipeline/RenameUnitTest.cc pipeline/ReorderBufferTest.cc pipeline/WritebackUnitTest.cc + ArchitecturalRegisterFileSetTest.cc + ElfTest.cc + FixedLatencyMemoryInterfaceTest.cc + FlatMemoryInterfaceTest.cc GenericPredictorTest.cc - ISATest.cc - RegisterValueTest.cc + OSTest.cc PoolTest.cc - ShiftValueTest.cc - LatencyMemoryInterfaceTest.cc + ProcessTest.cc + RegisterFileSetTest.cc + RegisterValueTest.cc + PerceptronPredictorTest.cc + SpecialFileDirGenTest.cc ) add_executable(unittests ${TEST_SOURCES}) 
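As a usage sketch of the register-file API exercised by ArchitecturalRegisterFileSetTest above: the API shape is assumed from that test, and RegisterValue::get<uint64_t>() is an assumption of this sketch rather than something shown in the diff.

#include <cstdint>
#include <vector>

#include "simeng/ArchitecturalRegisterFileSet.hh"

int main() {
  // Three register files described as {bytes per register, register count},
  // mirroring the test's regFileStruct.
  const std::vector<simeng::RegisterFileStructure> layout = {
      {8, 10}, {24, 15}, {256, 31}};
  simeng::RegisterFileSet physical(layout);
  simeng::ArchitecturalRegisterFileSet arch(physical);

  // Registers are addressed as {file index, tag}; they read back as zero
  // until written.
  const simeng::Register r = {1, 3};
  arch.set(r, simeng::RegisterValue(20, 24));  // width must match the 24-byte file
  return arch.get(r).get<uint64_t>() == 20 ? 0 : 1;
}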
+configure_file(${capstone_SOURCE_DIR}/arch/AArch64/AArch64GenInstrInfo.inc AArch64GenInstrInfo.inc COPYONLY) +configure_file(${capstone_SOURCE_DIR}/arch/RISCV/RISCVGenInstrInfo.inc RISCVGenInstrInfo.inc COPYONLY) + target_include_directories(unittests PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) target_include_directories(unittests PUBLIC ${PROJECT_SOURCE_DIR}/src/lib) target_link_libraries(unittests libsimeng) target_link_libraries(unittests gmock_main) +target_compile_options(unittests PRIVATE ${SIMENG_COMPILE_OPTIONS}) add_test(NAME unit_tests COMMAND unittests) diff --git a/test/unit/ConfigInit.hh b/test/unit/ConfigInit.hh new file mode 100644 index 0000000000..32b3c6ef6a --- /dev/null +++ b/test/unit/ConfigInit.hh @@ -0,0 +1,18 @@ +#pragma once + +#include "simeng/config/SimInfo.hh" +#include "simeng/version.hh" + +namespace simeng { + +// This small class' purpose is to initialise the SimInfo config before the +// initialisation of a test class +class ConfigInit { + public: + ConfigInit(config::ISA isa, std::string configAdditions) { + config::SimInfo::generateDefault(isa, true); + config::SimInfo::addToConfig(configAdditions); + } +}; + +} // namespace simeng \ No newline at end of file diff --git a/test/unit/ElfTest.cc b/test/unit/ElfTest.cc new file mode 100644 index 0000000000..9635304bf3 --- /dev/null +++ b/test/unit/ElfTest.cc @@ -0,0 +1,67 @@ +#include "gmock/gmock.h" +#include "simeng/Elf.hh" +#include "simeng/version.hh" + +using ::testing::_; +using ::testing::HasSubstr; +using ::testing::Return; + +namespace simeng { + +class ElfTest : public testing::Test { + public: + ElfTest() {} + + protected: + const std::string knownElfFilePath = + SIMENG_SOURCE_DIR "/test/unit/data/stream-aarch64.elf"; + + const uint64_t known_entryPoint = 4206008; + const uint16_t known_e_phentsize = 56; + const uint16_t known_e_phnum = 6; + const uint64_t known_phdrTableAddress = 4194368; + const uint64_t known_processImageSize = 5040480; + + char* unwrappedProcImgPtr; +}; + +// Test that a valid ELF file can be created +TEST_F(ElfTest, validElf) { + Elf elf(knownElfFilePath, &unwrappedProcImgPtr); + + EXPECT_TRUE(elf.isValid()); + EXPECT_EQ(elf.getEntryPoint(), known_entryPoint); + EXPECT_EQ(elf.getPhdrEntrySize(), known_e_phentsize); + EXPECT_EQ(elf.getNumPhdr(), known_e_phnum); + EXPECT_EQ(elf.getPhdrTableAddress(), known_phdrTableAddress); + EXPECT_EQ(elf.getProcessImageSize(), known_processImageSize); +} + +// Test that wrong filepath results in invalid ELF +TEST_F(ElfTest, invalidElf) { + Elf elf(SIMENG_SOURCE_DIR "/test/bogus_file_path___--__--__", + &unwrappedProcImgPtr); + EXPECT_FALSE(elf.isValid()); +} + +// Test that non-ELF file is not accepted +TEST_F(ElfTest, nonElf) { + testing::internal::CaptureStderr(); + Elf elf(SIMENG_SOURCE_DIR "/test/unit/ElfTest.cc", &unwrappedProcImgPtr); + EXPECT_FALSE(elf.isValid()); + EXPECT_THAT(testing::internal::GetCapturedStderr(), + HasSubstr("[SimEng:Elf] Elf magic does not match")); +} + +// Check that 32-bit ELF is not accepted +TEST_F(ElfTest, format32Elf) { + testing::internal::CaptureStderr(); + Elf elf(SIMENG_SOURCE_DIR "/test/unit/data/stream.rv32ima.elf", + &unwrappedProcImgPtr); + EXPECT_FALSE(elf.isValid()); + EXPECT_THAT( + testing::internal::GetCapturedStderr(), + HasSubstr("[SimEng:Elf] Unsupported architecture detected in Elf")); +} + +} // namespace simeng \ No newline at end of file diff --git a/test/unit/FixedLatencyMemoryInterfaceTest.cc b/test/unit/FixedLatencyMemoryInterfaceTest.cc new file mode 100644 index 0000000000..e2cc28c2fd --- 
/dev/null +++ b/test/unit/FixedLatencyMemoryInterfaceTest.cc @@ -0,0 +1,141 @@ +#include "gtest/gtest.h" +#include "simeng/memory/FixedLatencyMemoryInterface.hh" + +namespace { + +class FixedLatencyMemoryInterfaceTest + : public testing::TestWithParam { + public: + FixedLatencyMemoryInterfaceTest() + : memory(memoryData.data(), memorySize, GetParam()) {} + + protected: + static constexpr uint16_t memorySize = 4; + std::array memoryData = {(char)0xFE, (char)0xCA, (char)0xBA, + (char)0xAB}; + + simeng::RegisterValue value = {0xDEADBEEF, 4}; + simeng::RegisterValue value_oversized = {0xDEADBEEFDEADBEEF, 8}; + simeng::memory::MemoryAccessTarget target = {0, 4}; + simeng::memory::MemoryAccessTarget target_OutOfBound1 = {1000, 4}; + simeng::memory::MemoryAccessTarget target_OutOfBound2 = {0, 8}; + + const std::string writeOverflowStr = + "Attempted to write beyond memory limit."; + + simeng::memory::FixedLatencyMemoryInterface memory; +}; + +// Test that we can read data and it completes after n cycles. +TEST_P(FixedLatencyMemoryInterfaceTest, FixedReadData) { + // Read a 32-bit value + memory.requestRead(target, 1); + EXPECT_TRUE(memory.hasPendingRequests()); + + // Tick n-1 times - request should still be pending + uint16_t latency = GetParam(); + for (int n = 0; n < latency - 1; n++) { + memory.tick(); + EXPECT_TRUE(memory.hasPendingRequests()); + } + + // Tick again - request should have completed + memory.tick(); + EXPECT_FALSE(memory.hasPendingRequests()); + + auto entries = memory.getCompletedReads(); + EXPECT_EQ(entries.size(), 1); + EXPECT_EQ(entries[0].requestId, 1); + EXPECT_EQ(entries[0].data, simeng::RegisterValue(0xABBACAFE, 4)); + EXPECT_EQ(entries[0].target, target); +} + +// Test that we can write data and it completes after n cycles. +TEST_P(FixedLatencyMemoryInterfaceTest, FixedWriteData) { + // Write a 32-bit value to memory + memory.requestWrite(target, value); + EXPECT_TRUE(memory.hasPendingRequests()); + + // Tick n-1 times - request should still be pending + uint16_t latency = GetParam(); + for (int n = 0; n < latency - 1; n++) { + memory.tick(); + EXPECT_TRUE(memory.hasPendingRequests()); + } + + // Tick again - request should have completed + memory.tick(); + EXPECT_FALSE(memory.hasPendingRequests()); + EXPECT_EQ(reinterpret_cast(memoryData.data())[0], 0xDEADBEEF); +} + +// Test that out-of-bounds memory reads are correctly handled. +TEST_P(FixedLatencyMemoryInterfaceTest, OutofBoundsRead) { + // Create a target such that address + size will overflow + memory.requestRead(target_OutOfBound1, 1); + + // Create a regular out-of-bounds target + memory.requestRead(target_OutOfBound2, 2); + + // Tick n-1 times - request shouldn't have completed + uint16_t latency = GetParam(); + for (int n = 0; n < latency - 1; n++) { + memory.tick(); + EXPECT_TRUE(memory.hasPendingRequests()); + } + + // Tick again - request should have completed + memory.tick(); + EXPECT_FALSE(memory.hasPendingRequests()); + + auto entries = memory.getCompletedReads(); + EXPECT_EQ(entries.size(), 2); + + auto overflowResult = entries[0]; + EXPECT_EQ(overflowResult.requestId, 1); + EXPECT_FALSE(overflowResult.data); + EXPECT_EQ(overflowResult.target, target_OutOfBound1); + + overflowResult = entries[1]; + EXPECT_EQ(overflowResult.requestId, 2); + EXPECT_FALSE(overflowResult.data); + EXPECT_EQ(overflowResult.target, target_OutOfBound2); +} + +// Test that out-of-bounds memory writes are correctly handled. 
+TEST_P(FixedLatencyMemoryInterfaceTest, OutofBoundsWrite_1) { + // Create a target such that address + size will overflow + memory.requestWrite(target_OutOfBound1, value); + + // Tick n-1 times - request shouldn't have completed + uint16_t latency = GetParam(); + for (int n = 0; n < latency - 1; n++) { + memory.tick(); + EXPECT_TRUE(memory.hasPendingRequests()); + } + + // Tick again - simulation should have come to a stop + ASSERT_DEATH(memory.tick(), writeOverflowStr); +} + +// Test that out-of-bounds memory writes are correctly handled. +TEST_P(FixedLatencyMemoryInterfaceTest, OutofBoundsWrite_2) { + // Create a regular out-of-bounds target + memory.requestWrite(target_OutOfBound2, value_oversized); + + // Tick n-1 times - request shouldn't have completed + uint16_t latency = GetParam(); + for (int n = 0; n < latency - 1; n++) { + memory.tick(); + EXPECT_TRUE(memory.hasPendingRequests()); + } + + // Tick again - simulation should have come to a stop + ASSERT_DEATH(memory.tick(), writeOverflowStr); +} + +INSTANTIATE_TEST_SUITE_P(FixedLatencyMemoryInterfaceTests, + FixedLatencyMemoryInterfaceTest, + ::testing::Values(2, 4)); + +} // namespace diff --git a/test/unit/FlatMemoryInterfaceTest.cc b/test/unit/FlatMemoryInterfaceTest.cc new file mode 100644 index 0000000000..e04b895179 --- /dev/null +++ b/test/unit/FlatMemoryInterfaceTest.cc @@ -0,0 +1,81 @@ +#include "gtest/gtest.h" +#include "simeng/memory/FlatMemoryInterface.hh" + +namespace { + +class FlatMemoryInterfaceTest : public testing::Test { + public: + FlatMemoryInterfaceTest() : memory(memoryData.data(), memorySize) {} + + protected: + static constexpr uint16_t memorySize = 4; + std::array memoryData = {(char)0xFE, (char)0xCA, (char)0xBA, + (char)0xAB}; + + simeng::RegisterValue value = {0xDEADBEEF, 4}; + simeng::RegisterValue value_oversized = {0xDEADBEEFDEADBEEF, 8}; + simeng::memory::MemoryAccessTarget target = {0, 4}; + simeng::memory::MemoryAccessTarget target_OutOfBound1 = {1000, 4}; + simeng::memory::MemoryAccessTarget target_OutOfBound2 = {0, 8}; + + const std::string writeOverflowStr = + "Attempted to write beyond memory limit."; + + simeng::memory::FlatMemoryInterface memory; +}; + +// Test that we can read data and it completes after zero cycles. +TEST_F(FlatMemoryInterfaceTest, FixedReadData) { + // Read a 32-bit value + memory.requestRead(target, 1); + auto entries = memory.getCompletedReads(); + EXPECT_EQ(entries.size(), 1); + EXPECT_EQ(entries[0].requestId, 1); + EXPECT_EQ(entries[0].data, simeng::RegisterValue(0xABBACAFE, 4)); + EXPECT_EQ(entries[0].target, target); +} + +// Test that we can write data and it completes after zero cycles. +TEST_F(FlatMemoryInterfaceTest, FixedWriteData) { + // Write a 32-bit value to memory + memory.requestWrite(target, value); + EXPECT_EQ(reinterpret_cast(memoryData.data())[0], 0xDEADBEEF); +} + +// Test that out-of-bounds memory reads are correctly handled. 
+TEST_F(FlatMemoryInterfaceTest, OutofBoundsRead) { + // Create a target such that address + size will overflow + memory.requestRead(target_OutOfBound1, 1); + + // Create a regular out-of-bounds target + memory.requestRead(target_OutOfBound2, 2); + + auto entries = memory.getCompletedReads(); + EXPECT_EQ(entries.size(), 2); + + auto overflowResult = entries[0]; + EXPECT_EQ(overflowResult.requestId, 1); + EXPECT_FALSE(overflowResult.data); + EXPECT_EQ(overflowResult.target, target_OutOfBound1); + + overflowResult = entries[1]; + EXPECT_EQ(overflowResult.requestId, 2); + EXPECT_FALSE(overflowResult.data); + EXPECT_EQ(overflowResult.target, target_OutOfBound2); +} + +// Test that out-of-bounds memory writes are correctly handled. +TEST_F(FlatMemoryInterfaceTest, OutofBoundsWrite_1) { + // Create a target such that address + size will overflow + ASSERT_DEATH(memory.requestWrite(target_OutOfBound1, value), + writeOverflowStr); +} + +// Test that out-of-bounds memory writes are correctly handled. +TEST_F(FlatMemoryInterfaceTest, OutofBoundsWrite_2) { + // Create a regular out-of-bounds target + ASSERT_DEATH(memory.requestWrite(target_OutOfBound2, value_oversized), + writeOverflowStr); +} + +} // namespace diff --git a/test/unit/GenericPredictorTest.cc b/test/unit/GenericPredictorTest.cc index 522bad3067..66ec9155c7 100644 --- a/test/unit/GenericPredictorTest.cc +++ b/test/unit/GenericPredictorTest.cc @@ -1,6 +1,7 @@ +#include "ConfigInit.hh" #include "MockInstruction.hh" #include "gtest/gtest.h" -#include "simeng/GenericPredictor.hh" +#include "simeng/branchpredictors/GenericPredictor.hh" namespace simeng { @@ -18,138 +19,298 @@ class GenericPredictorTest : public testing::Test { // Tests that a GenericPredictor will predict the correct direction on a // miss TEST_F(GenericPredictorTest, Miss) { - auto predictor = simeng::GenericPredictor(YAML::Load( - "{Branch-Predictor: {BTB-Tag-Bits: 11, Saturating-Count-Bits: 2, " - "Global-History-Length: 10, RAS-entries: 5, Fallback-Static-Predictor: " - "2}}")); + ConfigInit configInit = ConfigInit( + config::ISA::AArch64, + "{Branch-Predictor: {Type: Generic, BTB-Tag-Bits: 11, " + "Saturating-Count-Bits: 2, Global-History-Length: 10, RAS-entries: 5, " + "Fallback-Static-Predictor: Always-Taken}}"); + auto predictor = simeng::GenericPredictor(); auto prediction = predictor.predict(0, BranchType::Conditional, 0); - EXPECT_TRUE(prediction.taken); + EXPECT_TRUE(prediction.isTaken); - predictor = simeng::GenericPredictor(YAML::Load( - "{Branch-Predictor: {BTB-Tag-Bits: 11, Saturating-Count-Bits: 2, " - "Global-History-Length: 10, RAS-entries: 5, Fallback-Static-Predictor: " - "1}}")); + configInit = ConfigInit( + config::ISA::AArch64, + "{Branch-Predictor: {Type: Generic, BTB-Tag-Bits: 11, " + "Saturating-Count-Bits: 2, Global-History-Length: 10, RAS-entries: 5, " + "Fallback-Static-Predictor: Always-Not-Taken}}"); + predictor = simeng::GenericPredictor(); prediction = predictor.predict(0, BranchType::Conditional, 0); - EXPECT_FALSE(prediction.taken); + EXPECT_FALSE(prediction.isTaken); prediction = predictor.predict(8, BranchType::Unconditional, 0); - EXPECT_TRUE(prediction.taken); + EXPECT_TRUE(prediction.isTaken); } // Tests that a GenericPredictor will predict branch-and-link return pairs // correctly TEST_F(GenericPredictorTest, RAS) { - auto predictor = simeng::GenericPredictor(YAML::Load( - "{Branch-Predictor: {BTB-Tag-Bits: 11, Saturating-Count-Bits: 2, " - "Global-History-Length: 10, RAS-entries: 10, Fallback-Static-Predictor: " - "2}}")); + 
simeng::config::SimInfo::addToConfig( + "{Branch-Predictor: {Type: Generic, BTB-Tag-Bits: 11, " + "Saturating-Count-Bits: 2, Global-History-Length: 10, RAS-entries: 10, " + "Fallback-Static-Predictor: Always-Taken}}"); + auto predictor = simeng::GenericPredictor(); auto prediction = predictor.predict(8, BranchType::SubroutineCall, 8); - EXPECT_TRUE(prediction.taken); + EXPECT_TRUE(prediction.isTaken); EXPECT_EQ(prediction.target, 16); prediction = predictor.predict(24, BranchType::SubroutineCall, 8); - EXPECT_TRUE(prediction.taken); + EXPECT_TRUE(prediction.isTaken); EXPECT_EQ(prediction.target, 32); prediction = predictor.predict(40, BranchType::SubroutineCall, 8); - EXPECT_TRUE(prediction.taken); + EXPECT_TRUE(prediction.isTaken); EXPECT_EQ(prediction.target, 48); prediction = predictor.predict(56, BranchType::SubroutineCall, 8); - EXPECT_TRUE(prediction.taken); + EXPECT_TRUE(prediction.isTaken); EXPECT_EQ(prediction.target, 64); prediction = predictor.predict(72, BranchType::SubroutineCall, 8); - EXPECT_TRUE(prediction.taken); + EXPECT_TRUE(prediction.isTaken); EXPECT_EQ(prediction.target, 80); prediction = predictor.predict(84, BranchType::Return, 0); - EXPECT_TRUE(prediction.taken); + EXPECT_TRUE(prediction.isTaken); EXPECT_EQ(prediction.target, 76); prediction = predictor.predict(68, BranchType::Return, 0); - EXPECT_TRUE(prediction.taken); + EXPECT_TRUE(prediction.isTaken); EXPECT_EQ(prediction.target, 60); prediction = predictor.predict(52, BranchType::Return, 0); - EXPECT_TRUE(prediction.taken); + EXPECT_TRUE(prediction.isTaken); EXPECT_EQ(prediction.target, 44); prediction = predictor.predict(36, BranchType::Return, 0); - EXPECT_TRUE(prediction.taken); + EXPECT_TRUE(prediction.isTaken); EXPECT_EQ(prediction.target, 28); prediction = predictor.predict(20, BranchType::Return, 0); - EXPECT_TRUE(prediction.taken); + EXPECT_TRUE(prediction.isTaken); EXPECT_EQ(prediction.target, 12); } // Tests that a GenericPredictor will predict a previously encountered branch // correctly, when no address aliasing has occurred TEST_F(GenericPredictorTest, Hit) { - auto predictor = simeng::GenericPredictor(YAML::Load( - "{Branch-Predictor: {BTB-Tag-Bits: 11, Saturating-Count-Bits: 2, " - "Global-History-Length: 1, RAS-entries: 5, Fallback-Static-Predictor: " - "2}}")); - predictor.update(0, true, 16, BranchType::Conditional); - predictor.update(0, true, 16, BranchType::Conditional); - predictor.update(0, true, 16, BranchType::Conditional); - predictor.update(0, true, 16, BranchType::Conditional); - predictor.update(0, false, 16, BranchType::Conditional); + simeng::config::SimInfo::addToConfig( + "{Branch-Predictor: {Type: Generic, BTB-Tag-Bits: 5, " + "Saturating-Count-Bits: 2, Global-History-Length: 1, RAS-entries: 5, " + "Fallback-Static-Predictor: Always-Taken}}"); + auto predictor = simeng::GenericPredictor(); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 16, BranchType::Conditional, 0); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 16, BranchType::Conditional, 1); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 16, BranchType::Conditional, 2); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 16, BranchType::Conditional, 3); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 16, BranchType::Conditional, 4); auto prediction = predictor.predict(0, BranchType::Conditional, 0); - EXPECT_TRUE(prediction.taken); + EXPECT_TRUE(prediction.isTaken); 
EXPECT_EQ(prediction.target, 16); } // Tests that a GenericPredictor will predict correctly for two different // behaviours of the same branch but in different states of the program TEST_F(GenericPredictorTest, GlobalIndexing) { - auto predictor = simeng::GenericPredictor(YAML::Load( - "{Branch-Predictor: {BTB-Tag-Bits: 11, Saturating-Count-Bits: 2, " - "Global-History-Length: 5, RAS-entries: 5, Fallback-Static-Predictor: " - "1}}")); + simeng::config::SimInfo::addToConfig( + "{Branch-Predictor: {Type: Generic, BTB-Tag-Bits: 10, " + "Saturating-Count-Bits: 2, Global-History-Length: 10, RAS-entries: 5, " + "Fallback-Static-Predictor: Always-Not-Taken}}"); + auto predictor = simeng::GenericPredictor(); // Spool up first global history pattern - predictor.update(0, true, 4, BranchType::Unconditional); - predictor.update(0, false, 4, BranchType::Unconditional); - predictor.update(0, false, 4, BranchType::Unconditional); - predictor.update(0, false, 4, BranchType::Unconditional); - predictor.update(0, true, 4, BranchType::Unconditional); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 0); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 1); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 2); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 3); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 4); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 5); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 6); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 7); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 8); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 9); // Ensure default behaviour for first encounter - auto prediction = predictor.predict(0x1F, BranchType::Conditional, 0); - EXPECT_FALSE(prediction.taken); - EXPECT_EQ(prediction.target, 0x23); + auto prediction = predictor.predict(0x7C, BranchType::Conditional, 0); + EXPECT_FALSE(prediction.isTaken); + EXPECT_EQ(prediction.target, 0x80); // Set entry in BTB - predictor.update(0x1F, true, 0xAB, BranchType::Conditional); + predictor.update(0x7C, true, 0xAB, BranchType::Conditional, 10); // Spool up second global history pattern - predictor.update(0, false, 4, BranchType::Unconditional); - predictor.update(0, true, 4, BranchType::Unconditional); - predictor.update(0, true, 4, BranchType::Unconditional); - predictor.update(0, true, 4, BranchType::Unconditional); - predictor.update(0, false, 4, BranchType::Unconditional); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 16, BranchType::Conditional, 11); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 16, BranchType::Conditional, 12); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 16, BranchType::Conditional, 13); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 16, BranchType::Conditional, 14); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 16, 
BranchType::Conditional, 15); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 16, BranchType::Conditional, 16); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 16, BranchType::Conditional, 17); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 16, BranchType::Conditional, 18); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 16, BranchType::Conditional, 19); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 16, BranchType::Conditional, 20); // Ensure default behaviour for re-encounter but with different global history - prediction = predictor.predict(0x1F, BranchType::Conditional, 0); - EXPECT_FALSE(prediction.taken); - EXPECT_EQ(prediction.target, 0x23); + prediction = predictor.predict(0x7C, BranchType::Conditional, 0); + EXPECT_FALSE(prediction.isTaken); + EXPECT_EQ(prediction.target, 0x80); // Set entry in BTB - predictor.update(0x1F, true, 0xBA, BranchType::Conditional); + predictor.update(0x7C, true, 0xBA, BranchType::Conditional, 21); // Recreate first global history pattern - predictor.update(0, true, 4, BranchType::Unconditional); - predictor.update(0, false, 4, BranchType::Unconditional); - predictor.update(0, false, 4, BranchType::Unconditional); - predictor.update(0, false, 4, BranchType::Unconditional); - predictor.update(0, true, 4, BranchType::Unconditional); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 22); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 23); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 24); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 25); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 26); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 27); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 28); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 29); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 30); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 31); // Get prediction - prediction = predictor.predict(0x1F, BranchType::Conditional, 0); - EXPECT_TRUE(prediction.taken); + prediction = predictor.predict(0x7C, BranchType::Conditional, 0); + EXPECT_TRUE(prediction.isTaken); EXPECT_EQ(prediction.target, 0xAB); // Set entry in BTB - predictor.update(0x1F, true, 0xAB, BranchType::Conditional); + predictor.update(0x7C, true, 0xAB, BranchType::Conditional, 32); // Recreate second global history pattern - predictor.update(0, false, 4, BranchType::Unconditional); - predictor.update(0, true, 4, BranchType::Unconditional); - predictor.update(0, true, 4, BranchType::Unconditional); - predictor.update(0, true, 4, BranchType::Unconditional); - predictor.update(0, false, 4, BranchType::Unconditional); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 33); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, 
BranchType::Conditional, 34); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 35); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 36); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 37); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 16, BranchType::Conditional, 38); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 16, BranchType::Conditional, 39); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 16, BranchType::Conditional, 40); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 16, BranchType::Conditional, 41); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 16, BranchType::Conditional, 42); // Get prediction - prediction = predictor.predict(0x1F, BranchType::Conditional, 0); - EXPECT_TRUE(prediction.taken); + prediction = predictor.predict(0x7C, BranchType::Conditional, 0); + EXPECT_TRUE(prediction.isTaken); EXPECT_EQ(prediction.target, 0xBA); - predictor.update(0x1F, true, 0xBA, BranchType::Conditional); + predictor.update(0x7C, true, 0xBA, BranchType::Conditional, 43); +} + +// Test Flush of RAS functionality +TEST_F(GenericPredictorTest, flush) { + simeng::config::SimInfo::addToConfig( + "{Branch-Predictor: {BTB-Tag-Bits: 11, Saturating-Count-Bits: 2, " + "Global-History-Length: 10, RAS-entries: 10, Fallback-Static-Predictor: " + "Always-Taken}}"); + auto predictor = simeng::GenericPredictor(); + // Add some entries to the RAS + auto prediction = predictor.predict(8, BranchType::SubroutineCall, 8); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 16); + prediction = predictor.predict(24, BranchType::SubroutineCall, 8); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 32); + prediction = predictor.predict(40, BranchType::SubroutineCall, 8); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 48); + + // Start getting entries from RAS + prediction = predictor.predict(52, BranchType::Return, 0); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 44); + prediction = predictor.predict(36, BranchType::Return, 0); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 28); + + // Flush address + predictor.flush(36); + + // Continue getting entries from RAS + prediction = predictor.predict(20, BranchType::Return, 0); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 28); + prediction = predictor.predict(16, BranchType::Return, 0); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 12); +} + +// Test that update correctly corrects the speculatively updated global history +TEST_F(GenericPredictorTest, speculativeGlobalHistory) { + simeng::config::SimInfo::addToConfig( + "{Branch-Predictor: {Type: Generic, BTB-Tag-Bits: 6, " + "Saturating-Count-Bits: 2, Global-History-Length: 6, RAS-entries: 10, " + "Fallback-Static-Predictor: Always-Taken}}"); + auto predictor = simeng::GenericPredictor(); + BranchPrediction pred; + + // Set up the target prediction for btb entry 000111 to be 65536. 
No other + // target predictions will be set during this test, so we can confirm that + // we are accessing this btb entry by on the basis of this target prediction + pred = predictor.predict(28, BranchType::Conditional, 0); + EXPECT_TRUE(pred.isTaken); // Default behaviour is to predict taken + EXPECT_EQ(pred.target, 0); // Target prediction not yet set + predictor.update(28, true, 65536, BranchType::Conditional, 0); + + // Set up a speculative global history of 111111 on the basis of predictions + pred = predictor.predict(4, BranchType::Conditional, 0); // GH = 000011 + EXPECT_TRUE(pred.isTaken); + EXPECT_EQ(pred.target, 0); + pred = predictor.predict(4, BranchType::Conditional, 0); // GH = 000111 + EXPECT_TRUE(pred.isTaken); + EXPECT_EQ(pred.target, 0); + pred = predictor.predict(4, BranchType::Conditional, 0); // GH = 001111 + EXPECT_TRUE(pred.isTaken); + EXPECT_EQ(pred.target, 0); + pred = predictor.predict(4, BranchType::Conditional, 0); // GH = 011111 + EXPECT_TRUE(pred.isTaken); + EXPECT_EQ(pred.target, 0); + pred = predictor.predict(4, BranchType::Conditional, 0); // GH = 111111 + EXPECT_TRUE(pred.isTaken); + EXPECT_EQ(pred.target, 0); + + // Get prediction for address 224 to access btb entry 000111 + pred = predictor.predict(224, BranchType::Conditional, 0); // GH = 111111 + // Confirm prediction target is 65536 + EXPECT_EQ(pred.target, 65536); + EXPECT_TRUE(pred.isTaken); + + // Now correct the speculative global history using updates + predictor.update(4, false, 8, BranchType::Conditional, 1); // GH = 011111 + predictor.update(4, false, 8, BranchType::Conditional, 2); // GH = 001111 + predictor.update(4, false, 8, BranchType::Conditional, 3); // GH = 000111 + + // Now a prediction for address 0 should access btb entry 000111 + pred = predictor.predict(0, BranchType::Conditional, 0); + EXPECT_EQ(pred.target, 65536); } } // namespace simeng diff --git a/test/unit/ISATest.cc b/test/unit/ISATest.cc deleted file mode 100644 index 4f9416dc84..0000000000 --- a/test/unit/ISATest.cc +++ /dev/null @@ -1,33 +0,0 @@ -#include "gtest/gtest.h" -#include "simeng/RegisterFileSet.hh" -#include "simeng/arch/aarch64/Architecture.hh" -#include "simeng/arch/aarch64/Instruction.hh" - -namespace { - -// Test that we can create an AArch64 Architecture object -TEST(ISATest, CreateAArch64) { - simeng::kernel::Linux kernel; - YAML::Node config = YAML::Load( - "{Core: {Simulation-Mode: emulation, Clock-Frequency: 2.5, " - "Timer-Frequency: 100, Micro-Operations: True, " - "Vector-Length: 512, Streaming-Vector-Length: 512}}"); - // Pass a config file with only the options required by the aarch64 - // architecture class to function - std::unique_ptr isa = - std::make_unique(kernel, config); - - EXPECT_GT(isa->getRegisterFileStructures().size(), 0); -} - -// Test that we can set a value in a register file set -TEST(ISATest, CreateRegisterFileSet) { - auto registerFileSet = simeng::RegisterFileSet({{8, 32}, {16, 32}, {1, 1}}); - auto reg = simeng::Register{simeng::arch::aarch64::RegisterType::GENERAL, 0}; - - registerFileSet.set(reg, static_cast(42)); - - EXPECT_TRUE(registerFileSet.get(reg)); -} - -} // namespace diff --git a/test/unit/LatencyMemoryInterfaceTest.cc b/test/unit/LatencyMemoryInterfaceTest.cc deleted file mode 100644 index e728b9bf76..0000000000 --- a/test/unit/LatencyMemoryInterfaceTest.cc +++ /dev/null @@ -1,63 +0,0 @@ -#include "gtest/gtest.h" -#include "simeng/FixedLatencyMemoryInterface.hh" - -namespace { - -// Test that we can write data and it completes after a number of cycles. 
-TEST(LatencyMemoryInterfaceTest, FixedWriteData) { - // Create a memory interface with a two cycle latency - uint32_t memoryData = 0; - simeng::FixedLatencyMemoryInterface memory( - reinterpret_cast(&memoryData), 4, 2); - EXPECT_FALSE(memory.hasPendingRequests()); - - // Write a 32-bit value to memory - // Should ignore the 7 cycle latency and opt for the interface defined latency - simeng::MemoryAccessTarget target = {0, 4}; - simeng::RegisterValue value = (uint32_t)0xDEADBEEF; - memory.requestWrite(target, value); - EXPECT_TRUE(memory.hasPendingRequests()); - - // Tick once - request should still be pending - memory.tick(); - EXPECT_TRUE(memory.hasPendingRequests()); - - // Tick again - request should have completed - memory.tick(); - EXPECT_FALSE(memory.hasPendingRequests()); - EXPECT_EQ(memoryData, 0xDEADBEEF); -} - -// Test that out-of-bounds memory reads are correctly handled. -TEST(LatencyMemoryInterfaceTest, OutofBoundsRead) { - uint32_t memoryData = 0; - simeng::FixedLatencyMemoryInterface memory( - reinterpret_cast(&memoryData), 4, 1); - - // Create a target such that address + size will overflow - simeng::MemoryAccessTarget overflowTarget = {UINT64_MAX, 4}; - memory.requestRead(overflowTarget, 1); - - // Create a regular out-of-bounds target - simeng::MemoryAccessTarget target = {0, 8}; - memory.requestRead(target, 2); - - // Tick once - request should have completed - memory.tick(); - EXPECT_FALSE(memory.hasPendingRequests()); - - auto entries = memory.getCompletedReads(); - EXPECT_EQ(entries.size(), 2); - - auto overflowResult = entries[0]; - EXPECT_EQ(overflowResult.requestId, 1); - EXPECT_EQ(overflowResult.data, simeng::RegisterValue()); - EXPECT_EQ(overflowResult.target, overflowTarget); - - auto result = entries[1]; - EXPECT_EQ(result.requestId, 2); - EXPECT_EQ(result.data, simeng::RegisterValue()); - EXPECT_EQ(result.target, target); -} - -} // namespace diff --git a/test/unit/MockArchitecture.hh b/test/unit/MockArchitecture.hh index 5d7cd98442..c264a8d3ed 100644 --- a/test/unit/MockArchitecture.hh +++ b/test/unit/MockArchitecture.hh @@ -8,24 +8,20 @@ namespace simeng { /** Mock implementation of the `Architecture` interface. 
*/ class MockArchitecture : public arch::Architecture { public: - MOCK_CONST_METHOD5(predecode, uint8_t(const void* ptr, uint8_t bytesAvailable, - uint64_t instructionAddress, - MacroOp& output, std::string& disasm)); - MOCK_CONST_METHOD0(getRegisterFileStructures, - std::vector()); + MockArchitecture(kernel::Linux& kernel) : arch::Architecture(kernel) {} + MOCK_CONST_METHOD5(predecode, + uint8_t(const uint8_t* ptr, uint16_t bytesAvailable, + uint64_t instructionAddress, MacroOp& output, + std::string& disasm)); MOCK_CONST_METHOD1(canRename, bool(Register reg)); MOCK_CONST_METHOD1(getSystemRegisterTag, int32_t(uint16_t reg)); - MOCK_CONST_METHOD0(getNumSystemRegisters, uint16_t()); MOCK_CONST_METHOD3(handleException, std::shared_ptr( const std::shared_ptr& instruction, - const Core& core, MemoryInterface& memory)); + const Core& core, memory::MemoryInterface& memory)); MOCK_CONST_METHOD0(getInitialState, arch::ProcessStateChange()); MOCK_CONST_METHOD0(getMaxInstructionSize, uint8_t()); - MOCK_CONST_METHOD1(getConfigPhysicalRegisterStructure, - std::vector(YAML::Node config)); - MOCK_CONST_METHOD1(getConfigPhysicalRegisterQuantities, - std::vector(YAML::Node config)); + MOCK_CONST_METHOD0(getMinInstructionSize, uint8_t()); MOCK_CONST_METHOD2(updateSystemTimerRegisters, void(RegisterFileSet* regFile, const uint64_t iterations)); }; diff --git a/test/unit/MockBranchPredictor.hh b/test/unit/MockBranchPredictor.hh index 05868a6fed..2727e6db51 100644 --- a/test/unit/MockBranchPredictor.hh +++ b/test/unit/MockBranchPredictor.hh @@ -1,7 +1,7 @@ #pragma once #include "gmock/gmock.h" -#include "simeng/BranchPredictor.hh" +#include "simeng/branchpredictors/BranchPredictor.hh" namespace simeng { @@ -10,8 +10,9 @@ class MockBranchPredictor : public BranchPredictor { public: MOCK_METHOD3(predict, BranchPrediction(uint64_t address, BranchType type, int64_t knownTarget)); - MOCK_METHOD4(update, void(uint64_t address, bool taken, - uint64_t targetAddress, BranchType type)); + MOCK_METHOD5(update, + void(uint64_t address, bool taken, uint64_t targetAddress, + BranchType type, uint64_t instructionId)); MOCK_METHOD1(flush, void(uint64_t address)); }; diff --git a/test/unit/MockCore.hh b/test/unit/MockCore.hh new file mode 100644 index 0000000000..c76f8ea808 --- /dev/null +++ b/test/unit/MockCore.hh @@ -0,0 +1,23 @@ +#pragma once + +#include "gmock/gmock.h" +#include "simeng/Core.hh" + +namespace simeng { + +/** Mock implementation of the `Core` interface. */ +class MockCore : public Core { + public: + MockCore(memory::MemoryInterface& dataMemory, const arch::Architecture& isa, + const std::vector& regFileStructure) + : Core(dataMemory, isa, regFileStructure) {} + MOCK_METHOD0(tick, void()); + MOCK_CONST_METHOD0(hasHalted, bool()); + MOCK_CONST_METHOD0(getArchitecturalRegisterFileSet, + const ArchitecturalRegisterFileSet&()); + MOCK_CONST_METHOD0(getInstructionsRetiredCount, uint64_t()); + MOCK_CONST_METHOD0(getSystemTimer, uint64_t()); + MOCK_CONST_METHOD0(getStats, std::map()); +}; + +} // namespace simeng diff --git a/test/unit/MockInstruction.hh b/test/unit/MockInstruction.hh index f4e17b64ed..56510f4948 100644 --- a/test/unit/MockInstruction.hh +++ b/test/unit/MockInstruction.hh @@ -8,19 +8,21 @@ namespace simeng { /** Mock implementation of the `Instruction` interface. 
*/ class MockInstruction : public Instruction { public: - MOCK_CONST_METHOD0(getException, InstructionException()); - MOCK_CONST_METHOD0(getOperandRegisters, const span()); + MOCK_CONST_METHOD0(getSourceRegisters, const span()); + MOCK_CONST_METHOD0(getSourceOperands, const span()); MOCK_CONST_METHOD0(getDestinationRegisters, const span()); - MOCK_METHOD2(renameSource, void(uint8_t i, Register renamed)); - MOCK_METHOD2(renameDestination, void(uint8_t i, Register renamed)); - MOCK_METHOD2(supplyOperand, void(uint8_t i, const RegisterValue& value)); + MOCK_METHOD2(renameSource, void(uint16_t i, Register renamed)); + MOCK_METHOD2(renameDestination, void(uint16_t i, Register renamed)); + MOCK_METHOD2(supplyOperand, void(uint16_t i, const RegisterValue& value)); MOCK_CONST_METHOD1(isOperandReady, bool(int i)); MOCK_CONST_METHOD0(canExecute, bool()); MOCK_METHOD0(execute, void()); MOCK_CONST_METHOD0(getResults, const span()); - MOCK_METHOD0(generateAddresses, span()); + MOCK_METHOD0(generateAddresses, span()); MOCK_METHOD2(supplyData, void(uint64_t address, const RegisterValue& data)); - MOCK_CONST_METHOD0(getGeneratedAddresses, span()); + MOCK_CONST_METHOD0(getGeneratedAddresses, + span()); + MOCK_CONST_METHOD0(hasAllData, bool()); MOCK_CONST_METHOD0(getData, span()); MOCK_CONST_METHOD0(checkEarlyBranchMisprediction, @@ -32,12 +34,14 @@ class MockInstruction : public Instruction { MOCK_CONST_METHOD0(isStoreData, bool()); MOCK_CONST_METHOD0(isLoad, bool()); MOCK_CONST_METHOD0(isBranch, bool()); - MOCK_CONST_METHOD0(isASIMD, bool()); - MOCK_CONST_METHOD0(isPredicate, bool()); MOCK_CONST_METHOD0(getGroup, uint16_t()); + MOCK_CONST_METHOD0(getLSQLatency, uint16_t()); + MOCK_METHOD0(getSupportedPorts, const std::vector&()); + MOCK_METHOD1(setExecutionInfo, void(const ExecutionInfo& info)); + void setBranchResults(bool wasTaken, uint64_t targetAddress) { branchTaken_ = wasTaken; branchAddress_ = targetAddress; @@ -53,7 +57,13 @@ class MockInstruction : public Instruction { void setLatency(uint16_t cycles) { latency_ = cycles; } + void setLSQLatency(uint16_t cycles) { lsqExecutionLatency_ = cycles; } + void setStallCycles(uint16_t cycles) { stallCycles_ = cycles; } + + void setIsMicroOp(bool isMicroOp) { isMicroOp_ = isMicroOp; } + + void setIsLastMicroOp(bool isLastOp) { isLastMicroOp_ = isLastOp; } }; } // namespace simeng diff --git a/test/unit/MockMemoryInterface.hh b/test/unit/MockMemoryInterface.hh index cb6946fd34..c0d7285338 100644 --- a/test/unit/MockMemoryInterface.hh +++ b/test/unit/MockMemoryInterface.hh @@ -1,20 +1,20 @@ #pragma once #include "gmock/gmock.h" -#include "simeng/MemoryInterface.hh" +#include "simeng/memory/MemoryInterface.hh" namespace simeng { -/** Mock implementation of MemoryInterface */ -class MockMemoryInterface : public MemoryInterface { +/** Mock implementation of memory::MemoryInterface */ +class MockMemoryInterface : public memory::MemoryInterface { public: - MOCK_METHOD2(requestRead, - void(const MemoryAccessTarget& target, uint64_t requestId)); + MOCK_METHOD2(requestRead, void(const memory::MemoryAccessTarget& target, + uint64_t requestId)); - MOCK_METHOD2(requestWrite, void(const MemoryAccessTarget& target, + MOCK_METHOD2(requestWrite, void(const memory::MemoryAccessTarget& target, const RegisterValue& data)); - MOCK_CONST_METHOD0(getCompletedReads, const span()); + MOCK_CONST_METHOD0(getCompletedReads, const span()); MOCK_METHOD0(clearCompletedReads, void()); diff --git a/test/unit/MockPortAllocator.hh b/test/unit/MockPortAllocator.hh new file mode 100644 index 
0000000000..c09c605f52 --- /dev/null +++ b/test/unit/MockPortAllocator.hh @@ -0,0 +1,21 @@ +#pragma once + +#include "gmock/gmock.h" +#include "simeng/pipeline/PortAllocator.hh" + +namespace simeng { +namespace pipeline { + +/** Mock implementation of the `PortAllocator` interface. */ +class MockPortAllocator : public pipeline::PortAllocator { + public: + MOCK_METHOD1(allocate, uint16_t(const std::vector& ports)); + MOCK_METHOD1(issued, void(uint16_t port)); + MOCK_METHOD1(deallocate, void(uint16_t port)); + MOCK_METHOD1(setRSSizeGetter, + void(std::function&)> rsSizes)); + MOCK_METHOD0(tick, void()); +}; + +} // namespace pipeline +} // namespace simeng diff --git a/test/unit/OSTest.cc b/test/unit/OSTest.cc new file mode 100644 index 0000000000..7636c25395 --- /dev/null +++ b/test/unit/OSTest.cc @@ -0,0 +1,95 @@ +#include "ConfigInit.hh" +#include "gtest/gtest.h" +#include "simeng/kernel/Linux.hh" +#include "simeng/kernel/LinuxProcess.hh" +#include "simeng/span.hh" + +namespace simeng { + +class OSTest : public testing::Test { + public: + OSTest() + : os(config::SimInfo::getConfig()["CPU-Info"]["Special-File-Dir-Path"] + .as()), + proc_elf(simeng::kernel::LinuxProcess(cmdLine)), + proc_hex(simeng::span(reinterpret_cast(demoHex), + sizeof(demoHex))) {} + + protected: + ConfigInit configInit = ConfigInit( + config::ISA::AArch64, + R"YAML({Process-Image: {Heap-Size: 1073741824, Stack-Size: 1048576}})YAML"); + + const std::vector cmdLine = { + SIMENG_SOURCE_DIR "/test/unit/data/stream-aarch64.elf"}; + + std::vector defaultCmdLine = {SIMENG_SOURCE_DIR + "/SimEngDefaultProgram"}; + + simeng::kernel::Linux os; + simeng::kernel::LinuxProcess proc_elf; + simeng::kernel::LinuxProcess proc_hex; + + // A simple program used to test the functionality of creating a process with + // a stream of hex instructions. + const uint32_t demoHex[7] = { + 0x320C03E0, // orr w0, wzr, #1048576 + 0x320003E1, // orr w0, wzr, #1 + 0x71000400, // subs w0, w0, #1 + 0x54FFFFC1, // b.ne -8 + // .exit: + 0xD2800000, // mov x0, #0 + 0xD2800BC8, // mov x8, #94 + 0xD4000001, // svc #0 + }; +}; + +// These tests verify the functionality of both the `createProcess()` and +// `getInitialStackPointer()` functions. All other functions for this class are +// syscalls and are tested in the Regression suite. 
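The two stack-pointer tests below recompute the expected initial stack pointer by hand from the process stack layout. A minimal standalone sketch of that arithmetic is given here for reference; the helper names (`alignUp`, `expectedInitialStackPointer`) are illustrative only, and `kernel::alignToBoundary` is assumed to round its value up to the next multiple of the boundary, as the `ProcessTest` `alignToBoundary` expectations later in this diff imply.

```cpp
#include <cassert>
#include <cstdint>

// Illustrative helper only, not a name from this diff: round `value` up to the
// next multiple of `boundary`, mirroring the assumed behaviour of
// kernel::alignToBoundary (63 -> 64, 65 -> 128 for a 64-byte boundary).
uint64_t alignUp(uint64_t value, uint64_t boundary) {
  return ((value + boundary - 1) / boundary) * boundary;
}

// Expected initial stack pointer: start of the stack region, minus the
// 32-byte-aligned argv/envp string block (command line + "OMP_NUM_THREADS=1"
// + trailing null), minus the 32-byte-aligned initial stack frame of
// 17 eight-byte entries.
uint64_t expectedInitialStackPointer(uint64_t stackStart, uint64_t cmdLineSize,
                                     uint64_t envStringsSize) {
  const uint64_t stackFrameSize = 17 * 8;
  return stackStart - alignUp(cmdLineSize + envStringsSize + 1, 32) -
         alignUp(stackFrameSize, 32);
}

int main() {
  assert(alignUp(63, 64) == 64);  // matches the alignToBoundary expectations
  assert(alignUp(65, 64) == 128);
  // Stack start of 1079830880 taken from the ProcessTest expectations below;
  // the 40-character command-line path (41 bytes with its null) is hypothetical
  // since the real length depends on the host's source directory.
  assert(expectedInitialStackPointer(1079830880, 41, 18) ==
         1079830880 - 64 - 160);
  return 0;
}
```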
+TEST_F(OSTest, processElf_stackPointer) { + os.createProcess(proc_elf); + // cmdLine[0] length will change depending on the host system so final stack + // pointer needs to be calculated manually + // cmdLineSize + 1 for null separator + const uint64_t cmdLineSize = cmdLine[0].size() + 1; + // "OMP_NUM_THREADS=1" + 1 for null separator + const uint64_t envStringsSize = 18; + // Size of initial stack frame as per LinuxProcess.cc:createStack() + // - (17 push_backs) * 8 + // https://www.win.tue.nl/~aeb/linux/hh/stack-layout.html + const uint64_t stackFrameSize = 17 * 8; + // cmd + Env needs +1 for null separator + const uint64_t stackPointer = + proc_elf.getStackStart() - + kernel::alignToBoundary(cmdLineSize + envStringsSize + 1, 32) - + kernel::alignToBoundary(stackFrameSize, 32); + EXPECT_EQ(os.getInitialStackPointer(), stackPointer); + EXPECT_EQ(os.getInitialStackPointer(), proc_elf.getInitialStackPointer()); +} + +TEST_F(OSTest, processHex_stackPointer) { + os.createProcess(proc_hex); + // cmdLine[0] length will change depending on the host system so final stack + // pointer needs to be calculated manually + // cmdLineSize + 1 for null separator + const uint64_t cmdLineSize = defaultCmdLine[0].size() + 1; + // "OMP_NUM_THREADS=1" + 1 for null separator + const uint64_t envStringsSize = 18; + // Size of initial stack frame as per LinuxProcess.cc:createStack() + // - (17 push_backs) * 8 + // https://www.win.tue.nl/~aeb/linux/hh/stack-layout.html + const uint64_t stackFrameSize = 17 * 8; + // cmd + Env needs +1 for null separator + const uint64_t stackPointer = + proc_hex.getStackStart() - + kernel::alignToBoundary(cmdLineSize + envStringsSize + 1, 32) - + kernel::alignToBoundary(stackFrameSize, 32); + EXPECT_EQ(os.getInitialStackPointer(), stackPointer); + EXPECT_EQ(os.getInitialStackPointer(), proc_hex.getInitialStackPointer()); +} + +// createProcess +// getInitialStackPointer + +} // namespace simeng diff --git a/test/unit/PerceptronPredictorTest.cc b/test/unit/PerceptronPredictorTest.cc new file mode 100644 index 0000000000..7768ab0ba0 --- /dev/null +++ b/test/unit/PerceptronPredictorTest.cc @@ -0,0 +1,330 @@ +#include "MockInstruction.hh" +#include "gtest/gtest.h" +#include "simeng/branchpredictors/PerceptronPredictor.hh" + +namespace simeng { + +class PerceptronPredictorTest : public testing::Test { + public: + PerceptronPredictorTest() : uop(new MockInstruction), uopPtr(uop) { + uop->setInstructionAddress(0); + } + + protected: + MockInstruction* uop; + std::shared_ptr uopPtr; +}; + +// Tests that the PerceptronPredictor will predict the correct direction on a +// miss +TEST_F(PerceptronPredictorTest, Miss) { + simeng::config::SimInfo::addToConfig( + "{Branch-Predictor: {Type: Perceptron, BTB-Tag-Bits: 11, " + "Global-History-Length: 10, RAS-entries: 5}}"); + auto predictor = simeng::PerceptronPredictor(); + auto prediction = predictor.predict(0, BranchType::Conditional, 0); + EXPECT_TRUE(prediction.isTaken); + prediction = predictor.predict(8, BranchType::Unconditional, 0); + EXPECT_TRUE(prediction.isTaken); +} + +// Tests that the PerceptronPredictor will predict branch-and-link return pairs +// correctly +TEST_F(PerceptronPredictorTest, RAS) { + simeng::config::SimInfo::addToConfig( + "{Branch-Predictor: {Type: Perceptron, BTB-Tag-Bits: 11, " + "Global-History-Length: 10, RAS-entries: 10}}"); + auto predictor = simeng::PerceptronPredictor(); + auto prediction = predictor.predict(8, BranchType::SubroutineCall, 8); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target,
16); + prediction = predictor.predict(24, BranchType::SubroutineCall, 8); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 32); + prediction = predictor.predict(40, BranchType::SubroutineCall, 8); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 48); + prediction = predictor.predict(56, BranchType::SubroutineCall, 8); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 64); + prediction = predictor.predict(72, BranchType::SubroutineCall, 8); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 80); + + prediction = predictor.predict(84, BranchType::Return, 0); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 76); + prediction = predictor.predict(68, BranchType::Return, 0); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 60); + prediction = predictor.predict(52, BranchType::Return, 0); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 44); + prediction = predictor.predict(36, BranchType::Return, 0); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 28); + prediction = predictor.predict(20, BranchType::Return, 0); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 12); +} + +// Tests that the PerceptronPredictor will predict a previously encountered +// branch correctly, when no address aliasing has occurred +TEST_F(PerceptronPredictorTest, Hit) { + simeng::config::SimInfo::addToConfig( + "{Branch-Predictor: {Type: Perceptron, BTB-Tag-Bits: 5, " + "Global-History-Length: 1, RAS-entries: 5}}"); + auto predictor = simeng::PerceptronPredictor(); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 16, BranchType::Conditional, 0); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 16, BranchType::Conditional, 1); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 16, BranchType::Conditional, 2); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 16, BranchType::Conditional, 3); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 16, BranchType::Conditional, 4); + + auto prediction = predictor.predict(0, BranchType::Conditional, 0); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 16); +} + +// Tests that the PerceptronPredictor will predict correctly for two different +// behaviours of the same branch but in different states of the program +TEST_F(PerceptronPredictorTest, GlobalIndexing) { + simeng::config::SimInfo::addToConfig( + "{Branch-Predictor: {Type: Perceptron, BTB-Tag-Bits: 10, " + "Global-History-Length: 10, RAS-entries: 5}}"); + auto predictor = simeng::PerceptronPredictor(); + // Spool up first global history pattern + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 0); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 1); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 2); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 3); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 4); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 5); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional,
6); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 7); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 8); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 9); + // Ensure default behaviour for first encounter + auto prediction = predictor.predict(0x7C, BranchType::Conditional, 0); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 0); + // Set entry in BTB + predictor.update(0x7C, false, 0x80, BranchType::Conditional, 10); + + // Spool up second global history pattern + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 11); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 12); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 13); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 14); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 15); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 16); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 17); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 18); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 19); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 20); + // Ensure default behaviour for re-encounter but with different global history + prediction = predictor.predict(0x7C, BranchType::Conditional, 0); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 0); + // Set entry in BTB + predictor.update(0x7C, true, 0xBA, BranchType::Conditional, 21); + + // Recreate first global history pattern + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 22); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 23); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 24); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 25); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 26); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 27); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 28); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 29); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 30); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 31); + // Get prediction + prediction = predictor.predict(0x7C, BranchType::Conditional, 0); + EXPECT_FALSE(prediction.isTaken); + EXPECT_EQ(prediction.target, 0x80); + // Set entry in BTB + predictor.update(0x7C, true, 0x80, BranchType::Conditional, 32); + + // 
Recreate second global history pattern + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 33); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 34); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 35); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 36); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 37); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 38); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 39); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 40); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 4, BranchType::Conditional, 41); + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 42); + // Get prediction + prediction = predictor.predict(0x7C, BranchType::Conditional, 0); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 0xBA); + predictor.update(0x7C, true, 0xBA, BranchType::Conditional, 43); +} + +// Test Flush of RAS functionality +TEST_F(PerceptronPredictorTest, flush) { + simeng::config::SimInfo::addToConfig( + "{Branch-Predictor: {Type: Perceptron, BTB-Tag-Bits: 11, " + "Global-History-Length: 10, RAS-entries: 10}}"); + auto predictor = simeng::PerceptronPredictor(); + // Add some entries to the RAS + auto prediction = predictor.predict(8, BranchType::SubroutineCall, 8); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 16); + prediction = predictor.predict(24, BranchType::SubroutineCall, 8); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 32); + prediction = predictor.predict(40, BranchType::SubroutineCall, 8); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 48); + + // Start getting entries from RAS + prediction = predictor.predict(52, BranchType::Return, 0); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 44); + prediction = predictor.predict(36, BranchType::Return, 0); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 28); + + // Flush address + predictor.flush(36); + + // Continue getting entries from RAS + prediction = predictor.predict(20, BranchType::Return, 0); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 28); + prediction = predictor.predict(16, BranchType::Return, 0); + EXPECT_TRUE(prediction.isTaken); + EXPECT_EQ(prediction.target, 12); +} + +// Test that update corrects the speculatively updated global history +TEST_F(PerceptronPredictorTest, speculativeGlobalHistory) { + simeng::config::SimInfo::addToConfig( + "{Branch-Predictor: {Type: Perceptron, BTB-Tag-Bits: 6, " + "Global-History-Length: 6, RAS-entries: 5}}"); + auto predictor = simeng::PerceptronPredictor(); + BranchPrediction pred; + + // Set up the target prediction for btb entry 000111 to be 65536. No other + // target predictions will be set during this test, so we can confirm that + // we are accessing this btb entry by on the basis of this target + // prediction. This takes a bit more setting up than the Generic predictor + // as perceptrons are more complicated than saturating counters. 
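Both `speculativeGlobalHistory` tests pick their branch addresses so that different (address, global-history) pairs select the same BTB entry, `000111`. The actual indexing function lives in the predictor sources rather than in this diff; the sketch below is only an inference from the values used in the tests (address 28 with an empty history, 224 with history `111111`, and 0 with history `000111` all land on entry 7), in which the shifted address is XORed with the global history.

```cpp
#include <cassert>
#include <cstdint>

// Sketch only: inferred from the test values, not copied from the
// GenericPredictor/PerceptronPredictor sources. The 4-byte-aligned branch
// address is shifted right by two, XORed with the global branch history, and
// masked down to the table's index width.
uint64_t btbIndex(uint64_t address, uint64_t globalHistory,
                  uint64_t indexBits) {
  const uint64_t mask = (1ull << indexBits) - 1;
  return ((address >> 2) ^ globalHistory) & mask;
}

int main() {
  // All three accesses in the tests land on entry 0b000111 (7):
  assert(btbIndex(28, 0b000000, 6) == 7);   // initial set-up access
  assert(btbIndex(224, 0b111111, 6) == 7);  // speculative history of six takens
  assert(btbIndex(0, 0b000111, 6) == 7);    // after update() corrects history
  return 0;
}
```

If the predictors index differently in detail, the intent of the tests is unchanged: any (address, history) pair that hashes to the same entry shares the 65536 target set up at the start of each test.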
+ predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 0); // GH = 000000 + + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 1); // GH = 000000 + + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, false, 4, BranchType::Conditional, 2); // GH = 000000 + + predictor.predict(28, BranchType::Conditional, 0); + predictor.update(28, true, 65536, BranchType::Conditional, 3); // GH = 000001 + + predictor.predict(24, BranchType::Conditional, 0); + predictor.update(24, true, 65536, BranchType::Conditional, 4); // GH = 000011 + + predictor.predict(16, BranchType::Conditional, 0); + predictor.update(16, true, 65536, BranchType::Conditional, 5); // GH = 000111 + + predictor.predict(0, BranchType::Conditional, 0); + predictor.update(0, true, 65536, BranchType::Conditional, 6); // GH = 001111 + + predictor.predict(32, BranchType::Conditional, 0); + predictor.update(32, true, 65536, BranchType::Conditional, 7); // GH = 011111 + + predictor.predict(96, BranchType::Conditional, 0); + predictor.update(96, true, 65536, BranchType::Conditional, 8); // GH = 111111 + + pred = predictor.predict(224, BranchType::Conditional, 0); + EXPECT_TRUE(pred.isTaken); // Should be set to taken + EXPECT_EQ(pred.target, 65536); // Should be set to 65536 + predictor.update(224, true, 65536, BranchType::Conditional, + 9); // GH = 111111 + + // Set up a speculative global history of 111111 on the basis of predictions + pred = predictor.predict(4, BranchType::Conditional, 0); // GH = 111111 + EXPECT_TRUE(pred.isTaken); + EXPECT_EQ(pred.target, 0); + pred = predictor.predict(4, BranchType::Conditional, 0); // GH = 111111 + EXPECT_TRUE(pred.isTaken); + EXPECT_EQ(pred.target, 0); + pred = predictor.predict(4, BranchType::Conditional, 0); // GH = 111111 + EXPECT_TRUE(pred.isTaken); + EXPECT_EQ(pred.target, 0); + pred = predictor.predict(4, BranchType::Conditional, 0); // GH = 111111 + EXPECT_TRUE(pred.isTaken); + EXPECT_EQ(pred.target, 0); + pred = predictor.predict(4, BranchType::Conditional, 0); // GH = 111111 + EXPECT_TRUE(pred.isTaken); + EXPECT_EQ(pred.target, 0); + + // Get prediction for address 224 to access btb entry 000111 + pred = predictor.predict(224, BranchType::Conditional, 0); // GH = 111111 + // Confirm prediction target is 65536 + EXPECT_EQ(pred.target, 65536); + EXPECT_TRUE(pred.isTaken); + + // Now correct the speculative global history using updates + predictor.update(4, false, 8, BranchType::Conditional, 10); // GH = 011111 + predictor.update(4, false, 8, BranchType::Conditional, 11); // GH = 001111 + predictor.update(4, false, 8, BranchType::Conditional, 12); // GH = 000111 + + // Now a prediction for address 0 should access btb entry 000111 + pred = predictor.predict(0, BranchType::Conditional, 0); + EXPECT_TRUE(pred.isTaken); + EXPECT_EQ(pred.target, 65536); +} + +} // namespace simeng diff --git a/test/unit/ProcessTest.cc b/test/unit/ProcessTest.cc new file mode 100644 index 0000000000..c0401da184 --- /dev/null +++ b/test/unit/ProcessTest.cc @@ -0,0 +1,116 @@ +#include "ConfigInit.hh" +#include "gtest/gtest.h" +#include "simeng/kernel/LinuxProcess.hh" +#include "simeng/version.hh" + +namespace simeng { + +class ProcessTest : public testing::Test { + public: + ProcessTest() {} + + protected: + ConfigInit configInit = ConfigInit( + config::ISA::AArch64, + R"YAML({Process-Image: {Heap-Size: 1073741824, Stack-Size: 1048576}})YAML"); + + const std::vector cmdLine = { + 
SIMENG_SOURCE_DIR "/test/unit/data/stream-aarch64.elf"}; + + // Program used when no executable is provided; counts down from + // 1024*1024, with an independent `orr` at the start of each branch. + const uint32_t demoHex[7] = { + 0x320C03E0, // orr w0, wzr, #1048576 + 0x320003E1, // orr w0, wzr, #1 + 0x71000400, // subs w0, w0, #1 + 0x54FFFFC1, // b.ne -8 + // .exit: + 0xD2800000, // mov x0, #0 + 0xD2800BC8, // mov x8, #94 + 0xD4000001, // svc #0 + }; +}; + +TEST_F(ProcessTest, alignToBoundary) { + EXPECT_EQ(kernel::alignToBoundary(63, 64), 64); + EXPECT_EQ(kernel::alignToBoundary(1, 64), 64); + EXPECT_EQ(kernel::alignToBoundary(65, 64), 128); +} + +// Tests createProcess(), isValid(), and getPath() functions. +TEST_F(ProcessTest, createProcess_elf) { + kernel::LinuxProcess proc = kernel::LinuxProcess(cmdLine); + EXPECT_TRUE(proc.isValid()); + EXPECT_EQ(proc.getPath(), + SIMENG_SOURCE_DIR "/test/unit/data/stream-aarch64.elf"); +} + +// Tests createProcess(), isValid(), and getPath() functions. +TEST_F(ProcessTest, createProcess_hex) { + kernel::LinuxProcess proc = kernel::LinuxProcess( + span(reinterpret_cast(demoHex), sizeof(demoHex))); + EXPECT_TRUE(proc.isValid()); + EXPECT_EQ(proc.getPath(), SIMENG_SOURCE_DIR "/SimEngDefaultProgram\0"); +} + +// Tests get{Heap, Stack, Mmap}Start() functions +TEST_F(ProcessTest, get_x_Start) { + kernel::LinuxProcess proc = kernel::LinuxProcess(cmdLine); + EXPECT_TRUE(proc.isValid()); + const uint64_t heapStart = 5040480; + uint64_t heapSize = + config::SimInfo::getConfig()["Process-Image"]["Heap-Size"].as(); + uint64_t stackSize = + config::SimInfo::getConfig()["Process-Image"]["Stack-Size"] + .as(); + EXPECT_EQ(proc.getHeapStart(), heapStart); + EXPECT_EQ(proc.getMmapStart(), + kernel::alignToBoundary(heapStart + ((heapSize + stackSize) / 2), + proc.getPageSize())); + EXPECT_EQ(proc.getStackStart(), heapStart + heapSize + stackSize); +} + +TEST_F(ProcessTest, getPageSize) { + kernel::LinuxProcess proc = kernel::LinuxProcess(cmdLine); + EXPECT_TRUE(proc.isValid()); + EXPECT_EQ(proc.getPageSize(), 4096); +} + +TEST_F(ProcessTest, getProcessImage) { + kernel::LinuxProcess proc = kernel::LinuxProcess(cmdLine); + EXPECT_TRUE(proc.isValid()); + EXPECT_NE(proc.getProcessImage(), nullptr); +} + +TEST_F(ProcessTest, getProcessImageSize) { + kernel::LinuxProcess proc = kernel::LinuxProcess(cmdLine); + EXPECT_TRUE(proc.isValid()); + EXPECT_EQ(proc.getProcessImageSize(), 1079830880); +} + +TEST_F(ProcessTest, getEntryPoint) { + kernel::LinuxProcess proc = kernel::LinuxProcess(cmdLine); + EXPECT_TRUE(proc.isValid()); + EXPECT_EQ(proc.getEntryPoint(), 4206008); +} + +TEST_F(ProcessTest, getInitialStackPointer) { + kernel::LinuxProcess proc = kernel::LinuxProcess(cmdLine); + EXPECT_TRUE(proc.isValid()); + // cmdLine[0] length will change depending on the host system so final stack + // pointer needs to be calculated manually + // cmdLineSize + 1 for null separator + const uint64_t cmdLineSize = cmdLine[0].size() + 1; + // "OMP_NUM_THREADS=1" + 1 for null separator + const uint64_t envStringsSize = 18; + // Size of initial stack frame (17 push_backs) * 8 + const uint64_t stackFrameSize = 17 * 8; + // cmd + Env needs +1 for null separator + const uint64_t stackPointer = + proc.getStackStart() - + kernel::alignToBoundary(cmdLineSize + envStringsSize + 1, 32) - + kernel::alignToBoundary(stackFrameSize, 32); + EXPECT_EQ(proc.getInitialStackPointer(), stackPointer); +} + +} // namespace simeng diff --git a/test/unit/RegisterFileSetTest.cc
b/test/unit/RegisterFileSetTest.cc new file mode 100644 index 0000000000..ed8485eb61 --- /dev/null +++ b/test/unit/RegisterFileSetTest.cc @@ -0,0 +1,48 @@ +#include "gtest/gtest.h" +#include "simeng/RegisterFileSet.hh" + +namespace simeng { +namespace pipeline { + +class RegisterFileSetTest : public ::testing::Test { + public: + RegisterFileSetTest() : regFileSet(regFileStruct) {} + + protected: + const std::vector regFileStruct = { + {8, 10}, {24, 15}, {256, 31}}; + + RegisterFileSet regFileSet; +}; + +// Ensure RegisterFileSet is constructed correctly +TEST_F(RegisterFileSetTest, validConstruction) { + for (uint8_t i = 0; i < regFileStruct.size(); i++) { + for (uint16_t j = 0; j < regFileStruct[i].quantity; j++) { + const Register reg = {i, j}; + EXPECT_EQ(regFileSet.get(reg), RegisterValue(0, regFileStruct[i].bytes)); + } + } +} + +// Ensure we can read and write values to the register file +TEST_F(RegisterFileSetTest, readWrite) { + for (uint8_t i = 0; i < regFileStruct.size(); i++) { + const uint16_t regSize = regFileStruct[i].bytes; + const uint16_t maxRegTag = regFileStruct[i].quantity - 1; + const Register r0 = {i, 0}; + const Register rMax = {i, maxRegTag}; + + EXPECT_EQ(regFileSet.get(r0), RegisterValue(0, regSize)); + EXPECT_EQ(regFileSet.get(rMax), RegisterValue(0, regSize)); + + regFileSet.set(r0, RegisterValue(20, regSize)); + regFileSet.set(rMax, RegisterValue(40, regSize)); + + EXPECT_EQ(regFileSet.get(r0), RegisterValue(20, regSize)); + EXPECT_EQ(regFileSet.get(rMax), RegisterValue(40, regSize)); + } +} + +} // namespace pipeline +} // namespace simeng \ No newline at end of file diff --git a/test/unit/ShiftValueTest.cc b/test/unit/ShiftValueTest.cc deleted file mode 100644 index 8cfe74e731..0000000000 --- a/test/unit/ShiftValueTest.cc +++ /dev/null @@ -1,33 +0,0 @@ -#include "gtest/gtest.h" -#include "simeng/arch/aarch64/Instruction.hh" - -namespace { - -TEST(ShiftValueTest, ROR) { - const auto ARM64_SFT_ROR = 5; - - // 32-bit - const uint32_t a = 0x0000FFFF; - EXPECT_EQ(simeng::arch::aarch64::shiftValue(a, ARM64_SFT_ROR, 16), - 0xFFFF0000); - - const uint32_t b = 0xFFFF0000; - EXPECT_EQ(simeng::arch::aarch64::shiftValue(b, ARM64_SFT_ROR, 31), - 0xFFFE0001); - - EXPECT_EQ(simeng::arch::aarch64::shiftValue(b, ARM64_SFT_ROR, 0), 0xFFFF0000); - - // 64-bit - const uint64_t c = 0x00000000FFFFFFFF; - EXPECT_EQ(simeng::arch::aarch64::shiftValue(c, ARM64_SFT_ROR, 32), - 0xFFFFFFFF00000000); - - const uint64_t d = 0xFFFFFFFF00000000; - EXPECT_EQ(simeng::arch::aarch64::shiftValue(d, ARM64_SFT_ROR, 63), - 0xFFFFFFFE00000001); - - EXPECT_EQ(simeng::arch::aarch64::shiftValue(d, ARM64_SFT_ROR, 0), - 0xFFFFFFFF00000000); -} - -} // namespace \ No newline at end of file diff --git a/test/unit/SpecialFileDirGenTest.cc b/test/unit/SpecialFileDirGenTest.cc new file mode 100644 index 0000000000..993a650235 --- /dev/null +++ b/test/unit/SpecialFileDirGenTest.cc @@ -0,0 +1,136 @@ +#include "ConfigInit.hh" +#include "gmock/gmock.h" +#include "simeng/SpecialFileDirGen.hh" +#include "simeng/version.hh" + +namespace simeng { + +#define TEST_SPEC_FILE_DIR SIMENG_SOURCE_DIR "/test/unit/specialFiles/" + +class SpecialFileDirGenTest : public testing::Test { + public: + SpecialFileDirGenTest() {} + + protected: + ConfigInit configInit = ConfigInit(config::ISA::AArch64, + R"YAML({ + CPU-Info: { + Generate-Special-Dir: True, + Special-File-Dir-Path: )YAML" TEST_SPEC_FILE_DIR R"YAML(, + Core-Count: 1, + Socket-Count: 1, + SMT: 1, + BogoMIPS: 200.00, + Features: fp asimd evtstrm sha1 sha2 crc32 atomics 
fphp asimdhp cpuid asimdrdm fcma dcpop sve, + CPU-Implementer: 0x46, + CPU-Architecture: 8, + CPU-Variant: 0x1, + CPU-Part: 0x001, + CPU-Revision: 0, + Package-Count: 1 + } + })YAML"); + + SpecialFileDirGen specFile; + + const std::vector>> + allFiles_names_Lines = { + std::pair>( + "proc/cpuinfo", + {"processor : 0", "BogoMIPS : 200.00", + "Features : fp asimd evtstrm sha1 sha2 " + "crc32 atomics fphp asimdhp cpuid " + "asimdrdm fcma dcpop sve", + "CPU implementer : 0x46", "CPU architecture: 8", + "CPU variant : 0x1", "CPU part : 0x001", + "CPU revision : 0", ""}), + std::pair>( + "proc/stat", + {"cpu 0 0 0 0 0 0 0 0 0 0", "cpu0 0 0 0 0 0 0 0 0 0 0", + "intr 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 " + "0 0 0 0 " + "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 " + "0 0 0 0 0 " + "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 " + "0 0 0 0 0 " + "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 " + "0 0 0 0 0 " + "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 " + "0 0 0 0 0 " + "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0", + "ctxt 0", "btime 0", "processes 0", "procs_running 1", + "procs_blocked 0", "softirq 0 0 0 0 0 0 0 0 0 0 0"}), + std::pair>( + "sys/devices/system/cpu/cpu0/topology/core_id", {"0"}), + std::pair>( + "sys/devices/system/cpu/cpu0/topology/physical_package_id", + {"0"}), + std::pair>( + "sys/devices/system/cpu/online", {"0-0"})}; +}; + +// Test that we can generate and delete special files to a custom directory +// (i.e. the one defined in the YAML string above) +TEST_F(SpecialFileDirGenTest, genAndDelete) { + // Make sure files currently do not exist + for (size_t i = 0; i < allFiles_names_Lines.size(); i++) { + EXPECT_FALSE( + std::ifstream(TEST_SPEC_FILE_DIR + std::get<0>(allFiles_names_Lines[i])) + .good()); + } + + // Generate files + specFile.GenerateSFDir(); + + // Validate files exist and are correct + for (size_t i = 0; i < allFiles_names_Lines.size(); i++) { + EXPECT_TRUE( + std::ifstream(TEST_SPEC_FILE_DIR + std::get<0>(allFiles_names_Lines[i])) + .good()); + std::ifstream file(TEST_SPEC_FILE_DIR + + std::get<0>(allFiles_names_Lines[i])); + const std::vector& knownLines = + std::get<1>(allFiles_names_Lines[i]); + std::string line; + size_t numOfLines = 0; + while (std::getline(file, line)) { + if (numOfLines > knownLines.size()) { + break; + } + EXPECT_EQ(line, knownLines[numOfLines]); + numOfLines++; + } + EXPECT_EQ(numOfLines, knownLines.size()); + } + + // Delete files + specFile.RemoveExistingSFDir(); + + // Make sure files don't exist + for (size_t i = 0; i < allFiles_names_Lines.size(); i++) { + EXPECT_FALSE( + std::ifstream(TEST_SPEC_FILE_DIR + std::get<0>(allFiles_names_Lines[i])) + .good()); + } +} + +// Test that a non-existant non-default special file directory causes the user +// to be notified when generation is set to False +TEST_F(SpecialFileDirGenTest, doesntExist) { + // Reset SimInfo Config + ASSERT_DEATH( + config::SimInfo::addToConfig( + "CPU-Info: {Generate-Special-Dir: False, " + "Special-File-Dir-Path: " SIMENG_BUILD_DIR "/thisDoesntExistDir/" + ", Core-Count: 1, Socket-Count: 1, SMT: 1, BogoMIPS: 200.00, " + "Features: " + "fp asimd evtstrm sha1 sha2 crc32 atomics fphp asimdhp cpuid " + "asimdrdm " + "fcma dcpop sve, CPU-Implementer: 0x46, CPU-Architecture: 8, " + "CPU-Variant: 0x1, CPU-Part: 0x001, CPU-Revision: 0, Package-Count: " + "1}}"), + "- Special File Directory '" SIMENG_BUILD_DIR + "/thisDoesntExistDir/' does not exist"); +} + +} // 
namespace simeng \ No newline at end of file diff --git a/test/unit/aarch64/ArchInfoTest.cc b/test/unit/aarch64/ArchInfoTest.cc new file mode 100644 index 0000000000..39e25a0bd1 --- /dev/null +++ b/test/unit/aarch64/ArchInfoTest.cc @@ -0,0 +1,76 @@ +#include "gtest/gtest.h" +#include "simeng/arch/aarch64/ArchInfo.hh" +#include "simeng/config/SimInfo.hh" +#include "simeng/version.hh" + +namespace simeng { +namespace arch { +namespace aarch64 { + +class AArch64ArchInfoTest : public ::testing::Test { + public: + AArch64ArchInfoTest() { + simeng::config::SimInfo::setConfig(SIMENG_SOURCE_DIR + "/configs/a64fx_SME.yaml"); + } + + protected: + const std::vector sysRegisterEnums = { + aarch64_sysreg::AARCH64_SYSREG_DCZID_EL0, + aarch64_sysreg::AARCH64_SYSREG_FPCR, + aarch64_sysreg::AARCH64_SYSREG_FPSR, + aarch64_sysreg::AARCH64_SYSREG_TPIDR_EL0, + aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1, + aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0, + aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0, + aarch64_sysreg::AARCH64_SYSREG_SVCR}; + + const std::vector archRegStruct = { + {8, 32}, + {256, 32}, + {32, 17}, + {1, 1}, + {8, static_cast(sysRegisterEnums.size())}, + {256, 64}, + {64, 1}}; + + const std::vector physRegStruct = { + {8, 96}, + {256, 128}, + {32, 48}, + {1, 128}, + {8, static_cast(sysRegisterEnums.size())}, + {256, 128}, + {64, 8}}; + + const std::vector physRegQuants = { + 96, 128, 48, 128, static_cast(sysRegisterEnums.size()), 128, 8}; +}; + +// Test for the getSysRegEnums() function +TEST_F(AArch64ArchInfoTest, getSysRegEnums) { + ArchInfo info = ArchInfo(config::SimInfo::getConfig()); + EXPECT_EQ(info.getSysRegEnums(), sysRegisterEnums); +} + +// Test for the getArchRegStruct() function +TEST_F(AArch64ArchInfoTest, getArchRegStruct) { + ArchInfo info = ArchInfo(config::SimInfo::getConfig()); + EXPECT_EQ(info.getArchRegStruct(), archRegStruct); +} + +// Test for the getPhysRegStruct() function +TEST_F(AArch64ArchInfoTest, getPhysRegStruct) { + ArchInfo info = ArchInfo(config::SimInfo::getConfig()); + EXPECT_EQ(info.getPhysRegStruct(), physRegStruct); +} + +// Test for the getPhysRegQuantities() function +TEST_F(AArch64ArchInfoTest, getPhysRegQuantities) { + ArchInfo info = ArchInfo(config::SimInfo::getConfig()); + EXPECT_EQ(info.getPhysRegQuantities(), physRegQuants); +} + +} // namespace aarch64 +} // namespace arch +} // namespace simeng \ No newline at end of file diff --git a/test/unit/aarch64/ArchitectureTest.cc b/test/unit/aarch64/ArchitectureTest.cc new file mode 100644 index 0000000000..8f2619a283 --- /dev/null +++ b/test/unit/aarch64/ArchitectureTest.cc @@ -0,0 +1,262 @@ +#include + +#include "../ConfigInit.hh" +#include "gtest/gtest.h" +#include "simeng/CoreInstance.hh" +#include "simeng/RegisterFileSet.hh" +#include "simeng/arch/aarch64/Architecture.hh" +#include "simeng/arch/riscv/Architecture.hh" +#include "simeng/span.hh" +#include "simeng/version.hh" + +namespace simeng { +namespace arch { +namespace aarch64 { + +// AArch64 Tests +class AArch64ArchitectureTest : public testing::Test { + public: + AArch64ArchitectureTest() + : kernel(config::SimInfo::getConfig()["CPU-Info"]["Special-File-Dir-Path"] + .as()) { + arch = std::make_unique(kernel); + kernel.createProcess(process); + } + + protected: + // Setting core model to complex OoO model to more verbosely test the + // Architecture class. 
+ ConfigInit configInit = ConfigInit(config::ISA::AArch64, R"YAML({ + Core: { + Simulation-Mode: outoforder, + Vector-Length: 512, + Streaming-Vector-Length: 128 + }, + LSQ-L1-Interface: { + Load-Bandwidth: 64, + Store-Bandwidth: 64 + }, + Ports: { + '0': {Portname: Port 0, Instruction-Group-Support: [FP, SVE]}, + '1': {Portname: Port 1, Instruction-Group-Support: [PREDICATE]}, + '2': {Portname: Port 2, Instruction-Group-Support: [INT_SIMPLE, INT_MUL, STORE_DATA]}, + '3': {Portname: Port 3, Instruction-Group-Support: [FP_SIMPLE, FP_MUL, SVE_SIMPLE, SVE_MUL]}, + '4': {Portname: Port 4, Instruction-Group-Support: [INT_SIMPLE, INT_DIV_OR_SQRT]}, + '5': {Portname: Port 5, Instruction-Group-Support: [LOAD, STORE_ADDRESS, INT_SIMPLE_ARTH_NOSHIFT, INT_SIMPLE_LOGICAL_NOSHIFT, INT_SIMPLE_CMP]}, + '6': {Portname: Port 6, Instruction-Group-Support: [LOAD, STORE_ADDRESS, INT_SIMPLE_ARTH_NOSHIFT, INT_SIMPLE_LOGICAL_NOSHIFT, INT_SIMPLE_CMP]}, + '7': {Portname: Port 7, Instruction-Group-Support: [BRANCH]} + }, + Reservation-Stations: { + '0': {Size: 20, Dispatch-Rate: 2, Ports: [Port 0, Port 1, Port 2]}, + '1': {Size: 20, Dispatch-Rate: 2, Ports: [Port 3, Port 4]}, + '2': {Size: 10, Dispatch-Rate: 1, Ports: [Port 5]}, + '3': {Size: 10, Dispatch-Rate: 1, Ports: [Port 6]}, + '4': {Size: 19, Dispatch-Rate: 1, Ports: [Port 7]}, + }, + Execution-Units: { + '0': {Pipelined: True, Blocking-Groups: [INT_DIV_OR_SQRT, FP_DIV_OR_SQRT, SVE_DIV_OR_SQRT]}, + '1': {Pipelined: True, Blocking-Groups: [INT_DIV_OR_SQRT, FP_DIV_OR_SQRT, SVE_DIV_OR_SQRT]}, + '2': {Pipelined: True, Blocking-Groups: [INT_DIV_OR_SQRT, FP_DIV_OR_SQRT, SVE_DIV_OR_SQRT]}, + '3': {Pipelined: True, Blocking-Groups: [INT_DIV_OR_SQRT, FP_DIV_OR_SQRT, SVE_DIV_OR_SQRT]}, + '4': {Pipelined: True, Blocking-Groups: [INT_DIV_OR_SQRT, FP_DIV_OR_SQRT, SVE_DIV_OR_SQRT]}, + '5': {Pipelined: True, Blocking-Groups: [INT_DIV_OR_SQRT, FP_DIV_OR_SQRT, SVE_DIV_OR_SQRT]}, + '6': {Pipelined: True, Blocking-Groups: [INT_DIV_OR_SQRT, FP_DIV_OR_SQRT, SVE_DIV_OR_SQRT]}, + '7': {Pipelined: True, Blocking-Groups: [INT_DIV_OR_SQRT, FP_DIV_OR_SQRT, SVE_DIV_OR_SQRT]} + }, + Latencies: { + '0': {Instruction-Groups: [INT], Execution-Latency: 2, Execution-Throughput: 2}, + '1': {Instruction-Groups: [INT_SIMPLE_ARTH_NOSHIFT, INT_SIMPLE_LOGICAL_NOSHIFT, INT_SIMPLE_CVT], Execution-Latency: 1, Execution-Throughput: 1}, + '2': {Instruction-Groups: [INT_MUL], Execution-Latency: 5, Execution-Throughput: 1}, + '3': {Instruction-Groups: [INT_DIV_OR_SQRT], Execution-Latency: 41, Execution-Throughput: 41}, + '4': {Instruction-Groups: [SCALAR_SIMPLE, VECTOR_SIMPLE_LOGICAL, SVE_SIMPLE_LOGICAL, VECTOR_SIMPLE_CMP, SVE_SIMPLE_CMP], Execution-Latency: 4, Execution-Throughput: 1}, + '5': {Instruction-Groups: [FP_DIV_OR_SQRT], Execution-Latency: 29, Execution-Throughput: 29}, + '6': {Instruction-Groups: [VECTOR_SIMPLE, SVE_SIMPLE, SCALAR_SIMPLE_CVT, FP_MUL, SVE_MUL], Execution-Latency: 9, Execution-Throughput: 1}, + '7': {Instruction-Groups: [SVE_DIV_OR_SQRT], Execution-Latency: 98, Execution-Throughput: 98}, + '8': {Instruction-Groups: [PREDICATE], Execution-Latency: 3, Execution-Throughput: 1}, + '9': {Instruction-Groups: [LOAD_SCALAR, LOAD_VECTOR, STORE_ADDRESS_SCALAR, STORE_ADDRESS_VECTOR], Execution-Latency: 3, Execution-Throughput: 1}, + '10': {Instruction-Groups: [LOAD_SVE, STORE_ADDRESS_SVE], Execution-Latency: 6, Execution-Throughput: 1} + } + })YAML"); + + // fdivr z1.s, p0/m, z1.s, z0.s + const std::array validInstrBytes = {0x01, 0x80, 0x8c, 0x65}; + const std::array invalidInstrBytes = 
{0x20, 0x00, 0x02, 0x8c}; + + std::unique_ptr arch; + kernel::Linux kernel; + kernel::LinuxProcess process = kernel::LinuxProcess( + span(validInstrBytes.data(), validInstrBytes.size())); +}; + +TEST_F(AArch64ArchitectureTest, predecode) { + // Test that mis-aligned instruction address results in error + MacroOp output; + uint8_t result = arch->predecode(validInstrBytes.data(), + validInstrBytes.size(), 0x7, output); + Instruction* aarch64Insn = reinterpret_cast(output[0].get()); + EXPECT_EQ(result, 1); + EXPECT_EQ(aarch64Insn->getInstructionAddress(), 0x7); + EXPECT_EQ(aarch64Insn->exceptionEncountered(), true); + EXPECT_EQ(aarch64Insn->getException(), InstructionException::MisalignedPC); + + // Test that an invalid instruction returns instruction with an exception + output = MacroOp(); + result = arch->predecode(invalidInstrBytes.data(), invalidInstrBytes.size(), + 0x8, output); + aarch64Insn = reinterpret_cast(output[0].get()); + EXPECT_EQ(result, 4); + EXPECT_EQ(aarch64Insn->getInstructionAddress(), 0x8); + EXPECT_EQ(aarch64Insn->exceptionEncountered(), true); + EXPECT_EQ(aarch64Insn->getException(), + InstructionException::EncodingUnallocated); + + // Test that an instruction can be properly decoded + output = MacroOp(); + result = arch->predecode(validInstrBytes.data(), validInstrBytes.size(), 0x4, + output); + EXPECT_EQ(result, 4); + EXPECT_EQ(output[0]->getInstructionAddress(), 0x4); + EXPECT_EQ(output[0]->exceptionEncountered(), false); + EXPECT_EQ(output[0]->getGroup(), InstructionGroups::SVE_DIV_OR_SQRT); +} + +TEST_F(AArch64ArchitectureTest, getSystemRegisterTag) { + // Test incorrect system register will fail + int32_t output = arch->getSystemRegisterTag(-1); + EXPECT_EQ(output, -1); + + // Test for correct behaviour + output = arch->getSystemRegisterTag(AARCH64_SYSREG_DCZID_EL0); + EXPECT_EQ(output, 0); +} + +TEST_F(AArch64ArchitectureTest, handleException) { + // Get Instruction + MacroOp insn; + uint8_t bytes = arch->predecode(invalidInstrBytes.data(), + invalidInstrBytes.size(), 0x4, insn); + Instruction* aarch64Insn = reinterpret_cast(insn[0].get()); + EXPECT_EQ(bytes, 4); + EXPECT_EQ(aarch64Insn->getInstructionAddress(), 0x4); + EXPECT_EQ(aarch64Insn->exceptionEncountered(), true); + EXPECT_EQ(aarch64Insn->getException(), + InstructionException::EncodingUnallocated); + + // Get Core + std::string executablePath = SIMENG_SOURCE_DIR "/SimEngDefaultProgram"; + std::vector executableArgs = {}; + std::unique_ptr coreInstance = + std::make_unique(executablePath, executableArgs); + const Core& core = *coreInstance->getCore(); + memory::MemoryInterface& memInt = *coreInstance->getDataMemory(); + auto exceptionHandler = arch->handleException(insn[0], core, memInt); + + bool tickRes = exceptionHandler->tick(); + auto result = exceptionHandler->getResult(); + EXPECT_TRUE(tickRes); + EXPECT_TRUE(result.fatal); + // Instruction address for fatal exception is always 0. 
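+  // (A fatal result is not resumed, so the handler reports zero here rather
+  // than a resumption address.)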
+ EXPECT_EQ(result.instructionAddress, 0x0); +} + +TEST_F(AArch64ArchitectureTest, getInitialState) { + std::vector regs = { + {RegisterType::GENERAL, 31}, + {RegisterType::SYSTEM, + (uint16_t)arch->getSystemRegisterTag(AARCH64_SYSREG_DCZID_EL0)}}; + std::vector regVals = {{kernel.getInitialStackPointer(), 8}, + {20, 8}}; + + arch::ProcessStateChange changes = arch->getInitialState(); + EXPECT_EQ(changes.type, arch::ChangeType::REPLACEMENT); + EXPECT_EQ(changes.modifiedRegisters, regs); + EXPECT_EQ(changes.modifiedRegisterValues, regVals); +} + +TEST_F(AArch64ArchitectureTest, getMaxInstructionSize) { + EXPECT_EQ(arch->getMaxInstructionSize(), 4); +} + +TEST_F(AArch64ArchitectureTest, getVectorLength) { + EXPECT_EQ(arch->getVectorLength(), 512); +} + +TEST_F(AArch64ArchitectureTest, getStreamingVectorLength) { + // Default SVL value is 128 + EXPECT_EQ(arch->getStreamingVectorLength(), 128); +} + +TEST_F(AArch64ArchitectureTest, updateSystemTimerRegisters) { + RegisterFileSet regFile = config::SimInfo::getArchRegStruct(); + + uint8_t vctCount = 0; + // In A64FX, Timer frequency = (2.5 * 1e9) / (100 * 1e6) = 18 + uint64_t vctModulo = + (config::SimInfo::getConfig()["Core"]["Clock-Frequency-GHz"].as() * + 1e9) / + (config::SimInfo::getConfig()["Core"]["Timer-Frequency-MHz"] + .as() * + 1e6); + for (int i = 0; i < 30; i++) { + vctCount += (i % vctModulo) == 0 ? 1 : 0; + arch->updateSystemTimerRegisters(®File, i); + EXPECT_EQ( + regFile + .get({RegisterType::SYSTEM, (uint16_t)arch->getSystemRegisterTag( + AARCH64_SYSREG_PMCCNTR_EL0)}) + .get(), + i); + EXPECT_EQ( + regFile + .get({RegisterType::SYSTEM, (uint16_t)arch->getSystemRegisterTag( + AARCH64_SYSREG_CNTVCT_EL0)}) + .get(), + vctCount); + } +} + +TEST_F(AArch64ArchitectureTest, getExecutionInfo) { + MacroOp insn; + uint64_t bytes = arch->predecode(validInstrBytes.data(), + validInstrBytes.size(), 0x4, insn); + // Insn[0] = fdivr z1.s, p0/m, z1.s, z0.s + Instruction* aarch64Insn = reinterpret_cast(insn[0].get()); + EXPECT_EQ(bytes, 4); + EXPECT_EQ(aarch64Insn->getInstructionAddress(), 0x4); + EXPECT_EQ(aarch64Insn->exceptionEncountered(), false); + + ExecutionInfo info = arch->getExecutionInfo(*aarch64Insn); + + // Latencies and Port numbers from a64fx.yaml + EXPECT_EQ(info.latency, 98); + EXPECT_EQ(info.stallCycles, 98); + std::vector ports = {0}; + EXPECT_EQ(info.ports, ports); +} + +TEST_F(AArch64ArchitectureTest, get_set_SVCRVal) { + EXPECT_EQ(arch->getSVCRval(), 0); + arch->setSVCRval(3); + EXPECT_EQ(arch->getSVCRval(), 3); +} + +TEST_F(AArch64ArchitectureTest, isSM_ZA_enabled) { + EXPECT_FALSE(arch->isStreamingModeEnabled()); + EXPECT_FALSE(arch->isZARegisterEnabled()); + arch->setSVCRval(1); + EXPECT_TRUE(arch->isStreamingModeEnabled()); + EXPECT_FALSE(arch->isZARegisterEnabled()); + arch->setSVCRval(2); + EXPECT_FALSE(arch->isStreamingModeEnabled()); + EXPECT_TRUE(arch->isZARegisterEnabled()); + arch->setSVCRval(3); + EXPECT_TRUE(arch->isStreamingModeEnabled()); + EXPECT_TRUE(arch->isZARegisterEnabled()); + arch->setSVCRval(0); + EXPECT_FALSE(arch->isStreamingModeEnabled()); + EXPECT_FALSE(arch->isZARegisterEnabled()); +} + +} // namespace aarch64 +} // namespace arch +} // namespace simeng diff --git a/test/unit/aarch64/AuxiliaryFunctionsTest.cc b/test/unit/aarch64/AuxiliaryFunctionsTest.cc new file mode 100644 index 0000000000..f319b55dbb --- /dev/null +++ b/test/unit/aarch64/AuxiliaryFunctionsTest.cc @@ -0,0 +1,696 @@ +#include "gtest/gtest.h" +#include "simeng/arch/aarch64/helpers/auxiliaryFunctions.hh" + +namespace simeng { 
+namespace arch { +namespace aarch64 { + +/** `nzcv` Tests */ +TEST(AArch64AuxiliaryFunctionTest, NzcvTest) { + EXPECT_EQ(nzcv(true, true, true, true), 0b00001111); + EXPECT_EQ(nzcv(false, false, false, false), 0b00000000); + EXPECT_EQ(nzcv(true, false, false, true), 0b00001001); + EXPECT_EQ(nzcv(false, true, false, false), 0b00000100); +} + +/** `addWithCarry` Tests */ +TEST(AArch64AuxiliaryFunctionTest, AddWithCarry) { + std::tuple u8Res = {111, 0b0010}; + EXPECT_EQ(addWithCarry(123, 244, false), u8Res); + + std::tuple u16Res = {0xFFFD, 0b1000}; + EXPECT_EQ(addWithCarry(0xFFF0, 0x000C, true), u16Res); + + std::tuple u32Res = {2147483649, 0b1001}; + EXPECT_EQ(addWithCarry(1, 2147483647, true), u32Res); + + std::tuple u64Res = {0, 0b0110}; + EXPECT_EQ(addWithCarry(0xFFFFFFFFFFFFFFFF, 1, false), u64Res); +} + +/** `bitfieldManipulate` Tests */ +TEST(AArch64AuxiliaryFunctionTest, BitfieldManipulate) { + // uint8 + EXPECT_EQ(bitfieldManipulate(0x0F, 0xF0, 0, 0, false), 0xF1); + EXPECT_EQ(bitfieldManipulate(0x0F, 0xF0, 0, 0, true), 0xFF); + + EXPECT_EQ(bitfieldManipulate(0x0F, 0xF0, 0, 7, false), 0x0F); + EXPECT_EQ(bitfieldManipulate(0x0F, 0xF0, 0, 7, true), 0x0F); + EXPECT_EQ(bitfieldManipulate(0x0F, 0xF0, 7, 0, false), 0xF2); + EXPECT_EQ(bitfieldManipulate(0x0F, 0xF0, 7, 0, true), 0xFE); + + EXPECT_EQ(bitfieldManipulate(0x18, 0xF0, 2, 5, false), 0xF6); + EXPECT_EQ(bitfieldManipulate(0x18, 0xF0, 2, 5, true), 0x06); + EXPECT_EQ(bitfieldManipulate(0x7, 0xF0, 5, 2, false), 0xF8); + EXPECT_EQ(bitfieldManipulate(0x7, 0xF0, 5, 2, true), 0xF8); + + ASSERT_DEATH( + { bitfieldManipulate(0, 0, 8, 0, false); }, + "Attempted to use a rotate amount of 8 in bitfieldManipulate which is " + "greater than or equal to the data type size of 8b in use"); + ASSERT_DEATH( + { bitfieldManipulate(0, 0, 0, 8, false); }, + "Attempted to use a source bit position value of 8 in bitfieldManipulate " + "which is greater than or equal to the data type size of 8b in use"); + + // uint16 + EXPECT_EQ(bitfieldManipulate(0x00FF, 0xFF00, 0, 0, false), 0xFF01); + EXPECT_EQ(bitfieldManipulate(0x00FF, 0xFF00, 0, 0, true), 0xFFFF); + + EXPECT_EQ(bitfieldManipulate(0x00FF, 0xFF00, 0, 15, false), 0x00FF); + EXPECT_EQ(bitfieldManipulate(0x00FF, 0xFF00, 0, 15, true), 0x00FF); + EXPECT_EQ(bitfieldManipulate(0x00FF, 0xFF00, 15, 0, false), 0xFF02); + EXPECT_EQ(bitfieldManipulate(0x00FF, 0xFF00, 15, 0, true), 0xFFFE); + + EXPECT_EQ(bitfieldManipulate(0x03C0, 0xFF00, 4, 11, false), 0xFF3C); + EXPECT_EQ(bitfieldManipulate(0x03C0, 0xFF00, 4, 11, true), 0x003C); + EXPECT_EQ(bitfieldManipulate(0x1F, 0xFF00, 11, 4, false), 0xFFE0); + EXPECT_EQ(bitfieldManipulate(0x1F, 0xFF00, 11, 4, true), 0xFFE0); + + ASSERT_DEATH( + { bitfieldManipulate(0, 0, 16, 0, false); }, + "Attempted to use a rotate amount of 16 in bitfieldManipulate which is " + "greater than or equal to the data type size of 16b in use"); + ASSERT_DEATH({ bitfieldManipulate(0, 0, 0, 16, false); }, + "Attempted to use a source bit position value of 16 in " + "bitfieldManipulate which is greater than or equal to the data " + "type size of 16b in use"); + + // uint32 + EXPECT_EQ(bitfieldManipulate(0x0000FFFF, 0xFFFF0000, 0, 0, false), + 0xFFFF0001); + EXPECT_EQ(bitfieldManipulate(0x0000FFFF, 0xFFFF0000, 0, 0, true), + 0xFFFFFFFF); + + EXPECT_EQ(bitfieldManipulate(0x0000FFFF, 0xFFFF0000, 0, 31, false), + 0x0000FFFF); + EXPECT_EQ(bitfieldManipulate(0x0000FFFF, 0xFFFF0000, 0, 31, true), + 0x0000FFFF); + EXPECT_EQ(bitfieldManipulate(0x0000FFFF, 0xFFFF0000, 31, 0, false), + 0xFFFF0002); + 
EXPECT_EQ(bitfieldManipulate(0x0000FFFF, 0xFFFF0000, 31, 0, true), + 0xFFFFFFFE); + + EXPECT_EQ(bitfieldManipulate(0x000FF000, 0xFFFF0000, 8, 23, false), + 0xFFFF0FF0); + EXPECT_EQ(bitfieldManipulate(0x000FF000, 0xFFFF0000, 8, 23, true), + 0x00000FF0); + EXPECT_EQ(bitfieldManipulate(0x1FF, 0xFFFF0000, 23, 8, false), + 0xFFFFFE00); + EXPECT_EQ(bitfieldManipulate(0x1FF, 0xFFFF0000, 23, 8, true), + 0xFFFFFE00); + + ASSERT_DEATH( + { bitfieldManipulate(0, 0, 32, 0, false); }, + "Attempted to use a rotate amount of 32 in bitfieldManipulate which is " + "greater than or equal to the data type size of 32b in use"); + ASSERT_DEATH({ bitfieldManipulate(0, 0, 0, 32, false); }, + "Attempted to use a source bit position value of 32 in " + "bitfieldManipulate which is greater than or equal to the data " + "type size of 32b in use"); + + // uint64 + EXPECT_EQ(bitfieldManipulate(0x00000000FFFFFFFF, 0xFFFFFFFF00000000, + 0, 0, false), + 0xFFFFFFFF00000001); + EXPECT_EQ(bitfieldManipulate(0x00000000FFFFFFFF, 0xFFFFFFFF00000000, + 0, 0, true), + 0xFFFFFFFFFFFFFFFF); + + EXPECT_EQ(bitfieldManipulate(0x00000000FFFFFFFF, 0xFFFFFFFF00000000, + 0, 63, false), + 0x00000000FFFFFFFF); + EXPECT_EQ(bitfieldManipulate(0x00000000FFFFFFFF, 0xFFFFFFFF00000000, + 0, 63, true), + 0x00000000FFFFFFFF); + EXPECT_EQ(bitfieldManipulate(0x00000000FFFFFFFF, 0xFFFFFFFF00000000, + 63, 0, false), + 0xFFFFFFFF00000002); + EXPECT_EQ(bitfieldManipulate(0x00000000FFFFFFFF, 0xFFFFFFFF00000000, + 63, 0, true), + 0xFFFFFFFFFFFFFFFE); + + EXPECT_EQ(bitfieldManipulate(0x000000FFFF000000, 0xFFFFFFFF00000000, + 16, 47, false), + 0xFFFFFFFF00FFFF00); + EXPECT_EQ(bitfieldManipulate(0x000000FFFF000000, 0xFFFFFFFF00000000, + 16, 47, true), + 0x0000000000FFFF00); + EXPECT_EQ( + bitfieldManipulate(0x1FFFF, 0xFFFFFFFF00000000, 47, 16, false), + 0xFFFFFFFFFFFE0000); + EXPECT_EQ( + bitfieldManipulate(0x1FFFF, 0xFFFFFFFF00000000, 47, 16, true), + 0xFFFFFFFFFFFE0000); + + ASSERT_DEATH( + { bitfieldManipulate(0, 0, 64, 0, false); }, + "Attempted to use a rotate amount of 64 in bitfieldManipulate which is " + "greater than or equal to the data type size of 64b in use"); + ASSERT_DEATH({ bitfieldManipulate(0, 0, 0, 64, false); }, + "Attempted to use a source bit position value of 64 in " + "bitfieldManipulate which is greater than or equal to the data " + "type size of 64b in use"); +} + +/** `conditionHolds` Tests */ +TEST(AArch64AuxiliaryFunctionTest, ConditionHolds) { + // Run each condition at least twice, one which we expect to be true, one we + // expect to be false + + // Inverse False + // EQ/NE + EXPECT_TRUE(conditionHolds(0b0000, 0b0100)); + EXPECT_FALSE(conditionHolds(0b0000, 0b1011)); + + // CS/CC + EXPECT_TRUE(conditionHolds(0b0010, 0b0010)); + EXPECT_FALSE(conditionHolds(0b0010, 0b1101)); + + // MI/PL + EXPECT_TRUE(conditionHolds(0b0100, 0b1000)); + EXPECT_FALSE(conditionHolds(0b0100, 0b0111)); + + // VS/VC + EXPECT_TRUE(conditionHolds(0b0110, 0b0001)); + EXPECT_FALSE(conditionHolds(0b0110, 0b1110)); + + // HI/LS + EXPECT_TRUE(conditionHolds(0b1000, 0b1010)); + EXPECT_FALSE(conditionHolds(0b1000, 0b1111)); + EXPECT_FALSE(conditionHolds(0b1000, 0b1001)); + + // GE/LT + EXPECT_TRUE(conditionHolds(0b1010, 0b1001)); + EXPECT_TRUE(conditionHolds(0b1010, 0b0000)); + EXPECT_FALSE(conditionHolds(0b1010, 0b1000)); + + // GT/LE + EXPECT_TRUE(conditionHolds(0b1100, 0b1001)); + EXPECT_TRUE(conditionHolds(0b1100, 0b0000)); + EXPECT_FALSE(conditionHolds(0b1100, 0b0001)); + EXPECT_FALSE(conditionHolds(0b1100, 0b1000)); + EXPECT_FALSE(conditionHolds(0b1100, 
0b1101)); + + // Condition of 0b111 always returns `true` + // AL + EXPECT_TRUE(conditionHolds(0b1110, 0b1111)); + EXPECT_TRUE(conditionHolds(0b1110, 0b0000)); + + // Inverse True + // EQ/NE + EXPECT_FALSE(conditionHolds(0b0001, 0b0100)); + EXPECT_TRUE(conditionHolds(0b0001, 0b1011)); + + // CS/CC + EXPECT_FALSE(conditionHolds(0b0011, 0b0010)); + EXPECT_TRUE(conditionHolds(0b0011, 0b1101)); + + // MI/PL + EXPECT_FALSE(conditionHolds(0b0101, 0b1000)); + EXPECT_TRUE(conditionHolds(0b0101, 0b0111)); + + // VS/VC + EXPECT_FALSE(conditionHolds(0b0111, 0b0001)); + EXPECT_TRUE(conditionHolds(0b0111, 0b1110)); + + // HI/LS + EXPECT_FALSE(conditionHolds(0b1001, 0b1010)); + EXPECT_TRUE(conditionHolds(0b1001, 0b1111)); + EXPECT_TRUE(conditionHolds(0b1001, 0b1001)); + + // GE/LT + EXPECT_FALSE(conditionHolds(0b1011, 0b1001)); + EXPECT_FALSE(conditionHolds(0b1011, 0b0000)); + EXPECT_TRUE(conditionHolds(0b1011, 0b1000)); + + // GT/LE + EXPECT_FALSE(conditionHolds(0b1101, 0b1001)); + EXPECT_FALSE(conditionHolds(0b1101, 0b0000)); + EXPECT_TRUE(conditionHolds(0b1101, 0b0001)); + EXPECT_TRUE(conditionHolds(0b1101, 0b1000)); + EXPECT_TRUE(conditionHolds(0b1101, 0b1101)); + + // AL + // Cond=0b111 and inverse of 1 always returns `true` + EXPECT_TRUE(conditionHolds(0b1111, 0b1111)); + EXPECT_TRUE(conditionHolds(0b1111, 0b0000)); +} + +/** `extendValue` Tests */ +TEST(AArch64AuxiliaryFunctionTest, ExtendValue) { + // Test special case + EXPECT_EQ(extendValue(123, AARCH64_EXT_INVALID, 0), 123); + + // Results validated on XCI and A64FX hardware + EXPECT_EQ(extendValue(270, AARCH64_EXT_UXTB, 3), 112); + EXPECT_EQ(extendValue(65560, AARCH64_EXT_UXTH, 3), 192); + EXPECT_EQ(extendValue(0xFFFFFFFF, AARCH64_EXT_UXTW, 3), 34359738360); + EXPECT_EQ(extendValue(0x0F0F0F0F0F0F0F01, AARCH64_EXT_UXTX, 4), + 0xF0F0F0F0F0F0F010); + + EXPECT_EQ(extendValue(133, AARCH64_EXT_SXTB, 3), -984); + EXPECT_EQ(extendValue(32768, AARCH64_EXT_SXTH, 3), -262144); + EXPECT_EQ(extendValue(2147483648, AARCH64_EXT_SXTW, 3), -17179869184); + EXPECT_EQ(extendValue(0x8000000000000000, AARCH64_EXT_SXTX, 3), 0); +} + +/** `getNZCVfromPred` Tests */ +TEST(AArch64AuxiliaryFunctionTest, getNZCVfromPred) { + uint64_t vl = 128; + // VL 128 will only use array[0] + EXPECT_EQ(getNZCVfromPred( + {0, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF}, + vl, 1), + 0b0110); + EXPECT_EQ( + getNZCVfromPred({0xFFFFFFFFFFFFFFFF, 0, 0, 0x300000000000000C}, vl, 2), + 0b1000); + EXPECT_EQ(getNZCVfromPred( + {0xE000000000000000, 0xE000000000000000, 0xE000000000000000, 0}, + vl, 4), + 0b0010); + EXPECT_EQ(getNZCVfromPred({0, 0x8000000000000001, 0, 0}, vl, 8), 0b0110); + + vl = 256; + // VL 256 will only use array[0] + EXPECT_EQ(getNZCVfromPred( + {0, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF}, + vl, 1), + 0b0110); + EXPECT_EQ( + getNZCVfromPred({0xFFFFFFFFFFFFFFFF, 0, 0, 0x300000000000000C}, vl, 2), + 0b1000); + EXPECT_EQ(getNZCVfromPred( + {0xE000000000000000, 0xE000000000000000, 0xE000000000000000, 0}, + vl, 4), + 0b0010); + EXPECT_EQ(getNZCVfromPred({0, 0x8000000000000001, 0, 0}, vl, 8), 0b0110); + + vl = 512; + // VL 512 will only use array[0] + EXPECT_EQ(getNZCVfromPred( + {0, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF}, + vl, 1), + 0b0110); + EXPECT_EQ( + getNZCVfromPred({0xFFFFFFFFFFFFFFFF, 0, 0, 0x300000000000000C}, vl, 2), + 0b1000); + EXPECT_EQ(getNZCVfromPred( + {0xE000000000000000, 0xE000000000000000, 0xE000000000000000, 0}, + vl, 4), + 0b0010); + EXPECT_EQ(getNZCVfromPred({0, 0x8000000000000001, 0, 0}, vl, 8), 
0b0110); + + vl = 1024; + // VL 1024 will only use array[0, 1] + EXPECT_EQ(getNZCVfromPred( + {0, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF}, + vl, 1), + 0b0000); + EXPECT_EQ( + getNZCVfromPred({0xFFFFFFFFFFFFFFFF, 0, 0, 0x300000000000000C}, vl, 2), + 0b1010); + EXPECT_EQ(getNZCVfromPred( + {0xE000000000000000, 0xE000000000000000, 0xE000000000000000, 0}, + vl, 4), + 0b0010); + EXPECT_EQ(getNZCVfromPred({0, 0x8000000000000000, 0, 0}, vl, 8), 0b0010); + + vl = 2048; + // VL 2048 will only use array[0, 1, 2, 3] + EXPECT_EQ(getNZCVfromPred( + {0, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF}, + vl, 1), + 0b0000); + EXPECT_EQ( + getNZCVfromPred({0xFFFFFFFFFFFFFFFF, 0, 0, 0x300000000000000C}, vl, 2), + 0b1010); + EXPECT_EQ(getNZCVfromPred( + {0xE000000000000000, 0xE000000000000000, 0xE000000000000000, 0}, + vl, 4), + 0b0010); + EXPECT_EQ(getNZCVfromPred({0, 0x8000000000000001, 0, 0}, vl, 8), 0b0010); +} + +/** `mulhi` Tests */ +TEST(AArch64AuxiliaryFunctionTest, Mulhi) { + EXPECT_EQ(mulhi(0xFFFFFFFFFFFFFFFF, 2), 1); + EXPECT_EQ(mulhi(1, 245), 0); + + EXPECT_EQ(mulhi(0xF000000000000000, 4), 3); + EXPECT_EQ(mulhi(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), 0xFFFFFFFFFFFFFFFE); + EXPECT_EQ(mulhi(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000), 0xFFFFFFFEFFFFFFFF); + EXPECT_EQ(mulhi(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF), 0xFFFFFFFE); +} + +/** `getElemsFromPattern` Tests */ +TEST(AArch64AuxiliaryFunctionTest, getElemsFromPattern) { + uint16_t vl = 128; + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_ALL, 64, vl), 2); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_ALL, 16, vl), 8); + + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL1, 64, vl), 1); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL2, 64, vl), 2); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL3, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL4, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL5, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL6, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL7, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL8, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL16, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL32, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL64, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL128, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL256, 64, vl), 0); + + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_MUL4, 8, vl), 16); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_MUL3, 8, vl), 15); + + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_POW2, 8, vl), 16); + + vl = 256; + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_ALL, 64, vl), 4); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_ALL, 16, vl), 16); + + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL1, 64, vl), 1); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL2, 64, vl), 2); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL3, 64, vl), 3); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL4, 64, vl), 4); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL5, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL6, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL7, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL8, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL16, 64, vl), 0); + 
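+  // At VL=256 only four 64-bit elements fit, so every fixed-count pattern
+  // above VL4 reports zero available elements.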
EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL32, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL64, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL128, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL256, 64, vl), 0); + + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_MUL4, 8, vl), 32); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_MUL3, 8, vl), 30); + + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_POW2, 8, vl), 32); + + vl = 512; + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_ALL, 64, vl), 8); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_ALL, 16, vl), 32); + + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL1, 64, vl), 1); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL2, 64, vl), 2); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL3, 64, vl), 3); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL4, 64, vl), 4); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL5, 64, vl), 5); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL6, 64, vl), 6); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL7, 64, vl), 7); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL8, 64, vl), 8); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL16, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL32, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL64, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL128, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL256, 64, vl), 0); + + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_MUL4, 8, vl), 64); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_MUL3, 8, vl), 63); + + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_POW2, 8, vl), 64); + + vl = 1024; + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_ALL, 64, vl), 16); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_ALL, 16, vl), 64); + + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL1, 64, vl), 1); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL2, 64, vl), 2); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL3, 64, vl), 3); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL4, 64, vl), 4); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL5, 64, vl), 5); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL6, 64, vl), 6); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL7, 64, vl), 7); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL8, 64, vl), 8); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL16, 64, vl), 16); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL32, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL64, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL128, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL256, 64, vl), 0); + + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_MUL4, 8, vl), 128); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_MUL3, 8, vl), 126); + + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_POW2, 8, vl), 128); + + vl = 2048; + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_ALL, 64, vl), 32); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_ALL, 16, vl), 128); + + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL1, 64, vl), 1); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL2, 64, vl), 2); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL3, 64, vl), 3); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL4, 64, vl), 4); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL5, 64, vl), 5); + 
EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL6, 64, vl), 6); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL7, 64, vl), 7); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL8, 64, vl), 8); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL16, 64, vl), 16); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL32, 64, vl), 32); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL64, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL128, 64, vl), 0); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_VL256, 64, vl), 0); + + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_MUL4, 8, vl), 256); + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_MUL3, 8, vl), 255); + + EXPECT_EQ(getElemsFromPattern(AARCH64_SVEPREDPAT_POW2, 8, vl), 256); +} + +/** `ShiftValue` Tests */ +TEST(AArch64AuxiliaryFunctionTest, ShiftValueTest_LSL) { + // 8-bit + const uint8_t a = 0x0F; + EXPECT_EQ(shiftValue(a, AARCH64_SFT_LSL, 4), 0xF0); + + const uint8_t b = 0xF0; + EXPECT_EQ(shiftValue(b, AARCH64_SFT_LSL, 7), 0x00); + + EXPECT_EQ(shiftValue(b, AARCH64_SFT_LSL, 0), b); + + // 16-bit + const uint16_t c = 0x00FF; + EXPECT_EQ(shiftValue(c, AARCH64_SFT_LSL, 8), 0xFF00); + + const uint16_t d = 0xFF00; + EXPECT_EQ(shiftValue(d, AARCH64_SFT_LSL, 15), 0x0000); + + EXPECT_EQ(shiftValue(d, AARCH64_SFT_LSL, 0), d); + + // 32-bit + const uint32_t e = 0x0000FFFF; + EXPECT_EQ(shiftValue(e, AARCH64_SFT_LSL, 16), 0xFFFF0000); + + const uint32_t f = 0xFFFF0000; + EXPECT_EQ(shiftValue(f, AARCH64_SFT_LSL, 31), 0x00000000); + + EXPECT_EQ(shiftValue(f, AARCH64_SFT_LSL, 0), f); + + // 64-bit + const uint64_t g = 0x00000000FFFFFFFF; + EXPECT_EQ(shiftValue(g, AARCH64_SFT_LSL, 32), 0xFFFFFFFF00000000); + + const uint64_t h = 0xFFFFFFFF00000000; + EXPECT_EQ(shiftValue(h, AARCH64_SFT_LSL, 63), 0x0000000000000000); + + EXPECT_EQ(shiftValue(h, AARCH64_SFT_LSL, 0), h); +} + +TEST(AArch64AuxiliaryFunctionTest, ShiftValueTest_LSR) { + // 8-bit + const uint8_t a = 0x0F; + EXPECT_EQ(shiftValue(a, AARCH64_SFT_LSR, 4), 0x00); + + const uint8_t b = 0xF0; + EXPECT_EQ(shiftValue(b, AARCH64_SFT_LSR, 7), 0x01); + + EXPECT_EQ(shiftValue(b, AARCH64_SFT_LSR, 0), b); + + // 16-bit + const uint16_t c = 0x00FF; + EXPECT_EQ(shiftValue(c, AARCH64_SFT_LSR, 8), 0x0); + + const uint16_t d = 0xFF00; + EXPECT_EQ(shiftValue(d, AARCH64_SFT_LSR, 15), 0x0001); + + EXPECT_EQ(shiftValue(d, AARCH64_SFT_LSR, 0), d); + + // 32-bit + const uint32_t e = 0x0000FFFF; + EXPECT_EQ(shiftValue(e, AARCH64_SFT_LSR, 16), 0x00000000); + + const uint32_t f = 0xFFFF0000; + EXPECT_EQ(shiftValue(f, AARCH64_SFT_LSR, 31), 0x00000001); + + EXPECT_EQ(shiftValue(f, AARCH64_SFT_LSR, 0), f); + + // 64-bit + const uint64_t g = 0x00000000FFFFFFFF; + EXPECT_EQ(shiftValue(g, AARCH64_SFT_LSR, 32), 0x0000000000000000); + + const uint64_t h = 0xFFFFFFFF00000000; + EXPECT_EQ(shiftValue(h, AARCH64_SFT_LSR, 63), 0x0000000000000001); + + EXPECT_EQ(shiftValue(h, AARCH64_SFT_LSR, 0), h); +} + +TEST(AArch64AuxiliaryFunctionTest, ShiftValueTest_ASR) { + // 8-bit + const uint8_t a = 0x0F; + EXPECT_EQ(shiftValue(a, AARCH64_SFT_ASR, 4), 0x00); + + const uint8_t b = 0xF0; + EXPECT_EQ(shiftValue(b, AARCH64_SFT_ASR, 7), 0xFF); + + EXPECT_EQ(shiftValue(b, AARCH64_SFT_ASR, 0), b); + + // 16-bit + const uint16_t c = 0x00FF; + EXPECT_EQ(shiftValue(c, AARCH64_SFT_ASR, 8), 0x0000); + + const uint16_t d = 0xFF00; + EXPECT_EQ(shiftValue(d, AARCH64_SFT_ASR, 15), 0xFFFF); + + EXPECT_EQ(shiftValue(d, AARCH64_SFT_ASR, 0), d); + + // 32-bit + const uint32_t e = 0x0000FFFF; + EXPECT_EQ(shiftValue(e, 
AARCH64_SFT_ASR, 16), 0x00000000); + + const uint32_t f = 0xFFFF0000; + EXPECT_EQ(shiftValue(f, AARCH64_SFT_ASR, 31), 0xFFFFFFFF); + + EXPECT_EQ(shiftValue(f, AARCH64_SFT_ASR, 0), f); + + // 64-bit + const uint64_t g = 0x00000000FFFFFFFF; + EXPECT_EQ(shiftValue(g, AARCH64_SFT_ASR, 32), 0x0000000000000000); + + const uint64_t h = 0xFFFFFFFF00000000; + EXPECT_EQ(shiftValue(h, AARCH64_SFT_ASR, 63), 0xFFFFFFFFFFFFFFFF); + + EXPECT_EQ(shiftValue(h, AARCH64_SFT_ASR, 0), h); +} + +TEST(AArch64AuxiliaryFunctionTest, ShiftValueTest_ROR) { + // 8-bit + const uint8_t a = 0x0F; + EXPECT_EQ(shiftValue(a, AARCH64_SFT_ROR, 4), 0xF0); + + const uint8_t b = 0xF0; + EXPECT_EQ(shiftValue(b, AARCH64_SFT_ROR, 7), 0xE1); + + EXPECT_EQ(shiftValue(b, AARCH64_SFT_ROR, 0), b); + + // 16-bit + const uint16_t c = 0x00FF; + EXPECT_EQ(shiftValue(c, AARCH64_SFT_ROR, 8), 0xFF00); + + const uint16_t d = 0xFF00; + EXPECT_EQ(shiftValue(d, AARCH64_SFT_ROR, 15), 0xFE01); + + EXPECT_EQ(shiftValue(d, AARCH64_SFT_ROR, 0), d); + + // 32-bit + const uint32_t e = 0x0000FFFF; + EXPECT_EQ(shiftValue(e, AARCH64_SFT_ROR, 16), 0xFFFF0000); + + const uint32_t f = 0xFFFF0000; + EXPECT_EQ(shiftValue(f, AARCH64_SFT_ROR, 31), 0xFFFE0001); + + EXPECT_EQ(shiftValue(f, AARCH64_SFT_ROR, 0), f); + + // 64-bit + const uint64_t g = 0x00000000FFFFFFFF; + EXPECT_EQ(shiftValue(g, AARCH64_SFT_ROR, 32), 0xFFFFFFFF00000000); + + const uint64_t h = 0xFFFFFFFF00000000; + EXPECT_EQ(shiftValue(h, AARCH64_SFT_ROR, 63), 0xFFFFFFFE00000001); + + EXPECT_EQ(shiftValue(h, AARCH64_SFT_ROR, 0), h); +} + +TEST(AArch64AuxiliaryFunctionTest, ShiftValueTest_MSL) { + // 8-bit + const uint8_t a = 0x0F; + EXPECT_EQ(shiftValue(a, AARCH64_SFT_MSL, 4), 0xFF); + + const uint8_t b = 0xF0; + EXPECT_EQ(shiftValue(b, AARCH64_SFT_MSL, 7), 0x7F); + + EXPECT_EQ(shiftValue(b, AARCH64_SFT_MSL, 0), b); + + // 16-bit + const uint16_t c = 0x00FF; + EXPECT_EQ(shiftValue(c, AARCH64_SFT_MSL, 8), 0xFFFF); + + const uint16_t d = 0xFF00; + EXPECT_EQ(shiftValue(d, AARCH64_SFT_MSL, 15), 0x7FFF); + + EXPECT_EQ(shiftValue(d, AARCH64_SFT_MSL, 0), d); + + // 32-bit + const uint32_t e = 0x0000FFFF; + EXPECT_EQ(shiftValue(e, AARCH64_SFT_MSL, 16), 0xFFFFFFFF); + + const uint32_t f = 0xFFFF0000; + EXPECT_EQ(shiftValue(f, AARCH64_SFT_MSL, 31), 0x7FFFFFFF); + + EXPECT_EQ(shiftValue(f, AARCH64_SFT_MSL, 0), f); + + // 64-bit + const uint64_t g = 0x00000000FFFFFFFF; + EXPECT_EQ(shiftValue(g, AARCH64_SFT_MSL, 32), 0xFFFFFFFFFFFFFFFF); + + const uint64_t h = 0xFFFFFFFF00000000; + EXPECT_EQ(shiftValue(h, AARCH64_SFT_MSL, 63), 0x7FFFFFFFFFFFFFFF); + + EXPECT_EQ(shiftValue(h, AARCH64_SFT_MSL, 0), h); +} + +TEST(AArch64AuxiliaryFunctionTest, ShiftValueTest_INVALID) { + // 8-bit + const uint8_t a = 0x0F; + EXPECT_EQ(shiftValue(a, AARCH64_SFT_INVALID, 4), a); + + const uint8_t b = 0xF0; + EXPECT_EQ(shiftValue(b, AARCH64_SFT_INVALID, 7), b); + + EXPECT_EQ(shiftValue(b, AARCH64_SFT_INVALID, 0), b); + + // 16-bit + const uint16_t c = 0x00FF; + EXPECT_EQ(shiftValue(c, AARCH64_SFT_INVALID, 8), c); + + const uint16_t d = 0xFF00; + EXPECT_EQ(shiftValue(d, AARCH64_SFT_INVALID, 15), d); + + EXPECT_EQ(shiftValue(d, AARCH64_SFT_INVALID, 0), d); + + // 32-bit + const uint32_t e = 0x0000FFFF; + EXPECT_EQ(shiftValue(e, AARCH64_SFT_INVALID, 16), e); + + const uint32_t f = 0xFFFF0000; + EXPECT_EQ(shiftValue(f, AARCH64_SFT_INVALID, 31), f); + + EXPECT_EQ(shiftValue(f, AARCH64_SFT_INVALID, 0), f); + + // 64-bit + const uint64_t g = 0x00000000FFFFFFFF; + EXPECT_EQ(shiftValue(g, AARCH64_SFT_INVALID, 32), g); + + const uint64_t h = 
0xFFFFFFFF00000000; + EXPECT_EQ(shiftValue(h, AARCH64_SFT_INVALID, 63), h); + + EXPECT_EQ(shiftValue(h, AARCH64_SFT_INVALID, 0), h); +} + +} // namespace aarch64 +} // namespace arch +} // namespace simeng \ No newline at end of file diff --git a/test/unit/aarch64/ExceptionHandlerTest.cc b/test/unit/aarch64/ExceptionHandlerTest.cc new file mode 100644 index 0000000000..26e6f8dc7a --- /dev/null +++ b/test/unit/aarch64/ExceptionHandlerTest.cc @@ -0,0 +1,697 @@ +#include "../ConfigInit.hh" +#include "../MockCore.hh" +#include "../MockInstruction.hh" +#include "../MockMemoryInterface.hh" +#include "gmock/gmock.h" +#include "simeng/ArchitecturalRegisterFileSet.hh" +#include "simeng/arch/aarch64/Architecture.hh" +#include "simeng/arch/aarch64/ExceptionHandler.hh" +#include "simeng/arch/aarch64/Instruction.hh" + +namespace simeng { +namespace arch { +namespace aarch64 { + +using ::testing::HasSubstr; +using ::testing::Return; +using ::testing::ReturnRef; + +class AArch64ExceptionHandlerTest : public ::testing::Test { + public: + AArch64ExceptionHandlerTest() + : kernel(config::SimInfo::getConfig()["CPU-Info"]["Special-File-Dir-Path"] + .as()), + arch(kernel), + physRegFileSet(config::SimInfo::getArchRegStruct()), + archRegFileSet(physRegFileSet), + core(memory, arch, config::SimInfo::getArchRegStruct()) {} + + protected: + ConfigInit configInit = ConfigInit(config::ISA::AArch64, ""); + + MockMemoryInterface memory; + kernel::Linux kernel; + Architecture arch; + + RegisterFileSet physRegFileSet; + ArchitecturalRegisterFileSet archRegFileSet; + + MockCore core; + + // fdivr z1.s, p0/m, z1.s, z0.s --- Just need a valid instruction to hijack + const std::array validInstrBytes = {0x01, 0x80, 0x8c, 0x65}; + + /** Helper constants for AArch64 general-purpose registers. 
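+   * Each constant is a {RegisterType, tag} pair as used by the register
+   * file set; R8 and R0 are exercised below as the AArch64 Linux syscall
+   * number and first syscall argument registers respectively.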
*/ + static constexpr Register R0 = {RegisterType::GENERAL, 0}; + static constexpr Register R1 = {RegisterType::GENERAL, 1}; + static constexpr Register R2 = {RegisterType::GENERAL, 2}; + static constexpr Register R3 = {RegisterType::GENERAL, 3}; + static constexpr Register R4 = {RegisterType::GENERAL, 4}; + static constexpr Register R5 = {RegisterType::GENERAL, 5}; + static constexpr Register R8 = {RegisterType::GENERAL, 8}; +}; + +// The following exceptions are tested in /test/regression/aarch64/Exception.cc +// - InstructionException::StreamingModeUpdate, +// - InstructionException::ZAregisterStatusUpdate, +// - InstructionException::SMZAUpdate +// All system calls are tested in /test/regression/aarch64/Syscall.cc + +// Test that a syscall is processed sucessfully +TEST_F(AArch64ExceptionHandlerTest, testSyscall) { + // Create "syscall" instruction + uint64_t insnAddr = 0x4; + MacroOp uops; + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + InstructionException exception = InstructionException::SupervisorCall; + std::shared_ptr insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + insn->setInstructionAddress(insnAddr); + + // Setup register file for `uname` syscall (chosen as minimal functionality) + archRegFileSet.set(R0, RegisterValue(1234, 8)); + archRegFileSet.set(R8, RegisterValue(160, 8)); + + // Create ExceptionHandler + ExceptionHandler handler(insn, core, memory, kernel); + + // Tick exceptionHandler + ON_CALL(core, getArchitecturalRegisterFileSet()) + .WillByDefault(ReturnRef(archRegFileSet)); + EXPECT_CALL(core, getArchitecturalRegisterFileSet()).Times(1); + bool retVal = handler.tick(); + ExceptionResult result = handler.getResult(); + + EXPECT_TRUE(retVal); + EXPECT_FALSE(result.fatal); + EXPECT_EQ(result.instructionAddress, insnAddr + 4); + EXPECT_EQ(result.stateChange.type, ChangeType::REPLACEMENT); + std::vector modRegs = {R0}; + EXPECT_EQ(result.stateChange.modifiedRegisters, modRegs); + std::vector modRegVals = {{0ull, 8}}; + EXPECT_EQ(result.stateChange.modifiedRegisterValues, modRegVals); + std::vector modMemTargets = { + {1234, 6}, + {1234 + 65, 25}, + {1234 + (65 * 2), 7}, + {1234 + (65 * 3), 39}, + {1234 + (65 * 4), 8}}; + EXPECT_EQ(result.stateChange.memoryAddresses, modMemTargets); + std::vector modMemVals = { + RegisterValue("Linux"), RegisterValue("simeng.hpc.cs.bris.ac.uk"), + RegisterValue("4.14.0"), + RegisterValue("#1 SimEng Mon Apr 29 16:28:37 UTC 2019"), + RegisterValue("aarch64")}; + EXPECT_EQ(result.stateChange.memoryAddressValues, modMemVals); +} + +// Test that `readStringThen()` operates as expected +TEST_F(AArch64ExceptionHandlerTest, readStringThen) { + // Create new mock instruction and ExceptionHandler + std::shared_ptr uopPtr(new MockInstruction); + ExceptionHandler handler(uopPtr, core, memory, kernel); + + // Initialise variables + size_t retVal = 0; + char* buffer; + buffer = (char*)malloc(256); + for (int i = 0; i < 256; i++) { + buffer[i] = 'q'; + } + uint64_t addr = 1024; + int maxLen = kernel::Linux::LINUX_PATH_MAX; + + memory::MemoryAccessTarget target1 = {addr, 1}; + memory::MemoryReadResult res1 = {target1, RegisterValue(0xAB, 1), 1}; + span res1Span = + span(&res1, 1); + + memory::MemoryAccessTarget target2 = {addr + 1, 1}; + memory::MemoryReadResult res2 = {target2, + RegisterValue(static_cast('\0'), 1), 1}; + span res2Span = + span(&res2, 1); + + // On first call to readStringThen, expect return of false and retVal to still + // be 0, and buffer to be filled 
with `q` + memory::MemoryAccessTarget tar = {addr, 1}; + EXPECT_CALL(memory, requestRead(tar, 0)).Times(1); + bool outcome = + handler.readStringThen(buffer, addr, maxLen, [&retVal](auto length) { + retVal = length; + return true; + }); + EXPECT_FALSE(outcome); + EXPECT_EQ(retVal, 0); + for (int i = 0; i < 256; i++) { + EXPECT_EQ(buffer[i], 'q'); + } + + // ResumeHandling (called on tick()) should now be set to `readStringThen()` + // so call this for our second pass. + ON_CALL(memory, getCompletedReads()) + .WillByDefault(Return(span())); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + outcome = handler.tick(); + // No memory reads completed yet so again expect to return false and no change + // to `retval` or buffer + EXPECT_FALSE(outcome); + EXPECT_EQ(retVal, 0); + for (int i = 0; i < 256; i++) { + EXPECT_EQ(buffer[i], 'q'); + } + + // Call tick() again, but mimic a memory read completing + tar = {addr + 1, 1}; + ON_CALL(memory, getCompletedReads()).WillByDefault(Return(res1Span)); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + EXPECT_CALL(memory, requestRead(tar, 0)).Times(1); + outcome = handler.tick(); + // Completed read but still not complete, so outcome should be false, retVal + // unchanged, but some data in the buffer + EXPECT_FALSE(outcome); + EXPECT_EQ(retVal, 0); + for (int i = 0; i < 256; i++) { + if (i == 0) { + EXPECT_EQ(buffer[i], (char)0xAB); + } else { + EXPECT_EQ(buffer[i], 'q'); + } + } + + // Call tick() for a final time, getting the final read result + ON_CALL(memory, getCompletedReads()).WillByDefault(Return(res2Span)); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + outcome = handler.tick(); + // End of string '\0' found so expect `then()` to have been called, the + // outcome to be true, and the buffer again to have updated + EXPECT_TRUE(outcome); + EXPECT_EQ(retVal, 1); + for (int i = 0; i < 256; i++) { + if (i == 0) { + EXPECT_EQ(buffer[i], (char)0xAB); + } else if (i == 1) { + EXPECT_EQ(buffer[i], '\0'); + } else { + EXPECT_EQ(buffer[i], 'q'); + } + } +} + +// Test that in `readStringThen()` if max length is 0, then is called straight +// away +TEST_F(AArch64ExceptionHandlerTest, readStringThen_maxLen0) { + // Create new mock instruction and ExceptionHandler + std::shared_ptr uopPtr(new MockInstruction); + ExceptionHandler handler(uopPtr, core, memory, kernel); + size_t retVal = 100; + char* buffer; + buffer = (char*)malloc(256); + for (int i = 0; i < 256; i++) { + buffer[i] = 'q'; + } + uint64_t addr = 1024; + int maxLen = 0; + + bool outcome = + handler.readStringThen(buffer, addr, maxLen, [&retVal](auto length) { + retVal = length; + return true; + }); + EXPECT_TRUE(outcome); + EXPECT_EQ(retVal, -1); + for (int i = 0; i < 256; i++) { + EXPECT_EQ(buffer[i], 'q'); + } +} + +// Test that in `readStringThen()` if max length has been met, then() is called +// and no more string is fetched +TEST_F(AArch64ExceptionHandlerTest, readStringThen_maxLenReached) { + // Create new mock instruction and ExceptionHandler + std::shared_ptr uopPtr(new MockInstruction); + ExceptionHandler handler(uopPtr, core, memory, kernel); + + // Initialise variables + size_t retVal = 100; + char* buffer; + buffer = (char*)malloc(256); + for (int i = 0; i < 256; i++) { + buffer[i] = 'q'; + } + uint64_t addr = 1024; + int maxLen = 1; + + memory::MemoryAccessTarget target1 = {addr, 1}; + memory::MemoryReadResult res1 = {target1, RegisterValue(0xAB, 1), 1}; + span res1Span = + span(&res1, 1); + + // On first call to readStringThen, expect return of false and retVal to 
still + // be 0, and buffer to be filled with `q` + memory::MemoryAccessTarget tar = {addr, 1}; + EXPECT_CALL(memory, requestRead(tar, 0)).Times(1); + bool outcome = + handler.readStringThen(buffer, addr, maxLen, [&retVal](auto length) { + retVal = length; + return true; + }); + EXPECT_FALSE(outcome); + EXPECT_EQ(retVal, 100); + for (int i = 0; i < 256; i++) { + EXPECT_EQ(buffer[i], 'q'); + } + + // ResumeHandling (called on tick()) should now be set to `readStringThen()` + // so call this for our second pass. + ON_CALL(memory, getCompletedReads()) + .WillByDefault(Return(span())); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + outcome = handler.tick(); + // No memory reads completed yet so again expect to return false and no change + // to `retval` or buffer + EXPECT_FALSE(outcome); + EXPECT_EQ(retVal, 100); + for (int i = 0; i < 256; i++) { + EXPECT_EQ(buffer[i], 'q'); + } + + // Call tick() again, but mimic a memory read completing + ON_CALL(memory, getCompletedReads()).WillByDefault(Return(res1Span)); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + outcome = handler.tick(); + // Completed read and maxLength reached. Expect then() to have been called, + // the outcome to be true, and the buffer to have updated. RetVal should be + // maxLength + EXPECT_TRUE(outcome); + EXPECT_EQ(retVal, 1); + for (int i = 0; i < 256; i++) { + if (i == 0) { + EXPECT_EQ(buffer[i], (char)0xAB); + } else { + EXPECT_EQ(buffer[i], 'q'); + } + } +} + +// Test that `readBufferThen()` operates as expected +TEST_F(AArch64ExceptionHandlerTest, readBufferThen) { + // Create new mock instruction and ExceptionHandler + std::shared_ptr uopPtr(new MockInstruction); + uopPtr->setSequenceId(5); + ExceptionHandler handler(uopPtr, core, memory, kernel); + + // Initialise needed values for function + uint64_t retVal = 0; + uint64_t ptr = 0; + uint64_t length = 192; + + // Initialise data to "read" from MockMemory + std::vector dataVec(length, 'q'); + std::vector dataVec2(length, 'q'); + // Initialise the two required targets (128-bytes per read request in + // readBufferThen()) + memory::MemoryAccessTarget tar1 = {ptr, 128}; + memory::MemoryAccessTarget tar2 = {ptr + 128, + static_cast(length - 128)}; + // Initialise "responses" from the MockMemory + memory::MemoryReadResult res1 = { + tar1, RegisterValue(dataVec.data() + ptr, 128), uopPtr->getSequenceId()}; + memory::MemoryReadResult res2 = { + tar2, RegisterValue(dataVec.data() + ptr + 128, length - 128), + uopPtr->getSequenceId()}; + + // Confirm that internal dataBuffer_ is empty + EXPECT_EQ(handler.dataBuffer_.size(), 0); + + // Initial call to readBufferThen - expect resumeHandling to be updated to + // readBufferThen and a memory read request to have occurred + EXPECT_CALL(memory, requestRead(tar1, uopPtr->getSequenceId())).Times(1); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + bool outcome = handler.readBufferThen(ptr, length, [&retVal]() { + retVal = 10; + return true; + }); + EXPECT_FALSE(outcome); + EXPECT_EQ(retVal, 0); + EXPECT_EQ(handler.dataBuffer_.size(), 0); + + // Can now call tick() - on call, emulate no reads completed + ON_CALL(memory, getCompletedReads()) + .WillByDefault(Return(span())); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + outcome = handler.tick(); + EXPECT_FALSE(outcome); + EXPECT_EQ(retVal, 0); + EXPECT_EQ(handler.dataBuffer_.size(), 0); + + // Call tick() again, simulating completed read + new read requested as still + // data to fetch + ON_CALL(memory, getCompletedReads()) + 
.WillByDefault(Return(span(&res1, 1))); + // Make sure clearCompletedReads() alters functionality of getCompletedReads() + ON_CALL(memory, clearCompletedReads()) + .WillByDefault(::testing::InvokeWithoutArgs([&]() { + ON_CALL(memory, getCompletedReads()) + .WillByDefault(Return(span())); + })); + EXPECT_CALL(memory, getCompletedReads()).Times(2); + EXPECT_CALL(memory, clearCompletedReads()).Times(1); + EXPECT_CALL(memory, requestRead(tar2, uopPtr->getSequenceId())).Times(1); + outcome = handler.tick(); + EXPECT_FALSE(outcome); + EXPECT_EQ(retVal, 0); + EXPECT_EQ(handler.dataBuffer_.size(), 128); + for (size_t i = 0; i < handler.dataBuffer_.size(); i++) { + EXPECT_EQ(handler.dataBuffer_[i], 'q'); + } + + // One final call to tick() to get last bits of data from memory and call + // then() + ON_CALL(memory, getCompletedReads()) + .WillByDefault(Return(span(&res2, 1))); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + EXPECT_CALL(memory, clearCompletedReads()).Times(1); + outcome = handler.tick(); + EXPECT_TRUE(outcome); + EXPECT_EQ(retVal, 10); + EXPECT_EQ(handler.dataBuffer_.size(), length); + for (uint64_t i = 0; i < length; i++) { + EXPECT_EQ(handler.dataBuffer_[i], static_cast('q')); + } +} + +// Test that `readBufferThen()` calls then if length is 0 +TEST_F(AArch64ExceptionHandlerTest, readBufferThen_length0) { + // Create new mock instruction and ExceptionHandler + std::shared_ptr uopPtr(new MockInstruction); + ExceptionHandler handler(uopPtr, core, memory, kernel); + + const size_t expectedVal = 10; + uint64_t retVal = 0; + uint64_t ptr = 0; + uint64_t length = 0; + + bool outcome = handler.readBufferThen(ptr, length, [&retVal]() { + retVal = 10; + return true; + }); + EXPECT_TRUE(outcome); + EXPECT_EQ(retVal, expectedVal); +} + +// Test that all AArch64 exception types print as expected +TEST_F(AArch64ExceptionHandlerTest, printException) { + ON_CALL(core, getArchitecturalRegisterFileSet()) + .WillByDefault(ReturnRef(archRegFileSet)); + uint64_t insnAddr = 0x4; + MacroOp uops; + + // Create instruction for EncodingUnallocated + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + InstructionException exception = InstructionException::EncodingUnallocated; + std::shared_ptr insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_0(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + std::stringstream buffer; + std::streambuf* sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_0.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), + HasSubstr("[SimEng:ExceptionHandler] Encountered unallocated " + "instruction encoding exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for ExecutionNotYetImplemented + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::ExecutionNotYetImplemented; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_1(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_1.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + 
EXPECT_THAT(buffer.str(), + HasSubstr("[SimEng:ExceptionHandler] Encountered execution " + "not-yet-implemented exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for MisalignedPC + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::MisalignedPC; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_3(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_3.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), + HasSubstr("[SimEng:ExceptionHandler] Encountered misaligned " + "program counter exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for DataAbort + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::DataAbort; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_4(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_4.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT( + buffer.str(), + HasSubstr("[SimEng:ExceptionHandler] Encountered data abort exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for SupervisorCall + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::SupervisorCall; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_5(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_5.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT( + buffer.str(), + HasSubstr( + "[SimEng:ExceptionHandler] Encountered supervisor call exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for HypervisorCall + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::HypervisorCall; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_6(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_6.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT( + buffer.str(), + HasSubstr( + "[SimEng:ExceptionHandler] Encountered hypervisor call exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for SecureMonitorCall + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::SecureMonitorCall; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create 
ExceptionHandler + ExceptionHandler handler_7(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_7.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), HasSubstr("[SimEng:ExceptionHandler] Encountered " + "secure monitor call exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for NoAvailablePort + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::NoAvailablePort; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_8(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_8.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), HasSubstr("[SimEng:ExceptionHandler] Encountered " + "unsupported execution port exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for UnmappedSysReg + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::UnmappedSysReg; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_9(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_9.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), HasSubstr("[SimEng:ExceptionHandler] Encountered " + "unmapped system register exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for StreamingModeUpdate + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::StreamingModeUpdate; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_10(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_10.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), HasSubstr("[SimEng:ExceptionHandler] Encountered " + "streaming mode update exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for ZAregisterStatusUpdate + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::ZAregisterStatusUpdate; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_11(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_11.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), HasSubstr("[SimEng:ExceptionHandler] Encountered 
" + "ZA register status update exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for SMZAUpdate + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::SMZAUpdate; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_12(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_12.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), + HasSubstr("[SimEng:ExceptionHandler] Encountered streaming mode " + "& ZA register status update exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for ZAdisabled + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::ZAdisabled; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_13(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_13.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), + HasSubstr("[SimEng:ExceptionHandler] Encountered ZA register " + "access attempt when disabled exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for SMdisabled + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::SMdisabled; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_14(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_14.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), + HasSubstr("[SimEng:ExceptionHandler] Encountered SME execution " + "attempt when streaming mode disabled exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for default case + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::None; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_15(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_15.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), + HasSubstr("[SimEng:ExceptionHandler] Encountered unknown (id: " + "0) exception")); + buffer.str(std::string()); + uops.clear(); +} + +} // namespace aarch64 +} // namespace arch +} // namespace simeng \ No newline at end of file diff --git a/test/unit/aarch64/InstructionTest.cc b/test/unit/aarch64/InstructionTest.cc new file mode 100644 index 0000000000..8d4b0d87f6 --- /dev/null +++ b/test/unit/aarch64/InstructionTest.cc @@ -0,0 +1,607 
@@ +#include "../ConfigInit.hh" +#include "arch/aarch64/InstructionMetadata.hh" +#include "gmock/gmock.h" +#include "simeng/arch/aarch64/Instruction.hh" +#include "simeng/version.hh" + +namespace simeng { +namespace arch { +namespace aarch64 { + +// AArch64 Instruction Tests +class AArch64InstructionTest : public testing::Test { + public: + AArch64InstructionTest() + : os(config::SimInfo::getConfig()["CPU-Info"]["Special-File-Dir-Path"] + .as()), + arch(os) { + // Create InstructionMetadata objects + cs_open(CS_ARCH_AARCH64, CS_MODE_ARM, &capstoneHandle); + cs_option(capstoneHandle, CS_OPT_DETAIL, CS_OPT_ON); + + // Create instructions which cover the 3 main types: Arithmetic, Memory, + // Branch. This allows for full testing of the Instruction class. + + // fdiv + cs_insn rawInsn_fdiv; + cs_detail rawDetail_fdiv; + rawInsn_fdiv.detail = &rawDetail_fdiv; + size_t size_fdiv = 4; + uint64_t address_fdiv = 0; + const uint8_t* encoding_fdiv = + reinterpret_cast(fdivInstrBytes.data()); + cs_disasm_iter(capstoneHandle, &encoding_fdiv, &size_fdiv, &address_fdiv, + &rawInsn_fdiv); + fdivMetadata = std::make_unique(rawInsn_fdiv); + + // ldp + cs_insn rawInsn_ldp; + cs_detail rawDetail_ldp; + rawInsn_ldp.detail = &rawDetail_ldp; + size_t size_ldp = 4; + uint64_t address_ldp = 0; + const uint8_t* encoding_ldp = + reinterpret_cast(ldpInstrBytes.data()); + cs_disasm_iter(capstoneHandle, &encoding_ldp, &size_ldp, &address_ldp, + &rawInsn_ldp); + ldpMetadata = std::make_unique(rawInsn_ldp); + + // cbz + cs_insn rawInsn_cbz; + cs_detail rawDetail_cbz; + rawInsn_cbz.detail = &rawDetail_cbz; + size_t size_cbz = 4; + uint64_t address_cbz = 0; + const uint8_t* encoding_cbz = + reinterpret_cast(cbzInstrBytes.data()); + cs_disasm_iter(capstoneHandle, &encoding_cbz, &size_cbz, &address_cbz, + &rawInsn_cbz); + cbzMetadata = std::make_unique(rawInsn_cbz); + + // psel + cs_insn rawInsn_psel; + cs_detail rawDetail_psel; + rawInsn_psel.detail = &rawDetail_psel; + size_t size_psel = 4; + uint64_t address_psel = 0; + const uint8_t* encoding_psel = + reinterpret_cast(pselInstrBytes.data()); + cs_disasm_iter(capstoneHandle, &encoding_psel, &size_psel, &address_psel, + &rawInsn_psel); + pselMetadata = std::make_unique(rawInsn_psel); + + const uint8_t* badEncoding = + reinterpret_cast(invalidInstrBytes.data()); + invalidMetadata = std::make_unique(badEncoding); + } + + ~AArch64InstructionTest() { cs_close(&capstoneHandle); } + + protected: + ConfigInit configInit = ConfigInit(config::ISA::AArch64, ""); + + // fdivr z1.s, p0/m, z1.s, z0.s + std::array fdivInstrBytes = {0x01, 0x80, 0x8c, 0x65}; + // ldp x1, x2, [x3] + std::array ldpInstrBytes = {0x61, 0x08, 0x40, 0xA9}; + // cbz x2, #0x28 + std::array cbzInstrBytes = {0x42, 0x01, 0x00, 0xB4}; + // psel p4, p0, p2.s[w13, 0] + std::array pselInstrBytes = {0x44, 0x40, 0x31, 0x25}; + std::array invalidInstrBytes = {0x20, 0x00, 0x02, 0x8c}; + + // A Capstone decoding library handle, for decoding instructions. 
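// Aside on the four decode stanzas in the constructor above: they differ only
// in the byte array being decoded, so a small helper along the following lines
// could fold them into one call. This is a sketch only; `decodeToMetadata` is
// a hypothetical name and is not part of this patch's fixture, which keeps the
// decoding inline.
std::unique_ptr<InstructionMetadata> decodeToMetadata(
    csh handle, const std::array<uint8_t, 4>& bytes) {
  cs_insn rawInsn;
  cs_detail rawDetail;
  rawInsn.detail = &rawDetail;
  size_t size = bytes.size();
  uint64_t address = 0;
  const uint8_t* encoding = bytes.data();
  cs_disasm_iter(handle, &encoding, &size, &address, &rawInsn);
  return std::make_unique<InstructionMetadata>(rawInsn);
}
// Usage in the constructor would then be, for example:
//   fdivMetadata = decodeToMetadata(capstoneHandle, fdivInstrBytes);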
+ csh capstoneHandle; + + kernel::Linux os; + Architecture arch; + + std::unique_ptr fdivMetadata; + std::unique_ptr ldpMetadata; + std::unique_ptr cbzMetadata; + std::unique_ptr pselMetadata; + std::unique_ptr invalidMetadata; + std::unique_ptr uopInfo; + InstructionException exception; +}; + +// Test that a valid instruction is created correctly +TEST_F(AArch64InstructionTest, validInsn) { + // Insn is `fdivr z1.s, p0/m, z1.s, z0.s` + Instruction insn = Instruction(arch, *fdivMetadata.get(), MicroOpInfo()); + // Define instruction's registers + std::vector destRegs = {{RegisterType::VECTOR, 1}}; + std::vector srcRegs = {{RegisterType::PREDICATE, 0}, + {RegisterType::VECTOR, 1}, + {RegisterType::VECTOR, 0}}; + const std::vector ports = {1, 2, 3}; + insn.setExecutionInfo({3, 4, ports}); + insn.setInstructionAddress(0x48); + insn.setInstructionId(11); + insn.setSequenceId(12); + + // Ensure that all instruction values are as expected after creation + BranchPrediction pred = {false, 0}; + bool matchingPred = (insn.getBranchPrediction() == pred) ? true : false; + EXPECT_EQ(&insn.getArchitecture(), &arch); + EXPECT_EQ(insn.getBranchAddress(), 0); + EXPECT_TRUE(matchingPred); + EXPECT_EQ(insn.getBranchType(), BranchType::Unknown); + EXPECT_EQ(insn.getData().size(), 0); + EXPECT_EQ(insn.getDestinationRegisters().size(), destRegs.size()); + for (size_t i = 0; i < destRegs.size(); i++) { + EXPECT_EQ(insn.getDestinationRegisters()[i], destRegs[i]); + } + EXPECT_EQ(insn.getException(), InstructionException::None); + EXPECT_EQ(insn.getGeneratedAddresses().size(), 0); + EXPECT_EQ(insn.getGroup(), InstructionGroups::SVE_DIV_OR_SQRT); + EXPECT_EQ(insn.getInstructionAddress(), 0x48); + EXPECT_EQ(insn.getInstructionId(), 11); + EXPECT_EQ(insn.getKnownOffset(), 0); + EXPECT_EQ(insn.getLatency(), 3); + EXPECT_EQ(insn.getLSQLatency(), 1); + EXPECT_EQ(&insn.getMetadata(), fdivMetadata.get()); + EXPECT_EQ(insn.getMicroOpIndex(), 0); + // Results vector resized at decode + EXPECT_EQ(insn.getResults().size(), 1); + EXPECT_EQ(insn.getSequenceId(), 12); + // Operands vector resized at decode + EXPECT_EQ(insn.getSourceOperands().size(), 3); + EXPECT_EQ(insn.getSourceRegisters().size(), srcRegs.size()); + for (size_t i = 0; i < srcRegs.size(); i++) { + EXPECT_EQ(insn.getSourceRegisters()[i], srcRegs[i]); + EXPECT_FALSE(insn.isOperandReady(i)); + } + EXPECT_EQ(insn.getStallCycles(), 4); + EXPECT_EQ(insn.getSupportedPorts(), ports); + + EXPECT_FALSE(insn.canExecute()); + EXPECT_FALSE(insn.isStoreAddress()); + EXPECT_FALSE(insn.isStoreData()); + EXPECT_FALSE(insn.isLoad()); + EXPECT_FALSE(insn.isBranch()); + EXPECT_FALSE(insn.exceptionEncountered()); + EXPECT_FALSE(insn.hasExecuted()); + EXPECT_FALSE(insn.canCommit()); + EXPECT_TRUE(insn.hasAllData()); + EXPECT_FALSE(insn.wasBranchTaken()); + EXPECT_FALSE(insn.isFlushed()); + EXPECT_FALSE(insn.isMicroOp()); + EXPECT_TRUE(insn.isLastMicroOp()); + EXPECT_FALSE(insn.isWaitingCommit()); +} + +// Test that an invalid instruction can be created - invalid due to byte stream +TEST_F(AArch64InstructionTest, invalidInsn_1) { + Instruction insn = Instruction(arch, *invalidMetadata.get(), MicroOpInfo()); + // Define instruction's registers + std::vector destRegs = {}; + std::vector srcRegs = {}; + const std::vector ports = {}; + insn.setExecutionInfo({1, 1, ports}); + insn.setInstructionAddress(0x44); + insn.setInstructionId(13); + insn.setSequenceId(14); + + // Ensure that all instruction values are as expected after creation + BranchPrediction pred = {false, 0}; + bool 
matchingPred = (insn.getBranchPrediction() == pred) ? true : false; + EXPECT_EQ(&insn.getArchitecture(), &arch); + EXPECT_EQ(insn.getBranchAddress(), 0); + EXPECT_TRUE(matchingPred); + EXPECT_EQ(insn.getBranchType(), BranchType::Unknown); + EXPECT_EQ(insn.getData().size(), 0); + EXPECT_EQ(insn.getDestinationRegisters().size(), destRegs.size()); + for (size_t i = 0; i < destRegs.size(); i++) { + EXPECT_EQ(insn.getDestinationRegisters()[i], destRegs[i]); + } + EXPECT_EQ(insn.getException(), InstructionException::EncodingUnallocated); + EXPECT_EQ(insn.getGeneratedAddresses().size(), 0); + // Default Group for instruction that is not decoded + EXPECT_EQ(insn.getGroup(), InstructionGroups::INT_SIMPLE_ARTH_NOSHIFT); + EXPECT_EQ(insn.getInstructionAddress(), 0x44); + EXPECT_EQ(insn.getInstructionId(), 13); + EXPECT_EQ(insn.getKnownOffset(), 0); + EXPECT_EQ(insn.getLatency(), 1); + EXPECT_EQ(insn.getLSQLatency(), 1); + EXPECT_EQ(&insn.getMetadata(), invalidMetadata.get()); + EXPECT_EQ(insn.getMicroOpIndex(), 0); + // Results vector resized at decode + EXPECT_EQ(insn.getResults().size(), 0); + EXPECT_EQ(insn.getSequenceId(), 14); + // Operands vector resized at decode + EXPECT_EQ(insn.getSourceOperands().size(), 0); + EXPECT_EQ(insn.getSourceRegisters().size(), srcRegs.size()); + for (size_t i = 0; i < srcRegs.size(); i++) { + EXPECT_EQ(insn.getSourceRegisters()[i], srcRegs[i]); + EXPECT_FALSE(insn.isOperandReady(i)); + } + EXPECT_EQ(insn.getStallCycles(), 1); + EXPECT_EQ(insn.getSupportedPorts(), ports); + + EXPECT_TRUE(insn.canExecute()); + EXPECT_FALSE(insn.isStoreAddress()); + EXPECT_FALSE(insn.isStoreData()); + EXPECT_FALSE(insn.isLoad()); + EXPECT_FALSE(insn.isBranch()); + EXPECT_TRUE(insn.exceptionEncountered()); + EXPECT_FALSE(insn.hasExecuted()); + EXPECT_FALSE(insn.canCommit()); + EXPECT_TRUE(insn.hasAllData()); + EXPECT_FALSE(insn.wasBranchTaken()); + EXPECT_FALSE(insn.isFlushed()); + EXPECT_FALSE(insn.isMicroOp()); + EXPECT_TRUE(insn.isLastMicroOp()); + EXPECT_FALSE(insn.isWaitingCommit()); +} + +// Test that an invalid instruction can be created - invalid due to exception +// provided +TEST_F(AArch64InstructionTest, invalidInsn_2) { + Instruction insn = Instruction(arch, *invalidMetadata.get(), + InstructionException::HypervisorCall); + // Define instruction's registers + std::vector destRegs = {}; + std::vector srcRegs = {}; + const std::vector ports = {}; + insn.setExecutionInfo({1, 1, ports}); + insn.setInstructionAddress(0x43); + insn.setInstructionId(15); + insn.setSequenceId(16); + + // Ensure that all instruction values are as expected after creation + BranchPrediction pred = {false, 0}; + bool matchingPred = (insn.getBranchPrediction() == pred) ? 
true : false; + EXPECT_EQ(&insn.getArchitecture(), &arch); + EXPECT_EQ(insn.getBranchAddress(), 0); + EXPECT_TRUE(matchingPred); + EXPECT_EQ(insn.getBranchType(), BranchType::Unknown); + EXPECT_EQ(insn.getData().size(), 0); + EXPECT_EQ(insn.getDestinationRegisters().size(), destRegs.size()); + for (size_t i = 0; i < destRegs.size(); i++) { + EXPECT_EQ(insn.getDestinationRegisters()[i], destRegs[i]); + } + EXPECT_EQ(insn.getException(), InstructionException::HypervisorCall); + EXPECT_EQ(insn.getGeneratedAddresses().size(), 0); + // Default Group for instruction that is not decoded + EXPECT_EQ(insn.getGroup(), InstructionGroups::INT_SIMPLE_ARTH_NOSHIFT); + EXPECT_EQ(insn.getInstructionAddress(), 0x43); + EXPECT_EQ(insn.getInstructionId(), 15); + EXPECT_EQ(insn.getKnownOffset(), 0); + EXPECT_EQ(insn.getLatency(), 1); + EXPECT_EQ(insn.getLSQLatency(), 1); + EXPECT_EQ(&insn.getMetadata(), invalidMetadata.get()); + EXPECT_EQ(insn.getMicroOpIndex(), 0); + // Results vector resized at decode + EXPECT_EQ(insn.getResults().size(), 0); + EXPECT_EQ(insn.getSequenceId(), 16); + // Operands vector resized at decode + EXPECT_EQ(insn.getSourceOperands().size(), 0); + EXPECT_EQ(insn.getSourceRegisters().size(), srcRegs.size()); + for (size_t i = 0; i < srcRegs.size(); i++) { + EXPECT_EQ(insn.getSourceRegisters()[i], srcRegs[i]); + EXPECT_FALSE(insn.isOperandReady(i)); + } + EXPECT_EQ(insn.getStallCycles(), 1); + EXPECT_EQ(insn.getSupportedPorts(), ports); + + EXPECT_TRUE(insn.canExecute()); + EXPECT_FALSE(insn.isStoreAddress()); + EXPECT_FALSE(insn.isStoreData()); + EXPECT_FALSE(insn.isLoad()); + EXPECT_FALSE(insn.isBranch()); + EXPECT_TRUE(insn.exceptionEncountered()); + EXPECT_FALSE(insn.hasExecuted()); + EXPECT_FALSE(insn.canCommit()); + EXPECT_TRUE(insn.hasAllData()); + EXPECT_FALSE(insn.wasBranchTaken()); + EXPECT_FALSE(insn.isFlushed()); + EXPECT_FALSE(insn.isMicroOp()); + EXPECT_TRUE(insn.isLastMicroOp()); + EXPECT_FALSE(insn.isWaitingCommit()); +} + +// Test to ensure that source and operand registers can be renamed correctly +TEST_F(AArch64InstructionTest, renameRegs) { + // Insn is `fdivr z1.s, p0/m, z1.s, z0.s` + Instruction insn = Instruction(arch, *fdivMetadata.get(), MicroOpInfo()); + // Define instruction's registers + std::vector destRegs = {{RegisterType::VECTOR, 1}}; + std::vector srcRegs = {{RegisterType::PREDICATE, 0}, + {RegisterType::VECTOR, 1}, + {RegisterType::VECTOR, 0}}; + // Ensure registers decoded correctly + EXPECT_EQ(insn.getSourceRegisters().size(), srcRegs.size()); + for (size_t i = 0; i < srcRegs.size(); i++) { + EXPECT_EQ(insn.getSourceRegisters()[i], srcRegs[i]); + } + EXPECT_EQ(insn.getDestinationRegisters().size(), destRegs.size()); + for (size_t i = 0; i < destRegs.size(); i++) { + EXPECT_EQ(insn.getDestinationRegisters()[i], destRegs[i]); + } + + // Define renamed registers + std::vector destRegs_new = {{RegisterType::VECTOR, 24}}; + std::vector srcRegs_new = {{RegisterType::PREDICATE, 0}, + {RegisterType::VECTOR, 97}, + {RegisterType::VECTOR, 0}}; + insn.renameDestination(0, destRegs_new[0]); + insn.renameSource(1, srcRegs_new[1]); + // Ensure renaming functionality works as expected + EXPECT_EQ(insn.getSourceRegisters().size(), srcRegs_new.size()); + for (size_t i = 0; i < srcRegs_new.size(); i++) { + EXPECT_EQ(insn.getSourceRegisters()[i], srcRegs_new[i]); + } + EXPECT_EQ(insn.getDestinationRegisters().size(), destRegs_new.size()); + for (size_t i = 0; i < destRegs_new.size(); i++) { + EXPECT_EQ(insn.getDestinationRegisters()[i], destRegs_new[i]); + } +} + +// 
Test that operand values can be properly supplied and change the state of +// `canExecute` +TEST_F(AArch64InstructionTest, supplyOperand) { + // Insn is `fdivr z1.s, p0/m, z1.s, z0.s` + Instruction insn = Instruction(arch, *fdivMetadata.get(), MicroOpInfo()); + // Define instruction's registers + std::vector destRegs = {{RegisterType::VECTOR, 1}}; + std::vector srcRegs = {{RegisterType::PREDICATE, 0}, + {RegisterType::VECTOR, 1}, + {RegisterType::VECTOR, 0}}; + // Check initial state is as expected + EXPECT_FALSE(insn.canExecute()); + EXPECT_FALSE(insn.isOperandReady(0)); + EXPECT_FALSE(insn.isOperandReady(1)); + EXPECT_FALSE(insn.isOperandReady(2)); + + // Define mock register values for source registers + RegisterValue vec = {0xABBACAFE01234567, 256}; + uint64_t pred_vals[4] = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}; + RegisterValue pred = {pred_vals, 32}; + // Supply values for all source registers + insn.supplyOperand(0, pred); + insn.supplyOperand(1, vec); + insn.supplyOperand(2, vec); + // Ensure Instruction state has updated as expected + EXPECT_TRUE(insn.canExecute()); + EXPECT_TRUE(insn.isOperandReady(0)); + EXPECT_TRUE(insn.isOperandReady(1)); + EXPECT_TRUE(insn.isOperandReady(2)); + auto sourceVals = insn.getSourceOperands(); + EXPECT_EQ(sourceVals.size(), 3); + EXPECT_EQ(sourceVals[0], pred); + EXPECT_EQ(sourceVals[1], vec); + EXPECT_EQ(sourceVals[2], vec); + + // Ensure instruction execute updates instruction state as expected, and + // produces the expected result. + EXPECT_FALSE(insn.hasExecuted()); + insn.execute(); + EXPECT_TRUE(insn.hasExecuted()); + auto results = insn.getResults(); + float vals[4] = {1.f, 1.f, std::nanf(""), std::nanf("")}; + RegisterValue refRes = {vals, 256}; + EXPECT_EQ(results.size(), 1); + EXPECT_EQ(results[0], refRes); +} + +// Test that data can be supplied successfully +TEST_F(AArch64InstructionTest, supplyData) { + // Insn is `ldp x1, x2, [x3]` + Instruction insn = Instruction(arch, *ldpMetadata.get(), MicroOpInfo()); + // Define instruction's registers + std::vector destRegs = {{RegisterType::GENERAL, 1}, + {RegisterType::GENERAL, 2}}; + std::vector srcRegs = {{RegisterType::GENERAL, 3}}; + + // Check instruction created correctly + EXPECT_FALSE(insn.exceptionEncountered()); + EXPECT_EQ(&insn.getMetadata(), ldpMetadata.get()); + EXPECT_EQ(insn.getGroup(), InstructionGroups::LOAD_INT); + + // Check source and destination registers extracted correctly + EXPECT_EQ(insn.getSourceRegisters().size(), srcRegs.size()); + for (size_t i = 0; i < srcRegs.size(); i++) { + EXPECT_EQ(insn.getSourceRegisters()[i], srcRegs[i]); + } + EXPECT_EQ(insn.getDestinationRegisters().size(), destRegs.size()); + for (size_t i = 0; i < destRegs.size(); i++) { + EXPECT_EQ(insn.getDestinationRegisters()[i], destRegs[i]); + } + + // Supply needed operands + EXPECT_FALSE(insn.isOperandReady(0)); + RegisterValue addr = {0x480, 8}; + insn.supplyOperand(0, addr); + EXPECT_TRUE(insn.isOperandReady(0)); + + // Generate memory addresses + EXPECT_EQ(insn.getGeneratedAddresses().size(), 0); + insn.generateAddresses(); + auto generatedAddresses = insn.getGeneratedAddresses(); + EXPECT_EQ(generatedAddresses.size(), 2); + for (size_t i = 0; i < generatedAddresses.size(); i++) { + EXPECT_EQ(generatedAddresses[i].address, 0x480 + (i * 0x8)); + EXPECT_EQ(generatedAddresses[i].size, 8); + } + + // Supply required data + EXPECT_FALSE(insn.hasAllData()); + std::vector data = {{123, 8}, {456, 8}}; + EXPECT_EQ(generatedAddresses.size(), data.size()); + for (size_t i = 0; i < 
generatedAddresses.size(); i++) { + insn.supplyData(generatedAddresses[i].address, data[i]); + } + // Ensure data was supplied correctly + auto retrievedData = insn.getData(); + for (size_t i = 0; i < retrievedData.size(); i++) { + EXPECT_EQ(retrievedData[i], data[i]); + } + EXPECT_TRUE(insn.hasAllData()); +} + +// Test DataAbort Exception is triggered correctly when supplying data +TEST_F(AArch64InstructionTest, supplyData_dataAbort) { + // Insn is `ldp x1, x2, [x3]` + Instruction insn = Instruction(arch, *ldpMetadata.get(), MicroOpInfo()); + // Define instruction's registers + std::vector destRegs = {{RegisterType::GENERAL, 1}, + {RegisterType::GENERAL, 2}}; + std::vector srcRegs = {{RegisterType::GENERAL, 3}}; + + // Check instruction created correctly + EXPECT_EQ(&insn.getMetadata(), ldpMetadata.get()); + EXPECT_EQ(insn.getGroup(), InstructionGroups::LOAD_INT); + + // Supply needed operands + EXPECT_FALSE(insn.isOperandReady(0)); + RegisterValue addr = {0x480, 8}; + insn.supplyOperand(0, addr); + EXPECT_TRUE(insn.isOperandReady(0)); + + // Generate memory addresses + EXPECT_EQ(insn.getGeneratedAddresses().size(), 0); + insn.generateAddresses(); + auto generatedAddresses = insn.getGeneratedAddresses(); + EXPECT_EQ(generatedAddresses.size(), 2); + for (size_t i = 0; i < generatedAddresses.size(); i++) { + EXPECT_EQ(generatedAddresses[i].address, 0x480 + (i * 0x8)); + EXPECT_EQ(generatedAddresses[i].size, 8); + } + + // Trigger data abort + EXPECT_FALSE(insn.exceptionEncountered()); + insn.supplyData(generatedAddresses[0].address, RegisterValue()); + EXPECT_TRUE(insn.exceptionEncountered()); + EXPECT_EQ(insn.getException(), InstructionException::DataAbort); +} + +// Test that a correct prediction (branch taken) is handled correctly +TEST_F(AArch64InstructionTest, correctPred_taken) { + // insn is `cbz x2, #0x28` + Instruction insn = Instruction(arch, *cbzMetadata.get(), MicroOpInfo()); + insn.setInstructionAddress(80); + + // Check initial state of an instruction's branch related options + BranchPrediction pred = {false, 0}; + bool matchingPred = (insn.getBranchPrediction() == pred); + EXPECT_TRUE(matchingPred); + EXPECT_FALSE(insn.wasBranchTaken()); + EXPECT_EQ(insn.getBranchAddress(), 0); + EXPECT_EQ(insn.getBranchType(), BranchType::Conditional); + EXPECT_TRUE(insn.isBranch()); + + // Test a correct prediction where branch is taken is handled correctly + pred = {true, 80 + 0x28}; + insn.setBranchPrediction(pred); + matchingPred = (insn.getBranchPrediction() == pred); + insn.supplyOperand(0, RegisterValue(0, 8)); + insn.execute(); + EXPECT_TRUE(matchingPred); + EXPECT_TRUE(insn.wasBranchTaken()); + EXPECT_FALSE(insn.wasBranchMispredicted()); + EXPECT_EQ(insn.getBranchAddress(), pred.target); +} + +// Test that a correct prediction (branch not taken) is handled correctly +TEST_F(AArch64InstructionTest, correctPred_notTaken) { + // insn is `cbz x2, #0x28` + Instruction insn = Instruction(arch, *cbzMetadata.get(), MicroOpInfo()); + insn.setInstructionAddress(80); + + // Check initial state of an instruction's branch related options + BranchPrediction pred = {false, 0}; + bool matchingPred = (insn.getBranchPrediction() == pred); + EXPECT_TRUE(matchingPred); + EXPECT_FALSE(insn.wasBranchTaken()); + EXPECT_EQ(insn.getBranchAddress(), 0); + EXPECT_EQ(insn.getBranchType(), BranchType::Conditional); + EXPECT_TRUE(insn.isBranch()); + + // Test a correct prediction where a branch isn't taken is handled correctly + pred = {false, 80 + 4}; + insn.setBranchPrediction(pred); + matchingPred = 
(insn.getBranchPrediction() == pred); + insn.supplyOperand(0, RegisterValue(1, 8)); + insn.execute(); + EXPECT_TRUE(matchingPred); + EXPECT_FALSE(insn.wasBranchTaken()); + EXPECT_FALSE(insn.wasBranchMispredicted()); + EXPECT_EQ(insn.getBranchAddress(), pred.target); +} + +// Test that an incorrect prediction (wrong target) is handled correctly +TEST_F(AArch64InstructionTest, incorrectPred_target) { + // insn is `cbz x2, #0x28` + Instruction insn = Instruction(arch, *cbzMetadata.get(), MicroOpInfo()); + insn.setInstructionAddress(100); + + // Check initial state of an instruction's branch related options + BranchPrediction pred = {false, 0}; + bool matchingPred = (insn.getBranchPrediction() == pred); + EXPECT_TRUE(matchingPred); + EXPECT_FALSE(insn.wasBranchTaken()); + EXPECT_EQ(insn.getBranchAddress(), 0); + EXPECT_EQ(insn.getBranchType(), BranchType::Conditional); + EXPECT_TRUE(insn.isBranch()); + + // Test an incorrect prediction is handled correctly - target is wrong + pred = {true, 80 + 0x28}; + insn.setBranchPrediction(pred); + matchingPred = (insn.getBranchPrediction() == pred); + insn.supplyOperand(0, RegisterValue(0, 8)); + insn.execute(); + EXPECT_TRUE(matchingPred); + EXPECT_TRUE(insn.wasBranchTaken()); + EXPECT_TRUE(insn.wasBranchMispredicted()); + EXPECT_EQ(insn.getBranchAddress(), 100 + 0x28); +} + +// Test that an incorrect prediction (wrong taken) is handled correctly +TEST_F(AArch64InstructionTest, incorrectPred_taken) { + // insn is `cbz x2, #0x28` + Instruction insn = Instruction(arch, *cbzMetadata.get(), MicroOpInfo()); + insn.setInstructionAddress(100); + + // Check initial state of an instruction's branch related options + BranchPrediction pred = {false, 0}; + bool matchingPred = (insn.getBranchPrediction() == pred); + EXPECT_TRUE(matchingPred); + EXPECT_FALSE(insn.wasBranchTaken()); + EXPECT_EQ(insn.getBranchAddress(), 0); + EXPECT_EQ(insn.getBranchType(), BranchType::Conditional); + EXPECT_TRUE(insn.isBranch()); + + // Test an incorrect prediction is handled correctly - taken is wrong + pred = {true, 100 + 0x28}; + insn.setBranchPrediction(pred); + matchingPred = (insn.getBranchPrediction() == pred); + insn.supplyOperand(0, RegisterValue(1, 8)); + insn.execute(); + EXPECT_TRUE(matchingPred); + EXPECT_FALSE(insn.wasBranchTaken()); + EXPECT_TRUE(insn.wasBranchMispredicted()); + EXPECT_EQ(insn.getBranchAddress(), 100 + 4); +} + +// Test commit and flush setters such as `setFlushed`, `setCommitReady`, etc. 
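// Aside on the four branch-prediction tests above, before the setters test
// that follows: they all assert the same relationship between a prediction and
// the executed outcome. A simplified sketch of the condition implied by the
// wasBranchMispredicted() assertions (assumed semantics, not necessarily
// SimEng's exact implementation):
bool mispredictedSketch(bool predictedTaken, uint64_t predictedTarget,
                        bool actuallyTaken, uint64_t actualTarget) {
  // A wrong taken/not-taken decision is always a mispredict; a taken branch
  // must additionally have been predicted with the correct target address.
  return (predictedTaken != actuallyTaken) ||
         (actuallyTaken && predictedTarget != actualTarget);
}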
+TEST_F(AArch64InstructionTest, setters) { + // Insn is `fdivr z1.s, p0/m, z1.s, z0.s` + Instruction insn = Instruction(arch, *fdivMetadata.get(), MicroOpInfo()); + + EXPECT_FALSE(insn.canCommit()); + insn.setCommitReady(); + EXPECT_TRUE(insn.canCommit()); + + EXPECT_FALSE(insn.isFlushed()); + insn.setFlushed(); + EXPECT_TRUE(insn.isFlushed()); + + EXPECT_FALSE(insn.isWaitingCommit()); + insn.setWaitingCommit(); + EXPECT_TRUE(insn.isWaitingCommit()); +} + +} // namespace aarch64 +} // namespace arch +} // namespace simeng \ No newline at end of file diff --git a/test/unit/aarch64/OperandContainerTest.cc b/test/unit/aarch64/OperandContainerTest.cc new file mode 100644 index 0000000000..dd0ed1b655 --- /dev/null +++ b/test/unit/aarch64/OperandContainerTest.cc @@ -0,0 +1,100 @@ +#include "gmock/gmock.h" +#include "simeng/RegisterValue.hh" +#include "simeng/arch/aarch64/operandContainer.hh" +#include "simeng/version.hh" + +namespace simeng { +namespace arch { +namespace aarch64 { + +TEST(AArch64OperandContainerTest, correctInit) { + operandContainer cont; + + EXPECT_EQ(cont.size(), MAX_SOURCE_REGISTERS); + for (int i = 0; i < MAX_SOURCE_REGISTERS; i++) { + EXPECT_EQ(cont[i], ""); + } + + if (strcmp(SIMENG_BUILD_TYPE, "Debug") == 0) { + // `resize()` will only work if std::vector is used. + ASSERT_DEATH( + { cont.resize(MAX_SOURCE_REGISTERS * 2); }, + "resize can only be called when the active member is std::vector"); + } +} + +TEST(AArch64OperandContainerTest, useVec) { + operandContainer cont; + EXPECT_EQ(cont.size(), MAX_SOURCE_REGISTERS); + for (int i = 0; i < MAX_SOURCE_REGISTERS; i++) { + EXPECT_EQ(cont[i], ""); + } + + // Initialise some of the data + cont[0] = "elem0"; + cont[1] = "elem1"; + cont[2] = "elem2"; + + // Convert to Vector + cont.addSMEOperand(10); + // Check size is correct after addSMEOperand call + EXPECT_EQ(cont.size(), MAX_SOURCE_REGISTERS + ADDITIONAL_SME_REGISTERS + 10); + // Check initialised data was maintained + for (size_t i = 0; i < cont.size(); i++) { + if (i == 0 || i == 1 || i == 2) { + EXPECT_EQ(cont[i], "elem" + std::to_string(i)); + } else { + EXPECT_EQ(cont[i], ""); + } + } + + // Ensure re-size works as expected + cont.resize(2); + EXPECT_EQ(cont.size(), 2); + EXPECT_EQ(cont[0], "elem0"); + EXPECT_EQ(cont[1], "elem1"); +} + +// Call addSMEOperand multiple times to ensure correct behaviour +TEST(AArch64OperandContainerTest, multipleMakeSMECalls) { + operandContainer cont; + EXPECT_EQ(cont.size(), MAX_SOURCE_REGISTERS); + for (int i = 0; i < MAX_SOURCE_REGISTERS; i++) { + EXPECT_EQ(cont[i], ""); + } + + // Initialise some of the data + cont[0] = "elem0"; + cont[1] = "elem1"; + cont[2] = "elem2"; + + // Call addSMEOperand for the first time - convert to Vector + cont.addSMEOperand(10); + // Check size is correct after addSMEOperand call + size_t vecSize = MAX_SOURCE_REGISTERS + ADDITIONAL_SME_REGISTERS + 10; + EXPECT_EQ(cont.size(), vecSize); + // Check initialised data was maintained + for (size_t i = 0; i < cont.size(); i++) { + if (i == 0 || i == 1 || i == 2) { + EXPECT_EQ(cont[i], "elem" + std::to_string(i)); + } else { + EXPECT_EQ(cont[i], ""); + } + } + + // Call addSMEOperand again, ensuring size grows as expected + cont.addSMEOperand(10); + EXPECT_EQ(cont.size(), vecSize + 10); + // Check initialised data was maintained + for (size_t i = 0; i < cont.size(); i++) { + if (i == 0 || i == 1 || i == 2) { + EXPECT_EQ(cont[i], "elem" + std::to_string(i)); + } else { + EXPECT_EQ(cont[i], ""); + } + } +} + +} // namespace aarch64 +} // namespace arch +} // 
namespace simeng \ No newline at end of file diff --git a/test/unit/data/stream-aarch64.elf b/test/unit/data/stream-aarch64.elf new file mode 100755 index 0000000000..881a5150a0 Binary files /dev/null and b/test/unit/data/stream-aarch64.elf differ diff --git a/test/unit/data/stream.rv32ima.elf b/test/unit/data/stream.rv32ima.elf new file mode 100644 index 0000000000..ded6502b12 Binary files /dev/null and b/test/unit/data/stream.rv32ima.elf differ diff --git a/test/unit/pipeline/A64FXPortAllocatorTest.cc b/test/unit/pipeline/A64FXPortAllocatorTest.cc index e949f6156b..f2923f46ba 100644 --- a/test/unit/pipeline/A64FXPortAllocatorTest.cc +++ b/test/unit/pipeline/A64FXPortAllocatorTest.cc @@ -8,19 +8,29 @@ namespace simeng { namespace pipeline { -std::vector rsFreeEntries = {20, 20, 10, 10, 19}; +class A64FXPortAllocatorTest : public testing::Test { + public: + A64FXPortAllocatorTest() : portAllocator(portArrangement) { + portAllocator.setRSSizeGetter( + [this](std::vector& sizeVec) { rsSizes(sizeVec); }); + } -void rsSizes(std::vector& sizeVec) { sizeVec = rsFreeEntries; } + void rsSizes(std::vector& sizeVec) const { + sizeVec = rsFreeEntries; + } -// Representation of the A64FX port layout -const std::vector> portArrangement = {{0}, {1}, {2}, {3}, - {4}, {5}, {6}, {7}}; + protected: + // Representation of the A64FX reservation station layout + std::vector rsFreeEntries = {20, 20, 10, 10, 19}; + // Representation of the A64FX port layout + const std::vector> portArrangement = { + {0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}}; + + A64FXPortAllocator portAllocator; +}; // Tests correct allocation for RSE0/RSE1/BR attribute groups -TEST(A64FXPortAllocatorTest, singlePortAllocation) { - auto portAllocator = A64FXPortAllocator(portArrangement); - portAllocator.setRSSizeGetter( - [](std::vector& sizeVec) { rsSizes(sizeVec); }); +TEST_F(A64FXPortAllocatorTest, singlePortAllocation) { // Allocate in blocks of 4 to simulate dispatch width of 4 and test dispatch // slot logic @@ -57,10 +67,7 @@ TEST(A64FXPortAllocatorTest, singlePortAllocation) { } // Tests correct allocation when for RSX -TEST(A64FXPortAllocatorTest, RSX) { - auto portAllocator = A64FXPortAllocator(portArrangement); - portAllocator.setRSSizeGetter( - [](std::vector& sizeVec) { rsSizes(sizeVec); }); +TEST_F(A64FXPortAllocatorTest, RSX) { rsFreeEntries = {10, 10, 10, 10, 19}; portAllocator.tick(); EXPECT_EQ(portAllocator.allocate({2, 4, 5, 6}), 2); @@ -83,10 +90,7 @@ TEST(A64FXPortAllocatorTest, RSX) { } // Tests correct allocation when for RSE/RSA -TEST(A64FXPortAllocatorTest, RSEA) { - auto portAllocator = A64FXPortAllocator(portArrangement); - portAllocator.setRSSizeGetter( - [](std::vector& sizeVec) { rsSizes(sizeVec); }); +TEST_F(A64FXPortAllocatorTest, RSEA) { rsFreeEntries = {20, 20, 10, 10, 19}; // RSE portAllocator.tick(); @@ -120,10 +124,7 @@ TEST(A64FXPortAllocatorTest, RSEA) { } // Test correct allocation for Table 1 condition -TEST(A64FXPortAllocator, table1) { - auto portAllocator = A64FXPortAllocator(portArrangement); - portAllocator.setRSSizeGetter( - [](std::vector& sizeVec) { rsSizes(sizeVec); }); +TEST_F(A64FXPortAllocatorTest, table1) { rsFreeEntries = {20, 0, 0, 0, 19}; portAllocator.tick(); EXPECT_EQ(portAllocator.allocate({2, 4, 5, 6}), 2); @@ -146,10 +147,7 @@ TEST(A64FXPortAllocator, table1) { } // Test correct allocation for Table 2 condition -TEST(A64FXPortAllocator, table2) { - auto portAllocator = A64FXPortAllocator(portArrangement); - portAllocator.setRSSizeGetter( - [](std::vector& sizeVec) { rsSizes(sizeVec); 
}); +TEST_F(A64FXPortAllocatorTest, table2) { rsFreeEntries = {20, 20, 0, 0, 19}; portAllocator.tick(); EXPECT_EQ(portAllocator.allocate({2, 4, 5, 6}), 2); @@ -172,10 +170,7 @@ TEST(A64FXPortAllocator, table2) { } // Test correct allocation for Table 3 condition -TEST(A64FXPortAllocator, table3) { - auto portAllocator = A64FXPortAllocator(portArrangement); - portAllocator.setRSSizeGetter( - [](std::vector& sizeVec) { rsSizes(sizeVec); }); +TEST_F(A64FXPortAllocatorTest, table3) { rsFreeEntries = {0, 0, 10, 10, 19}; portAllocator.tick(); EXPECT_EQ(portAllocator.allocate({2, 4, 5, 6}), 5); @@ -198,10 +193,7 @@ TEST(A64FXPortAllocator, table3) { } // Test correct allocation for Table 5 condition -TEST(A64FXPortAllocator, table5) { - auto portAllocator = A64FXPortAllocator(portArrangement); - portAllocator.setRSSizeGetter( - [](std::vector& sizeVec) { rsSizes(sizeVec); }); +TEST_F(A64FXPortAllocatorTest, table5) { rsFreeEntries = {9, 9, 10, 9, 19}; portAllocator.tick(); EXPECT_EQ(portAllocator.allocate({2, 4, 5, 6}), 5); @@ -223,11 +215,8 @@ TEST(A64FXPortAllocator, table5) { rsFreeEntries[1]--; } -// Test correct allocation for Table 6 condition -TEST(A64FXPortAllocator, table6) { - auto portAllocator = A64FXPortAllocator(portArrangement); - portAllocator.setRSSizeGetter( - [](std::vector& sizeVec) { rsSizes(sizeVec); }); +// Test correct allocation for Table 6 condition +TEST_F(A64FXPortAllocatorTest, table6) { rsFreeEntries = {20, 0, 10, 0, 19}; portAllocator.tick(); EXPECT_EQ(portAllocator.allocate({2, 4}), 2); @@ -250,14 +239,11 @@ TEST(A64FXPortAllocator, table6) { } // Test adherence to the dispatch slot logic -TEST(A64FXPortAllocator, dispatchSlots) { - auto portAllocator = A64FXPortAllocator(portArrangement); - portAllocator.setRSSizeGetter( - [](std::vector& sizeVec) { rsSizes(sizeVec); }); +TEST_F(A64FXPortAllocatorTest, dispatchSlots) { rsFreeEntries = {10, 10, 10, 10, 19}; // With less than 4 instructions dispatched in a cycle, the next cycle should - // reset the displatchSlot to 0 and start the allocation logic at the + // reset the dispatchSlot to 0 and start the allocation logic at the // appropriate place in the mechanism portAllocator.tick(); EXPECT_EQ(portAllocator.allocate({2, 4, 5, 6}), 2); @@ -273,7 +259,7 @@ TEST(A64FXPortAllocator, dispatchSlots) { EXPECT_EQ(portAllocator.allocate({2, 4, 5, 6}), 2); rsFreeEntries[0]--; - // Dispatch slot values should be shared amoungst all instruction attribute + // Dispatch slot values should be shared amongst all instruction attribute // dispatch mechanisms rsFreeEntries = {10, 10, 10, 10, 19}; portAllocator.tick(); diff --git a/test/unit/pipeline/DecodeUnitTest.cc b/test/unit/pipeline/DecodeUnitTest.cc index 71062f35bc..bd89f3c291 100644 --- a/test/unit/pipeline/DecodeUnitTest.cc +++ b/test/unit/pipeline/DecodeUnitTest.cc @@ -22,6 +22,8 @@ class PipelineDecodeUnitTest : public testing::Test { decodeUnit(input, output, predictor), uop(new MockInstruction), uopPtr(uop), + uop2(new MockInstruction), + uop2Ptr(uop2), sourceRegisters({{0, 0}}) {} protected: @@ -33,6 +35,8 @@ class PipelineDecodeUnitTest : public testing::Test { MockInstruction* uop; std::shared_ptr uopPtr; + MockInstruction* uop2; + std::shared_ptr uop2Ptr; std::vector sourceRegisters; }; @@ -49,9 +53,6 @@ TEST_F(PipelineDecodeUnitTest, TickEmpty) { TEST_F(PipelineDecodeUnitTest, Tick) { input.getHeadSlots()[0] = {uopPtr}; - EXPECT_CALL(*uop, checkEarlyBranchMisprediction()) - .WillOnce(Return(std::tuple(false, 0))); - decodeUnit.tick(); // Check result uop is the same 
as the one provided @@ -60,30 +61,26 @@ TEST_F(PipelineDecodeUnitTest, Tick) { // Check no flush was requested EXPECT_EQ(decodeUnit.shouldFlush(), false); + EXPECT_EQ(decodeUnit.getEarlyFlushes(), 0); } -// Tests that the decode unit requests a flush when a non-branch is mispredicted -TEST_F(PipelineDecodeUnitTest, Flush) { - input.getHeadSlots()[0] = {uopPtr}; - - uop->setInstructionAddress(2); - - // Return branch type as unconditional by default - ON_CALL(*uop, getBranchType()) - .WillByDefault(Return(BranchType::Unconditional)); +// Tests that PurgeFlushed empties the microOps queue +TEST_F(PipelineDecodeUnitTest, purgeFlushed) { + input.getHeadSlots()[0] = {uopPtr, uop2Ptr}; - EXPECT_CALL(*uop, checkEarlyBranchMisprediction()) - .WillOnce(Return(std::tuple(true, 1))); - EXPECT_CALL(*uop, isBranch()).WillOnce(Return(false)); + decodeUnit.tick(); + EXPECT_EQ(output.getTailSlots()[0].get(), uop); + EXPECT_EQ(input.getHeadSlots()[0].size(), 0); - // Check the predictor is updated with the correct instruction address and PC - EXPECT_CALL(predictor, update(2, false, 1, BranchType::Unconditional)); + // Clear micro-ops queue + decodeUnit.purgeFlushed(); + // Swap output head and tail + output.tick(); decodeUnit.tick(); - - // Check that a flush was correctly requested - EXPECT_EQ(decodeUnit.shouldFlush(), true); - EXPECT_EQ(decodeUnit.getFlushAddress(), 1); + EXPECT_EQ(output.getTailSlots()[0], nullptr); + EXPECT_EQ(output.getHeadSlots()[0].get(), uop); + EXPECT_EQ(input.getHeadSlots()[0].size(), 0); } } // namespace pipeline diff --git a/test/unit/pipeline/DispatchIssueUnitTest.cc b/test/unit/pipeline/DispatchIssueUnitTest.cc new file mode 100644 index 0000000000..f7ecb2b9b6 --- /dev/null +++ b/test/unit/pipeline/DispatchIssueUnitTest.cc @@ -0,0 +1,573 @@ +#include "../ConfigInit.hh" +#include "../MockInstruction.hh" +#include "../MockPortAllocator.hh" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "simeng/pipeline/DispatchIssueUnit.hh" +#include "simeng/version.hh" + +namespace simeng { +namespace pipeline { + +using ::testing::_; +using ::testing::Return; +using ::testing::ReturnRef; + +class PipelineDispatchIssueUnitTest : public testing::Test { + public: + PipelineDispatchIssueUnitTest() + : regFile(physRegStruct), + input(1, nullptr), + output(config::SimInfo::getConfig()["Execution-Units"].num_children(), + {1, nullptr}), + diUnit(input, output, regFile, portAlloc, physRegQuants), + uop(new MockInstruction), + uopPtr(uop), + uop2(new MockInstruction), + uop2Ptr(uop2) {} + + protected: + // More complex model used to enable better testing of the DispatchIssueUnit + // class. 
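// Reading aid for the fixture that follows: the inline YAML handed to
// ConfigInit declares five reservation stations whose Size fields are
// 20, 20, 10, 10 and 19. The fixture's refRsSizes member mirrors exactly those
// values, so every getRSSizes() comparison in this file is ultimately a check
// against that configuration. (This restates fixture data only; the element
// type of the vector is assumed to match whatever
// DispatchIssueUnit::getRSSizes() fills.)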
+ ConfigInit configInit = ConfigInit(config::ISA::AArch64, R"YAML({ + Ports: { + '0': {Portname: Port 0, Instruction-Group-Support: [FP, SVE]}, + '1': {Portname: Port 1, Instruction-Group-Support: [PREDICATE]}, + '2': {Portname: Port 2, Instruction-Group-Support: [INT_SIMPLE, INT_MUL, STORE_DATA]}, + '3': {Portname: Port 3, Instruction-Group-Support: [FP_SIMPLE, FP_MUL, SVE_SIMPLE, SVE_MUL]}, + '4': {Portname: Port 4, Instruction-Group-Support: [INT_SIMPLE, INT_DIV_OR_SQRT]}, + '5': {Portname: Port 5, Instruction-Group-Support: [LOAD, STORE_ADDRESS, INT_SIMPLE_ARTH_NOSHIFT, INT_SIMPLE_LOGICAL_NOSHIFT, INT_SIMPLE_CMP]}, + '6': {Portname: Port 6, Instruction-Group-Support: [LOAD, STORE_ADDRESS, INT_SIMPLE_ARTH_NOSHIFT, INT_SIMPLE_LOGICAL_NOSHIFT, INT_SIMPLE_CMP]}, + '7': {Portname: Port 7, Instruction-Group-Support: [BRANCH]} + }, + Reservation-Stations: { + '0': {Size: 20, Dispatch-Rate: 2, Ports: [Port 0, Port 1, Port 2]}, + '1': {Size: 20, Dispatch-Rate: 2, Ports: [Port 3, Port 4]}, + '2': {Size: 10, Dispatch-Rate: 1, Ports: [Port 5]}, + '3': {Size: 10, Dispatch-Rate: 1, Ports: [Port 6]}, + '4': {Size: 19, Dispatch-Rate: 1, Ports: [Port 7]} + }, + Execution-Units: { + '0': {Pipelined: True, Blocking-Groups: [INT_DIV_OR_SQRT, FP_DIV_OR_SQRT, SVE_DIV_OR_SQRT]}, + '1': {Pipelined: True, Blocking-Groups: [INT_DIV_OR_SQRT, FP_DIV_OR_SQRT, SVE_DIV_OR_SQRT]}, + '2': {Pipelined: True, Blocking-Groups: [INT_DIV_OR_SQRT, FP_DIV_OR_SQRT, SVE_DIV_OR_SQRT]}, + '3': {Pipelined: True, Blocking-Groups: [INT_DIV_OR_SQRT, FP_DIV_OR_SQRT, SVE_DIV_OR_SQRT]}, + '4': {Pipelined: True, Blocking-Groups: [INT_DIV_OR_SQRT, FP_DIV_OR_SQRT, SVE_DIV_OR_SQRT]}, + '5': {Pipelined: True, Blocking-Groups: [INT_DIV_OR_SQRT, FP_DIV_OR_SQRT, SVE_DIV_OR_SQRT]}, + '6': {Pipelined: True, Blocking-Groups: [INT_DIV_OR_SQRT, FP_DIV_OR_SQRT, SVE_DIV_OR_SQRT]}, + '7': {Pipelined: True, Blocking-Groups: [INT_DIV_OR_SQRT, FP_DIV_OR_SQRT, SVE_DIV_OR_SQRT]} + } + })YAML"); + + // Using AArch64 as basis: {GP, FP/SVE, PRED, COND, SYS, SME} + const std::vector physRegQuants = {96, 128, 48, 128, 64, 64}; + const std::vector physRegStruct = { + {8, physRegQuants[0]}, {256, physRegQuants[1]}, {32, physRegQuants[2]}, + {1, physRegQuants[3]}, {8, physRegQuants[4]}, {256, physRegQuants[5]}}; + RegisterFileSet regFile; + + PipelineBuffer> input; + std::vector>> output; + + MockPortAllocator portAlloc; + + simeng::pipeline::DispatchIssueUnit diUnit; + + MockInstruction* uop; + std::shared_ptr uopPtr; + MockInstruction* uop2; + std::shared_ptr uop2Ptr; + + // As per a64fx.yaml + const uint16_t EAGA = 5; // Maps to RS index 2 + const uint8_t RS_EAGA = 2; // RS associated with EAGA in A64FX + const std::vector refRsSizes = {20, 20, 10, 10, 19}; + + const Register r0 = {0, 0}; + const Register r1 = {0, 1}; + const Register r2 = {0, 2}; +}; + +// No instruction issued due to empty input buffer +TEST_F(PipelineDispatchIssueUnitTest, emptyTick) { + // Ensure empty Reservation stations pre tick() + std::vector rsSizes; + diUnit.getRSSizes(rsSizes); + EXPECT_EQ(rsSizes, refRsSizes); + + diUnit.tick(); + // Post tick(), ensure RS sizes are still the same + no RS stalls + rsSizes.clear(); + diUnit.getRSSizes(rsSizes); + EXPECT_EQ(rsSizes, refRsSizes); + EXPECT_EQ(diUnit.getRSStalls(), 0); + + diUnit.issue(); + // Post issue(), ensure Reservation stations are empty + rsSizes.clear(); + diUnit.getRSSizes(rsSizes); + EXPECT_EQ(rsSizes, refRsSizes); + // Post issue(), ensure output buffers are empty + for (size_t i = 0; i < output.size(); i++) { + 
EXPECT_EQ(output[i].getTailSlots()[0], nullptr); + } + // Post issue(), ensure only front-end stall recorded + EXPECT_EQ(diUnit.getFrontendStalls(), 1); + EXPECT_EQ(diUnit.getBackendStalls(), 0); + EXPECT_EQ(diUnit.getPortBusyStalls(), 0); +} + +// Single instruction has no exception, 2 source operands (both ready), 1 +// destination operand +TEST_F(PipelineDispatchIssueUnitTest, singleInstr) { + // Set-up source & destination registers and ports for this instruction + std::array srcRegs = {r1, r2}; + std::array destRegs = {r0}; + const std::vector suppPorts = {EAGA}; + + // All expected calls to instruction during tick() + EXPECT_CALL(*uop, getSupportedPorts()).WillOnce(ReturnRef(suppPorts)); + uop->setExceptionEncountered(false); + EXPECT_CALL(*uop, getSourceRegisters()) + .WillOnce(Return(span(srcRegs))); + EXPECT_CALL(*uop, isOperandReady(0)).WillOnce(Return(false)); + EXPECT_CALL(*uop, supplyOperand(0, RegisterValue(0, 8))); + EXPECT_CALL(*uop, isOperandReady(1)).WillOnce(Return(false)); + EXPECT_CALL(*uop, supplyOperand(1, RegisterValue(0, 8))); + EXPECT_CALL(*uop, getDestinationRegisters()) + .WillOnce(Return(span(destRegs))); + + // Expected call to port allocator during tick() + EXPECT_CALL(portAlloc, allocate(suppPorts)).WillOnce(Return(EAGA)); + + // Ensure empty reservation stations pre tick() + std::vector rsSizes; + diUnit.getRSSizes(rsSizes); + EXPECT_EQ(rsSizes, refRsSizes); + + input.getHeadSlots()[0] = uopPtr; + diUnit.tick(); + // Ensure post tick that EAGA's reservation station size has decreased by 1 + rsSizes.clear(); + diUnit.getRSSizes(rsSizes); + EXPECT_EQ(rsSizes.size(), refRsSizes.size()); + EXPECT_EQ(rsSizes[RS_EAGA], refRsSizes[RS_EAGA] - 1); + // Ensure no stalls recorded in tick() + EXPECT_EQ(diUnit.getFrontendStalls(), 0); + EXPECT_EQ(diUnit.getBackendStalls(), 0); + EXPECT_EQ(diUnit.getPortBusyStalls(), 0); + EXPECT_EQ(diUnit.getRSStalls(), 0); + // Ensure empty output buffers post tick() + for (size_t i = 0; i < output.size(); i++) { + EXPECT_EQ(output[i].getTailSlots()[0], nullptr); + } + + // Detail expected call to port allocator during tick() + EXPECT_CALL(portAlloc, issued(EAGA)); + + diUnit.issue(); + // Ensure all reservation stations empty again post issue() + rsSizes.clear(); + diUnit.getRSSizes(rsSizes); + EXPECT_EQ(rsSizes, refRsSizes); + // Ensure no stalls recorded during issue() + EXPECT_EQ(diUnit.getFrontendStalls(), 0); + EXPECT_EQ(diUnit.getBackendStalls(), 0); + EXPECT_EQ(diUnit.getPortBusyStalls(), 0); + EXPECT_EQ(diUnit.getRSStalls(), 0); + // Ensure all output buffers are empty, except the one associated with EAGA + // port which contains the uop + for (size_t i = 0; i < output.size(); i++) { + if (i != EAGA) + EXPECT_EQ(output[i].getTailSlots()[0], nullptr); + else + EXPECT_EQ(output[i].getTailSlots()[0].get(), uop); + } +} + +// Single instruction with exception +TEST_F(PipelineDispatchIssueUnitTest, singleInstr_exception) { + // Setup supported port instruction can use + const std::vector suppPorts = {EAGA}; + + // All expected calls to instruction during tick() + EXPECT_CALL(*uop, getSupportedPorts()).WillOnce(ReturnRef(suppPorts)); + uop->setExceptionEncountered(true); + + input.getHeadSlots()[0] = uopPtr; + diUnit.tick(); + // Check that instruction has encountered an exception and that it is ready to + // commit + EXPECT_TRUE(uop->canCommit()); + EXPECT_TRUE(uop->exceptionEncountered()); + // Ensure all reservation stations are empty post tick() + std::vector rsSizes; + diUnit.getRSSizes(rsSizes); + EXPECT_EQ(rsSizes, 
refRsSizes); + // Ensure input buffer has been emptied + EXPECT_EQ(input.getHeadSlots()[0], nullptr); + + // Perform issue() + diUnit.issue(); + // Ensure RS still empty post issue() + rsSizes.clear(); + diUnit.getRSSizes(rsSizes); + EXPECT_EQ(rsSizes, refRsSizes); + // Ensure all output ports are empty + for (size_t i = 0; i < output.size(); i++) { + EXPECT_EQ(output[i].getTailSlots()[0], nullptr); + } + // Ensure frontend stall recorded + EXPECT_EQ(diUnit.getFrontendStalls(), 1); + EXPECT_EQ(diUnit.getBackendStalls(), 0); + EXPECT_EQ(diUnit.getPortBusyStalls(), 0); + EXPECT_EQ(diUnit.getRSStalls(), 0); +} + +// Single instruction that can't be issued in 1 cycle as RS is full +TEST_F(PipelineDispatchIssueUnitTest, singleInstr_rsFull) { + // Setup supported port instructions can use + const std::vector suppPorts = {EAGA}; + + // Artificially fill Reservation station with index 2 + std::vector> insns(refRsSizes[RS_EAGA]); + for (size_t i = 0; i < insns.size(); i++) { + // Initialise instruction + insns[i] = std::make_shared(); + // All expected calls to instruction during tick() + EXPECT_CALL(*insns[i].get(), getSupportedPorts()) + .WillOnce(ReturnRef(suppPorts)); + EXPECT_CALL(*insns[i].get(), getSourceRegisters()) + .WillOnce(Return(span())); + EXPECT_CALL(*insns[i].get(), getDestinationRegisters()) + .WillOnce(Return(span())); + // Expected call to port allocator during tick() + EXPECT_CALL(portAlloc, allocate(suppPorts)).WillOnce(Return(EAGA)); + + input.getHeadSlots()[0] = insns[i]; + diUnit.tick(); + } + // Ensure Reservation station index 2 is full post tick, and all others are + // empty + std::vector rsSizes; + diUnit.getRSSizes(rsSizes); + EXPECT_EQ(rsSizes.size(), refRsSizes.size()); + for (size_t i = 0; i < refRsSizes.size(); i++) { + if (i != RS_EAGA) { + EXPECT_EQ(rsSizes[i], refRsSizes[i]); + } else { + EXPECT_EQ(rsSizes[i], 0); + EXPECT_NE(rsSizes[i], refRsSizes[i]); + } + } + // Ensure no stalls recorded in tick() + EXPECT_EQ(diUnit.getFrontendStalls(), 0); + EXPECT_EQ(diUnit.getBackendStalls(), 0); + EXPECT_EQ(diUnit.getPortBusyStalls(), 0); + EXPECT_EQ(diUnit.getRSStalls(), 0); + + // Submit new instruction to same port + // All expected calls to instruction during tick() + EXPECT_CALL(*uop, getSupportedPorts()).WillOnce(ReturnRef(suppPorts)); + // All expected calls to portAllocator during tick() + EXPECT_CALL(portAlloc, allocate(_)).Times(0); + input.getHeadSlots()[0] = uopPtr; + diUnit.tick(); + // Ensure Reservation station sizes have stayed the same + rsSizes.clear(); + diUnit.getRSSizes(rsSizes); + EXPECT_EQ(rsSizes.size(), refRsSizes.size()); + for (size_t i = 0; i < refRsSizes.size(); i++) { + if (i != RS_EAGA) { + EXPECT_EQ(rsSizes[i], refRsSizes[i]); + } else { + EXPECT_EQ(rsSizes[i], 0); + EXPECT_NE(rsSizes[i], refRsSizes[i]); + } + } + // Check input pipelineBuffer stalled + EXPECT_TRUE(input.isStalled()); + // Ensure one rsStall recorded in tick() + EXPECT_EQ(diUnit.getFrontendStalls(), 0); + EXPECT_EQ(diUnit.getBackendStalls(), 0); + EXPECT_EQ(diUnit.getPortBusyStalls(), 0); + EXPECT_EQ(diUnit.getRSStalls(), 1); +} + +// Single instruction not issued in 1 cycle as port is stalled +TEST_F(PipelineDispatchIssueUnitTest, singleInstr_portStall) { + // Setup supported port instructions can use + const std::vector suppPorts = {EAGA}; + + // Submit new instruction to a port + // All expected calls to instruction during tick() + EXPECT_CALL(*uop, getSupportedPorts()).WillOnce(ReturnRef(suppPorts)); + uop->setExceptionEncountered(false); + EXPECT_CALL(*uop, 
getSourceRegisters()).WillOnce(Return(span())); + EXPECT_CALL(*uop, getDestinationRegisters()) + .WillOnce(Return(span())); + // Expected call to portAllocator during tick() + EXPECT_CALL(portAlloc, allocate(suppPorts)).WillOnce(Return(EAGA)); + + input.getHeadSlots()[0] = uopPtr; + diUnit.tick(); + + // Ensure correct RS sizes post tick() + std::vector rsSizes; + diUnit.getRSSizes(rsSizes); + EXPECT_EQ(rsSizes.size(), refRsSizes.size()); + for (size_t i = 0; i < refRsSizes.size(); i++) { + if (i != RS_EAGA) { + EXPECT_EQ(rsSizes[i], refRsSizes[i]); + } else { + EXPECT_EQ(rsSizes[i], refRsSizes[i] - 1); + } + } + // Ensure no stalls recorded in tick() + EXPECT_EQ(diUnit.getFrontendStalls(), 0); + EXPECT_EQ(diUnit.getBackendStalls(), 0); + EXPECT_EQ(diUnit.getPortBusyStalls(), 0); + EXPECT_EQ(diUnit.getRSStalls(), 0); + + // Stall issue port + output[EAGA].stall(true); + + // Perform issue() + diUnit.issue(); + // Ensure correct RS sizes post issue() + rsSizes.clear(); + diUnit.getRSSizes(rsSizes); + EXPECT_EQ(rsSizes.size(), refRsSizes.size()); + for (size_t i = 0; i < refRsSizes.size(); i++) { + if (i != RS_EAGA) { + EXPECT_EQ(rsSizes[i], refRsSizes[i]); + } else { + EXPECT_EQ(rsSizes[i], refRsSizes[i] - 1); + } + } + // Ensure all output ports are empty + for (size_t i = 0; i < output.size(); i++) { + EXPECT_EQ(output[i].getTailSlots()[0], nullptr); + } + // Ensure portBusyStall and backend stall recorded in issue() + EXPECT_EQ(diUnit.getFrontendStalls(), 0); + EXPECT_EQ(diUnit.getBackendStalls(), 1); + EXPECT_EQ(diUnit.getPortBusyStalls(), 1); + EXPECT_EQ(diUnit.getRSStalls(), 0); +} + +// Try to dispatch two instructions with a RAW hazard after renaming; the second should +// not be issued as it is dependent on the first. Use forwardOperands() to resolve +// the dependency. 
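// Condensed sketch of what the forwardOperands() call in the test below is
// relied upon to do: every queued consumer of a forwarded register receives
// the forwarded value, after which it becomes issueable. `PendingOperand` and
// `waiting` are illustrative names only, not the DispatchIssueUnit's real
// dependency-matrix layout.
struct PendingOperand {
  std::shared_ptr<Instruction> insn;  // consumer still held in a reservation station
  uint16_t operandIndex;              // which source slot the value fills
  Register reg;                       // renamed register being waited on
};
void forwardOperandSketch(std::vector<PendingOperand>& waiting,
                          const Register& reg, const RegisterValue& value) {
  for (auto& entry : waiting) {
    if (entry.reg == reg) {
      entry.insn->supplyOperand(entry.operandIndex, value);
    }
  }
}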
+TEST_F(PipelineDispatchIssueUnitTest, createdependency_raw) { + // Set-up source & destination registers and ports for the instructions + std::array srcRegs_1 = {}; + std::array destRegs_1 = {r0}; + std::array srcRegs_2 = {r0}; + std::array destRegs_2 = {r1}; + const std::vector suppPorts = {EAGA}; + + // All expected calls to instruction 1 during tick() + EXPECT_CALL(*uop, getSupportedPorts()).WillOnce(ReturnRef(suppPorts)); + uop->setExceptionEncountered(false); + EXPECT_CALL(*uop, getSourceRegisters()) + .WillOnce(Return(span(srcRegs_1))); + EXPECT_CALL(*uop, isOperandReady(0)).WillOnce(Return(false)); + EXPECT_CALL(*uop, supplyOperand(0, RegisterValue(0, 8))); + EXPECT_CALL(*uop, getDestinationRegisters()) + .WillOnce(Return(span(destRegs_1))); + // Expected call to port allocator during tick() + EXPECT_CALL(portAlloc, allocate(suppPorts)).WillOnce(Return(EAGA)); + EXPECT_CALL(portAlloc, issued(EAGA)); + + // Process instruction 1 + input.getHeadSlots()[0] = uopPtr; + diUnit.tick(); + diUnit.issue(); + EXPECT_EQ(output[EAGA].getTailSlots()[0], uopPtr); + output[EAGA].tick(); + + // All expected calls to instruction 2 during tick() + EXPECT_CALL(*uop2, getSupportedPorts()).WillOnce(ReturnRef(suppPorts)); + uop->setExceptionEncountered(false); + EXPECT_CALL(*uop2, getSourceRegisters()) + .WillOnce(Return(span(srcRegs_2))); + EXPECT_CALL(*uop2, isOperandReady(0)).WillOnce(Return(false)); + EXPECT_CALL(*uop2, getDestinationRegisters()) + .WillOnce(Return(span(destRegs_2))); + // Expected call to port allocator during tick() + EXPECT_CALL(portAlloc, allocate(suppPorts)).WillOnce(Return(EAGA)); + + // Process instruction 2 + input.getHeadSlots()[0] = uop2Ptr; + diUnit.tick(); + diUnit.issue(); + // Ensure correct RS sizes post tick() & issue() + std::vector rsSizes; + diUnit.getRSSizes(rsSizes); + EXPECT_EQ(rsSizes.size(), refRsSizes.size()); + for (size_t i = 0; i < refRsSizes.size(); i++) { + if (i != RS_EAGA) { + EXPECT_EQ(rsSizes[i], refRsSizes[i]); + } else { + EXPECT_EQ(rsSizes[i], refRsSizes[i] - 1); + } + } + // Ensure all output ports are empty + for (size_t i = 0; i < output.size(); i++) { + EXPECT_EQ(output[i].getTailSlots()[0], nullptr); + } + // Ensure backend stall recorded in issue() + EXPECT_EQ(diUnit.getFrontendStalls(), 0); + EXPECT_EQ(diUnit.getBackendStalls(), 1); + EXPECT_EQ(diUnit.getPortBusyStalls(), 0); + EXPECT_EQ(diUnit.getRSStalls(), 0); + + // Forward operand for register r0 + std::array vals = {RegisterValue(6)}; + EXPECT_CALL(*uop2, supplyOperand(0, vals[0])); + EXPECT_CALL(*uop2, canExecute()).WillOnce(Return(true)); + diUnit.forwardOperands(span(srcRegs_2), vals); + + // Try issue again for instruction 2 + EXPECT_CALL(portAlloc, issued(EAGA)); + diUnit.issue(); + // Ensure correct RS sizes post issue() + rsSizes.clear(); + diUnit.getRSSizes(rsSizes); + EXPECT_EQ(rsSizes, refRsSizes); + // Ensure all output ports are empty except EAGA + for (size_t i = 0; i < output.size(); i++) { + if (i != EAGA) + EXPECT_EQ(output[i].getTailSlots()[0], nullptr); + else + EXPECT_EQ(output[i].getTailSlots()[0], uop2Ptr); + } + // Ensure no further stalls recorded in issue() + EXPECT_EQ(diUnit.getFrontendStalls(), 0); + EXPECT_EQ(diUnit.getBackendStalls(), 1); + EXPECT_EQ(diUnit.getPortBusyStalls(), 0); + EXPECT_EQ(diUnit.getRSStalls(), 0); +} + +// Ensure correct instructions are flushed from reservation stations and the +// dependency matrix +TEST_F(PipelineDispatchIssueUnitTest, purgeFlushed) { + // Set-up source & destination registers and ports for the instructions; + // 
creating a dependency + std::array srcRegs_1 = {}; + std::array destRegs_1 = {r0}; + std::array srcRegs_2 = {r0}; + std::array destRegs_2 = {r1}; + const std::vector suppPorts = {EAGA}; + + // All expected calls to instruction 1 during tick() + EXPECT_CALL(*uop, getSupportedPorts()).WillOnce(ReturnRef(suppPorts)); + uop->setExceptionEncountered(false); + EXPECT_CALL(*uop, getSourceRegisters()) + .WillOnce(Return(span(srcRegs_1))); + EXPECT_CALL(*uop, isOperandReady(0)).WillOnce(Return(false)); + EXPECT_CALL(*uop, supplyOperand(0, RegisterValue(0, 8))); + EXPECT_CALL(*uop, getDestinationRegisters()) + .WillOnce(Return(span(destRegs_1))); + // Expected call to port allocator during tick() + EXPECT_CALL(portAlloc, allocate(suppPorts)).WillOnce(Return(EAGA)); + + // Process instruction 1 + input.getHeadSlots()[0] = uopPtr; + diUnit.tick(); + EXPECT_EQ(input.getHeadSlots()[0], nullptr); + + // All expected calls to instruction 2 during tick() + EXPECT_CALL(*uop2, getSupportedPorts()).WillOnce(ReturnRef(suppPorts)); + uop->setExceptionEncountered(false); + EXPECT_CALL(*uop2, getSourceRegisters()) + .WillOnce(Return(span(srcRegs_2))); + EXPECT_CALL(*uop2, isOperandReady(0)).WillOnce(Return(false)); + EXPECT_CALL(*uop2, getDestinationRegisters()) + .WillOnce(Return(span(destRegs_2))); + // Expected call to port allocator during tick() + EXPECT_CALL(portAlloc, allocate(suppPorts)).WillOnce(Return(EAGA)); + + // Process instruction 2 + input.getHeadSlots()[0] = uop2Ptr; + diUnit.tick(); + EXPECT_EQ(input.getHeadSlots()[0], nullptr); + + // Ensure correct RS sizes post tick() + std::vector rsSizes; + diUnit.getRSSizes(rsSizes); + EXPECT_EQ(rsSizes.size(), refRsSizes.size()); + for (size_t i = 0; i < refRsSizes.size(); i++) { + if (i != RS_EAGA) { + EXPECT_EQ(rsSizes[i], refRsSizes[i]); + } else { + EXPECT_EQ(rsSizes[i], refRsSizes[i] - 2); + } + } + // Ensure all output ports are empty + for (size_t i = 0; i < output.size(); i++) { + EXPECT_EQ(output[i].getTailSlots()[0], nullptr); + } + // Ensure no stalls recorded + EXPECT_EQ(diUnit.getFrontendStalls(), 0); + EXPECT_EQ(diUnit.getBackendStalls(), 0); + EXPECT_EQ(diUnit.getPortBusyStalls(), 0); + EXPECT_EQ(diUnit.getRSStalls(), 0); + + // Remove flushed uops + EXPECT_CALL(portAlloc, deallocate(EAGA)).Times(2); + uopPtr->setFlushed(); + uop2Ptr->setFlushed(); + diUnit.purgeFlushed(); + + // Check reservation station sizes + rsSizes.clear(); + diUnit.getRSSizes(rsSizes); + EXPECT_EQ(rsSizes, refRsSizes); + + // Perform issue to see if `uop` is still present + diUnit.issue(); + // Ensure all output ports are empty + for (size_t i = 0; i < output.size(); i++) { + EXPECT_EQ(output[i].getTailSlots()[0], nullptr); + } + // Ensure frontend stall recorded in issue() + EXPECT_EQ(diUnit.getFrontendStalls(), 1); + EXPECT_EQ(diUnit.getBackendStalls(), 0); + EXPECT_EQ(diUnit.getPortBusyStalls(), 0); + EXPECT_EQ(diUnit.getRSStalls(), 0); + + // Call forwardOperand() and issue() to release `uop2` (if it were still + // present) + std::array vals = {RegisterValue(6)}; + diUnit.forwardOperands(span(srcRegs_2), vals); + // Check reservation station sizes + rsSizes.clear(); + diUnit.getRSSizes(rsSizes); + EXPECT_EQ(rsSizes, refRsSizes); + + diUnit.issue(); + // Ensure all output ports are empty + for (size_t i = 0; i < output.size(); i++) { + EXPECT_EQ(output[i].getTailSlots()[0], nullptr); + } + // Ensure frontend stall recorded in issue() + EXPECT_EQ(diUnit.getFrontendStalls(), 2); + EXPECT_EQ(diUnit.getBackendStalls(), 0); + EXPECT_EQ(diUnit.getPortBusyStalls(), 0); 
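// Condensed sketch of the purgeFlushed() behaviour this test is exercising:
// any reservation-station entry whose instruction has been marked flushed is
// dropped and its allocated port handed back, which is why two
// deallocate(EAGA) calls are expected above. `RSEntry` and `entries` are
// illustrative names, not SimEng's internal layout; the PortAllocator
// interface is assumed from the mock used in this fixture.
struct RSEntry {
  std::shared_ptr<Instruction> insn;
  uint16_t port;
};
void purgeFlushedSketch(std::vector<RSEntry>& entries,
                        PortAllocator& portAllocator) {
  for (auto it = entries.begin(); it != entries.end();) {
    if (it->insn->isFlushed()) {
      portAllocator.deallocate(it->port);
      it = entries.erase(it);
    } else {
      ++it;
    }
  }
}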
+ EXPECT_EQ(diUnit.getRSStalls(), 0); +} + +// Test based on a64fx config file reservation station configuration +TEST_F(PipelineDispatchIssueUnitTest, getRSSizes) { + std::vector rsSizes; + diUnit.getRSSizes(rsSizes); + EXPECT_EQ(rsSizes, refRsSizes); +} + +} // namespace pipeline +} // namespace simeng + +// tick +// issue \ No newline at end of file diff --git a/test/unit/pipeline/ExecuteUnitTest.cc b/test/unit/pipeline/ExecuteUnitTest.cc index eb130f53ad..3665628b8c 100644 --- a/test/unit/pipeline/ExecuteUnitTest.cc +++ b/test/unit/pipeline/ExecuteUnitTest.cc @@ -35,7 +35,7 @@ class PipelineExecuteUnitTest : public testing::Test { [this](auto instruction) { executionHandlers.raiseException(instruction); }, - predictor, true, {3, 4, 5}), + true, {3, 4, 5}), uop(new MockInstruction), secondUop(new MockInstruction), thirdUop(new MockInstruction), @@ -56,17 +56,35 @@ class PipelineExecuteUnitTest : public testing::Test { MockInstruction* thirdUop; std::shared_ptr uopPtr; - std::shared_ptr secondUopPtr; - std::shared_ptr thirdUopPtr; + std::shared_ptr secondUopPtr; + std::shared_ptr thirdUopPtr; }; // Tests that the execution unit processes nothing if no instruction is present TEST_F(PipelineExecuteUnitTest, TickEmpty) { + EXPECT_TRUE(executeUnit.isEmpty()); executeUnit.tick(); + EXPECT_TRUE(executeUnit.isEmpty()); EXPECT_EQ(output.getTailSlots()[0], nullptr); } +// Tests that a flushed instruction is removed from the input buffer and not +// processed through the EU +TEST_F(PipelineExecuteUnitTest, flushedInputInsn) { + input.getHeadSlots()[0] = uopPtr; + + // Setup instruction + uopPtr->setFlushed(); + ON_CALL(*uop, canExecute()).WillByDefault(Return(true)); + + executeUnit.tick(); + + EXPECT_EQ(input.getHeadSlots()[0], nullptr); + EXPECT_EQ(output.getTailSlots()[0], nullptr); + EXPECT_EQ(executeUnit.getCycles(), 0); +} + // Tests that the execution unit executes an instruction and forwards the // results TEST_F(PipelineExecuteUnitTest, Execute) { @@ -117,17 +135,15 @@ TEST_F(PipelineExecuteUnitTest, ExecuteBranch) { uop->setBranchResults(taken, pc); })); - // Check that the branch predictor was updated with the results - EXPECT_CALL(*uop, getBranchType()).Times(1); - EXPECT_CALL(predictor, update(2, taken, pc, BranchType::Unconditional)) - .Times(1); - // Check that empty forwarding call is made EXPECT_CALL(executionHandlers, forwardOperands(IsEmpty(), IsEmpty())) .Times(1); executeUnit.tick(); + EXPECT_EQ(uopPtr->wasBranchMispredicted(), false); + EXPECT_EQ(uopPtr->wasBranchTaken(), taken); + EXPECT_EQ(executeUnit.shouldFlush(), false); EXPECT_EQ(output.getTailSlots()[0].get(), uop); } @@ -192,7 +208,7 @@ TEST_F(PipelineExecuteUnitTest, PipelineStall) { EXPECT_EQ(input.getHeadSlots()[0].get(), secondUop); EXPECT_EQ(output.getTailSlots()[0], nullptr); executeUnit.tick(); - EXPECT_EQ(input.getHeadSlots()[0].get(), nullptr); + EXPECT_EQ(input.getHeadSlots()[0], nullptr); EXPECT_EQ(output.getTailSlots()[0].get(), uop); } @@ -204,13 +220,14 @@ TEST_F(PipelineExecuteUnitTest, OperationStall) { uop->setLatency(5); uop->setStallCycles(5); ON_CALL(*uop, getGroup()).WillByDefault(Return(3)); + ON_CALL(*uop, canExecute()).WillByDefault(Return(true)); ON_CALL(*secondUop, getGroup()).WillByDefault(Return(4)); + ON_CALL(*secondUop, canExecute()).WillByDefault(Return(true)); ON_CALL(*thirdUop, getGroup()).WillByDefault(Return(2)); - - ON_CALL(*uop, canExecute()).WillByDefault(Return(true)); ON_CALL(*thirdUop, canExecute()).WillByDefault(Return(true)); + EXPECT_CALL(*uop, execute()).Times(1); -
EXPECT_CALL(*secondUop, execute()).Times(0); + EXPECT_CALL(*secondUop, execute()).Times(1); EXPECT_CALL(*thirdUop, execute()).Times(1); executeUnit.tick(); @@ -218,21 +235,107 @@ TEST_F(PipelineExecuteUnitTest, OperationStall) { EXPECT_EQ(output.getTailSlots()[0], nullptr); input.getHeadSlots()[0] = secondUopPtr; executeUnit.tick(); - EXPECT_EQ(input.getHeadSlots()[0].get(), nullptr); + EXPECT_EQ(input.getHeadSlots()[0], nullptr); EXPECT_EQ(output.getTailSlots()[0], nullptr); input.getHeadSlots()[0] = thirdUopPtr; executeUnit.tick(); - EXPECT_EQ(input.getHeadSlots()[0].get(), nullptr); + EXPECT_EQ(input.getHeadSlots()[0], nullptr); EXPECT_EQ(output.getTailSlots()[0], nullptr); executeUnit.tick(); - EXPECT_EQ(input.getHeadSlots()[0].get(), nullptr); + EXPECT_EQ(input.getHeadSlots()[0], nullptr); EXPECT_EQ(output.getTailSlots()[0], nullptr); executeUnit.tick(); - EXPECT_EQ(input.getHeadSlots()[0].get(), nullptr); + EXPECT_EQ(input.getHeadSlots()[0], nullptr); + EXPECT_EQ(output.getTailSlots()[0].get(), uop); + executeUnit.tick(); + EXPECT_EQ(input.getHeadSlots()[0], nullptr); + EXPECT_EQ(output.getTailSlots()[0].get(), thirdUop); + executeUnit.tick(); + EXPECT_EQ(input.getHeadSlots()[0], nullptr); + EXPECT_EQ(output.getTailSlots()[0].get(), secondUop); +} + +// Test that a mispredicted branch instruction is properly handled +TEST_F(PipelineExecuteUnitTest, mispredictedBranch) { + input.getHeadSlots()[0] = uopPtr; + + ON_CALL(*uop, canExecute()).WillByDefault(Return(true)); + // Anticipate testing instruction type; return true for branch + ON_CALL(*uop, isBranch()).WillByDefault(Return(true)); + // Return branch type as conditional by default + ON_CALL(*uop, getBranchType()).WillByDefault(Return(BranchType::Conditional)); + + const bool takenPred = false; + const bool taken = true; + const uint64_t pc = 4; + const uint64_t insnAddress = 16; + const uint64_t insnID = 5; + + uop->setInstructionAddress(insnAddress); + uop->setInstructionId(insnID); + uop->setBranchPrediction({takenPred, insnAddress + 4}); + + EXPECT_CALL(*uop, execute()).WillOnce(Invoke([&]() { + uop->setExecuted(true); + uop->setBranchResults(taken, pc); + })); + + // Check that empty forwarding call is made + EXPECT_CALL(executionHandlers, forwardOperands(IsEmpty(), IsEmpty())) + .Times(1); + + executeUnit.tick(); + + EXPECT_EQ(uopPtr->wasBranchMispredicted(), true); + EXPECT_EQ(uopPtr->wasBranchTaken(), taken); + + EXPECT_EQ(executeUnit.shouldFlush(), true); EXPECT_EQ(output.getTailSlots()[0].get(), uop); + EXPECT_EQ(executeUnit.getFlushAddress(), pc); + EXPECT_EQ(executeUnit.getFlushInsnId(), insnID); +} + +// Test that the flushing mechanism works correctly via purgeFlushed() +TEST_F(PipelineExecuteUnitTest, purgeFlushed) { + input.getHeadSlots()[0] = uopPtr; + + uop->setLatency(5); + uop->setStallCycles(5); + // Set up instructions so that only one is in the EU pipeline at a time + ON_CALL(*uop, getGroup()).WillByDefault(Return(3)); + ON_CALL(*uop, canExecute()).WillByDefault(Return(true)); + ON_CALL(*secondUop, getGroup()).WillByDefault(Return(4)); + ON_CALL(*secondUop, canExecute()).WillByDefault(Return(true)); + ON_CALL(*thirdUop, getGroup()).WillByDefault(Return(5)); + ON_CALL(*thirdUop, canExecute()).WillByDefault(Return(true)); + + EXPECT_CALL(*uop, execute()).Times(0); + EXPECT_CALL(*secondUop, execute()).Times(0); + EXPECT_CALL(*thirdUop, execute()).Times(1); + + // Stage all three instructions in EU pipeline + executeUnit.tick(); + EXPECT_EQ(input.getHeadSlots()[0], nullptr); +
EXPECT_EQ(output.getTailSlots()[0], nullptr); + input.getHeadSlots()[0] = secondUopPtr; executeUnit.tick(); - EXPECT_EQ(input.getHeadSlots()[0].get(), nullptr); + EXPECT_EQ(input.getHeadSlots()[0], nullptr); + EXPECT_EQ(output.getTailSlots()[0], nullptr); + input.getHeadSlots()[0] = thirdUopPtr; + executeUnit.tick(); + EXPECT_EQ(input.getHeadSlots()[0], nullptr); + EXPECT_EQ(output.getTailSlots()[0], nullptr); + + // Flush first two instructions + uopPtr->setFlushed(); + secondUopPtr->setFlushed(); + executeUnit.purgeFlushed(); + + // Ensure non-flushed instruction progresses through the pipeline + executeUnit.tick(); + EXPECT_EQ(input.getHeadSlots()[0], nullptr); EXPECT_EQ(output.getTailSlots()[0].get(), thirdUop); + EXPECT_TRUE(executeUnit.isEmpty()); } } // namespace pipeline diff --git a/test/unit/pipeline/FetchUnitTest.cc b/test/unit/pipeline/FetchUnitTest.cc index d2ba64512b..36898574db 100644 --- a/test/unit/pipeline/FetchUnitTest.cc +++ b/test/unit/pipeline/FetchUnitTest.cc @@ -23,49 +23,70 @@ std::map traceMap; std::list probeList; using ::testing::_; +using ::testing::AllOf; +using ::testing::AnyNumber; +using ::testing::AnyOf; +using ::testing::AtLeast; using ::testing::DoAll; using ::testing::Field; +using ::testing::Gt; +using ::testing::Lt; +using ::testing::Ne; using ::testing::Return; using ::testing::SetArgReferee; namespace simeng { namespace pipeline { -class PipelineFetchUnitTest : public testing::Test { +class PipelineFetchUnitTest + : public testing::TestWithParam> { public: PipelineFetchUnitTest() : output(1, {}), + linux(config::SimInfo::getConfig()["CPU-Info"]["Special-File-Dir-Path"] + .as()), + isa(linux), fetchBuffer({{0, 16}, 0, 0}), completedReads(&fetchBuffer, 1), - fetchUnit(output, memory, 1024, 0, 16, isa, predictor), + fetchUnit(output, memory, 1024, 0, blockSize, isa, predictor), uop(new MockInstruction), - uopPtr(uop) { + uopPtr(uop), + uop2(new MockInstruction), + uopPtr2(uop2) { uopPtr->setInstructionAddress(0); } protected: + const uint8_t insnMinSizeBytes = GetParam().first; + const uint8_t insnMaxSizeBytes = GetParam().second; + // TODO make this parameterisable and update all tests accordingly + const uint8_t blockSize = 16; + PipelineBuffer output; MockMemoryInterface memory; + kernel::Linux linux; MockArchitecture isa; MockBranchPredictor predictor; - MemoryReadResult fetchBuffer; - span completedReads; + memory::MemoryReadResult fetchBuffer; + span completedReads; FetchUnit fetchUnit; MockInstruction* uop; std::shared_ptr uopPtr; + MockInstruction* uop2; + std::shared_ptr uopPtr2; }; // Tests that ticking a fetch unit attempts to predecode from the correct // program counter and generates output correctly. 
-TEST_F(PipelineFetchUnitTest, Tick) { +TEST_P(PipelineFetchUnitTest, Tick) { MacroOp macroOp = {uopPtr}; ON_CALL(memory, getCompletedReads()).WillByDefault(Return(completedReads)); - ON_CALL(isa, getMaxInstructionSize()).WillByDefault(Return(4)); + ON_CALL(isa, getMaxInstructionSize()).WillByDefault(Return(insnMaxSizeBytes)); // Anticipate testing instruction type; return true for branch ON_CALL(*uop, isBranch()).WillByDefault(Return(true)); @@ -85,7 +106,7 @@ TEST_F(PipelineFetchUnitTest, Tick) { } // Tests that ticking a fetch unit does nothing if the output has stalled -TEST_F(PipelineFetchUnitTest, TickStalled) { +TEST_P(PipelineFetchUnitTest, TickStalled) { output.stall(true); // Anticipate testing instruction type; return true for branch @@ -103,29 +124,714 @@ TEST_F(PipelineFetchUnitTest, TickStalled) { // Tests that the fetch unit will handle instructions that straddle fetch block // boundaries by automatically requesting the next block of data. -TEST_F(PipelineFetchUnitTest, FetchUnaligned) { - MacroOp macroOp = {uopPtr}; - ON_CALL(isa, getMaxInstructionSize()).WillByDefault(Return(4)); +TEST_P(PipelineFetchUnitTest, FetchUnaligned) { + MacroOp mOp = {uopPtr}; + ON_CALL(isa, getMaxInstructionSize()).WillByDefault(Return(insnMaxSizeBytes)); + ON_CALL(isa, getMinInstructionSize()).WillByDefault(Return(insnMinSizeBytes)); ON_CALL(memory, getCompletedReads()).WillByDefault(Return(completedReads)); - // Set PC to 14, so there will not be enough data to start decoding + // Min instruction size needs to be more than 1 to set PC correctly for this + // test + EXPECT_GT(insnMinSizeBytes, 1); + uint64_t setPC = (blockSize - insnMinSizeBytes) + 1; + // Set PC so that there will not be enough data to start decoding EXPECT_CALL(isa, predecode(_, _, _, _, _)).Times(0); - fetchUnit.updatePC(14); + fetchUnit.updatePC(setPC); fetchUnit.tick(); // Expect a block starting at address 16 to be requested when we fetch again - EXPECT_CALL(memory, requestRead(Field(&MemoryAccessTarget::address, 16), _)) + EXPECT_CALL(memory, + requestRead(Field(&memory::MemoryAccessTarget::address, 16), _)) .Times(1); fetchUnit.requestFromPC(); // Tick again, expecting that decoding will now resume - MemoryReadResult nextBlockValue = {{16, 16}, 0, 1}; - span nextBlock = {&nextBlockValue, 1}; - EXPECT_CALL(memory, getCompletedReads()).WillOnce(Return(nextBlock)); - EXPECT_CALL(isa, predecode(_, _, _, _, _)) - .WillOnce(DoAll(SetArgReferee<3>(macroOp), Return(4))); + memory::MemoryReadResult nextBlockValue = {{16, blockSize}, 0, 1}; + span nextBlock = {&nextBlockValue, 1}; + ON_CALL(memory, getCompletedReads()).WillByDefault(Return(nextBlock)); + ON_CALL(isa, predecode(_, _, _, _, _)) + .WillByDefault(DoAll(SetArgReferee<3>(mOp), Return(4))); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + EXPECT_CALL(memory, clearCompletedReads()).Times(4); + EXPECT_CALL(isa, getMaxInstructionSize()).Times(4); + EXPECT_CALL(isa, getMinInstructionSize()).Times(4); + EXPECT_CALL(isa, predecode(_, _, _, _, _)).Times(4); + + // Tick 4 times to process all 16 bytes of fetched data + for (int i = 0; i < 4; i++) { + fetchUnit.tick(); + } + // Tick a 5th time to ensure all buffered bytes have been used + EXPECT_CALL(memory, getCompletedReads()).Times(1); + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + EXPECT_CALL(isa, getMinInstructionSize()).Times(1); + EXPECT_CALL(isa, predecode(_, _, _, _, _)).Times(0); + fetchUnit.tick(); +} + +// Tests that a properly aligned PC (to the fetch block boundary) is correctly +// fetched 
+TEST_P(PipelineFetchUnitTest, fetchAligned) { + const uint8_t pc = 16; + + ON_CALL(isa, getMaxInstructionSize()).WillByDefault(Return(insnMaxSizeBytes)); + ON_CALL(isa, getMinInstructionSize()).WillByDefault(Return(insnMinSizeBytes)); + + memory::MemoryAccessTarget target = {pc, blockSize}; + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + EXPECT_CALL(memory, requestRead(target, _)).Times(1); + + // Request block from Memory + fetchUnit.updatePC(pc); + fetchUnit.requestFromPC(); + + MacroOp mOp = {uopPtr}; + memory::MemoryReadResult memReadResult = { + target, RegisterValue(0xFFFF, blockSize), 1}; + span nextBlock = {&memReadResult, 1}; + ON_CALL(memory, getCompletedReads()).WillByDefault(Return(nextBlock)); + ON_CALL(isa, predecode(_, _, _, _, _)) + .WillByDefault(DoAll(SetArgReferee<3>(mOp), Return(4))); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + EXPECT_CALL(memory, clearCompletedReads()).Times(4); + EXPECT_CALL(isa, getMaxInstructionSize()).Times(4); + EXPECT_CALL(isa, getMinInstructionSize()).Times(4); + EXPECT_CALL(isa, predecode(_, _, _, _, _)).Times(4); + + // Tick 4 times to process all 16 bytes of fetched data + for (int i = 0; i < 4; i++) { + fetchUnit.tick(); + } + // Tick a 5th time to ensure all buffered bytes have been used + EXPECT_CALL(memory, getCompletedReads()).Times(1); + EXPECT_CALL(memory, clearCompletedReads()).Times(0); + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + EXPECT_CALL(isa, getMinInstructionSize()).Times(1); + EXPECT_CALL(isa, predecode(_, _, _, _, _)).Times(0); + fetchUnit.tick(); +} + +// Tests that halting functionality triggers correctly +TEST_P(PipelineFetchUnitTest, halted) { + ON_CALL(isa, getMaxInstructionSize()).WillByDefault(Return(insnMaxSizeBytes)); + ON_CALL(isa, getMinInstructionSize()).WillByDefault(Return(insnMinSizeBytes)); + EXPECT_FALSE(fetchUnit.hasHalted()); + fetchUnit.tick(); + EXPECT_FALSE(fetchUnit.hasHalted()); + + // Test PC >= programByteLength triggers halting + fetchUnit.updatePC(1024); + EXPECT_TRUE(fetchUnit.hasHalted()); + + // Test PC being incremented to >= programByteLength triggers halting + fetchUnit.updatePC(1008); + EXPECT_FALSE(fetchUnit.hasHalted()); + + memory::MemoryAccessTarget target = {1008, blockSize}; + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + EXPECT_CALL(isa, getMinInstructionSize()).Times(0); + EXPECT_CALL(memory, requestRead(target, _)).Times(1); + fetchUnit.requestFromPC(); + + MacroOp mOp = {uopPtr}; + memory::MemoryReadResult memReadResult = { + target, RegisterValue(0xFFFF, blockSize), 1}; + span nextBlock = {&memReadResult, 1}; + ON_CALL(memory, getCompletedReads()).WillByDefault(Return(nextBlock)); + ON_CALL(isa, predecode(_, _, _, _, _)) + .WillByDefault(DoAll(SetArgReferee<3>(mOp), Return(4))); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + EXPECT_CALL(memory, clearCompletedReads()).Times(4); + EXPECT_CALL(isa, getMaxInstructionSize()).Times(4); + EXPECT_CALL(isa, getMinInstructionSize()).Times(4); + EXPECT_CALL(isa, predecode(_, _, _, _, _)).Times(4); + // Tick 4 times to process all 16 bytes of fetched data + for (int i = 0; i < 4; i++) { + fetchUnit.tick(); + } + EXPECT_TRUE(fetchUnit.hasHalted()); +} + +// Tests that fetching a branch instruction (predicted taken) mid block causes a +// branch stall + discards the remaining fetched instructions +TEST_P(PipelineFetchUnitTest, fetchTakenBranchMidBlock) { + const uint8_t pc = 16; + + ON_CALL(isa, getMaxInstructionSize()).WillByDefault(Return(insnMaxSizeBytes)); + ON_CALL(isa, 
getMinInstructionSize()).WillByDefault(Return(insnMinSizeBytes)); + + memory::MemoryAccessTarget target = {pc, blockSize}; + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + EXPECT_CALL(isa, getMinInstructionSize()).Times(0); + EXPECT_CALL(memory, requestRead(target, _)).Times(1); + + // Request block from memory + fetchUnit.updatePC(pc); + fetchUnit.requestFromPC(); + + MacroOp mOp = {uopPtr}; + memory::MemoryReadResult memReadResult = { + target, RegisterValue(0xFFFF, blockSize), 1}; + span nextBlock = {&memReadResult, 1}; + ON_CALL(memory, getCompletedReads()).WillByDefault(Return(nextBlock)); + ON_CALL(isa, predecode(_, _, _, _, _)) + .WillByDefault(DoAll(SetArgReferee<3>(mOp), Return(4))); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + + // For first tick, process instruction as non-branch + EXPECT_CALL(memory, clearCompletedReads()).Times(1); + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + EXPECT_CALL(isa, getMinInstructionSize()).Times(1); + EXPECT_CALL(isa, predecode(_, _, _, _, _)).Times(1); + EXPECT_CALL(*uop, isBranch()).WillOnce(Return(false)); + fetchUnit.tick(); + + // For second tick, process a taken branch meaning rest of block is discarded + // & a new memory block is requested + EXPECT_CALL(memory, getCompletedReads()).Times(0); + EXPECT_CALL(memory, clearCompletedReads()).Times(1); + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + EXPECT_CALL(isa, getMinInstructionSize()).Times(1); + EXPECT_CALL(isa, predecode(_, _, _, _, _)).Times(1); + EXPECT_CALL(*uop, isBranch()).WillOnce(Return(true)); + BranchType bType = BranchType::Unconditional; + uint64_t knownOff = 304; + EXPECT_CALL(*uop, getBranchType()).WillOnce(Return(bType)); + EXPECT_CALL(*uop, getKnownOffset()).WillOnce(Return(knownOff)); + BranchPrediction pred = {true, pc + knownOff}; + EXPECT_CALL(predictor, predict(20, bType, knownOff)).WillOnce(Return(pred)); + fetchUnit.tick(); + + // Ensure on next tick, predecode is not called + EXPECT_CALL(memory, getCompletedReads()).Times(1); + EXPECT_CALL(memory, clearCompletedReads()).Times(0); + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + EXPECT_CALL(isa, getMinInstructionSize()).Times(1); + EXPECT_CALL(isa, predecode(_, _, _, _, _)).Times(0); + fetchUnit.tick(); + + // Make sure on next call to `requestFromPC`, target is address 320 + // (pred.target) + target = {pred.target, blockSize}; + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + EXPECT_CALL(isa, getMinInstructionSize()).Times(0); + EXPECT_CALL(memory, requestRead(target, _)).Times(1); + fetchUnit.requestFromPC(); +} + +// Tests the functionality of the supplying from the Loop Buffer +TEST_P(PipelineFetchUnitTest, supplyFromLoopBuffer) { + // Set instructions to be fetched from memory + memory::MemoryReadResult memReadResult = { + {0x0, blockSize}, RegisterValue(0xFFFF, blockSize), 1}; + span nextBlock = {&memReadResult, 1}; + ON_CALL(memory, getCompletedReads()).WillByDefault(Return(nextBlock)); + + ON_CALL(isa, getMaxInstructionSize()).WillByDefault(Return(insnMaxSizeBytes)); + + // Register loop boundary + fetchUnit.registerLoopBoundary(0xC); + + // Set the instructions, within the loop body, to be returned from predecode + MacroOp mOp2 = {uopPtr2}; + ON_CALL(isa, predecode(_, _, 0xC, _, _)) + .WillByDefault(DoAll(SetArgReferee<3>(mOp2), Return(4))); + ON_CALL(*uop2, isBranch()).WillByDefault(Return(true)); + + MacroOp mOp = {uopPtr}; + ON_CALL(isa, predecode(_, _, Ne(0xC), _, _)) + .WillByDefault(DoAll(SetArgReferee<3>(mOp), Return(4))); + ON_CALL(*uop, 
isBranch()).WillByDefault(Return(false)); + + // Set the expectation from the predictor to be true so a loop body will + // be detected + ON_CALL(predictor, predict(_, _, _, _)) + .WillByDefault(Return(BranchPrediction({true, 0x0}))); + + // Set Loop Buffer state to be LoopBufferState::FILLING + // Tick 4 times to process all 16 bytes of fetched data + for (int i = 0; i < 4; i++) { + fetchUnit.tick(); + } + + // Fetch the next block of instructions from memory + fetchUnit.requestFromPC(); + + // Fill Loop Buffer and set its state to be LoopBufferState::SUPPLYING + // Tick 4 times to process all 16 bytes of fetched data + for (int i = 0; i < 4; i++) { + fetchUnit.tick(); + } + + // Whilst the Loop Buffer state is LoopBufferState::SUPPLYING, the request + // read should never be called + EXPECT_CALL(memory, requestRead(_, _)).Times(0); + EXPECT_CALL(isa, getMaxInstructionSize()).Times(0); + EXPECT_CALL(memory, getCompletedReads()).Times(0); + fetchUnit.requestFromPC(); + + // Empty output buffer and ensure the correct instructions are supplied from + // the Loop Buffer + output.fill({}); + fetchUnit.tick(); + EXPECT_EQ(output.getTailSlots()[0], mOp); + output.fill({}); + fetchUnit.tick(); + EXPECT_EQ(output.getTailSlots()[0], mOp); + output.fill({}); + fetchUnit.tick(); + EXPECT_EQ(output.getTailSlots()[0], mOp); + output.fill({}); + fetchUnit.tick(); + EXPECT_EQ(output.getTailSlots()[0], mOp2); + + // Flush the Loop Buffer and ensure correct instructions are fetched from + // memory + fetchUnit.flushLoopBuffer(); + fetchUnit.updatePC(0x0); + EXPECT_CALL(memory, requestRead(_, _)).Times(AtLeast(1)); + EXPECT_CALL(isa, getMaxInstructionSize()).Times(AtLeast(1)); + EXPECT_CALL(memory, getCompletedReads()).Times(AtLeast(1)); + fetchUnit.requestFromPC(); + output.fill({}); + fetchUnit.tick(); + EXPECT_EQ(output.getTailSlots()[0], mOp); + output.fill({}); + fetchUnit.tick(); + EXPECT_EQ(output.getTailSlots()[0], mOp); + output.fill({}); + fetchUnit.tick(); + EXPECT_EQ(output.getTailSlots()[0], mOp); + output.fill({}); + fetchUnit.tick(); + EXPECT_EQ(output.getTailSlots()[0], mOp2); +} + +// Tests the functionality of idling the supply to the Loop Buffer due to a not +// taken branch at the loopBoundaryAddress_ +TEST_P(PipelineFetchUnitTest, idleLoopBufferDueToNotTakenBoundary) { + // Set instructions to be fetched from memory + memory::MemoryReadResult memReadResultA = { + {0x0, blockSize}, RegisterValue(0xFFFF, blockSize), 1}; + span nextBlockA = {&memReadResultA, 1}; + memory::MemoryReadResult memReadResultB = { + {0x10, blockSize}, RegisterValue(0xFFFF, blockSize), 1}; + span nextBlockB = {&memReadResultB, 1}; + EXPECT_CALL(memory, getCompletedReads()).WillRepeatedly(Return(nextBlockA)); + + ON_CALL(isa, getMaxInstructionSize()).WillByDefault(Return(insnMaxSizeBytes)); + + // Register loop boundary + fetchUnit.registerLoopBoundary(0xC); + + // Set the instructions, within the loop body, to be returned from predecode + MacroOp mOp2 = {uopPtr2}; + ON_CALL(isa, predecode(_, _, Gt(0x8), _, _)) + .WillByDefault(DoAll(SetArgReferee<3>(mOp2), Return(4))); + ON_CALL(*uop2, isBranch()).WillByDefault(Return(true)); + + MacroOp mOp = {uopPtr}; + ON_CALL(isa, predecode(_, _, Lt(0xC), _, _)) + .WillByDefault(DoAll(SetArgReferee<3>(mOp), Return(4))); + ON_CALL(*uop, isBranch()).WillByDefault(Return(false)); + + // Set the first expectation from the predictor to be true so a loop body will + // be detected + EXPECT_CALL(predictor, predict(_, _, _)) + .WillOnce(Return(BranchPrediction({true, 0x0}))); + + // Set
Loop Buffer state to be LoopBufferState::FILLING + // Tick 4 times to process all 16 bytes of fetched data + for (int i = 0; i < 4; i++) { + fetchUnit.tick(); + } + + // Fetch the next block of instructions from memory and change the expected + // outcome of the branch predictor + fetchUnit.requestFromPC(); + EXPECT_CALL(predictor, predict(_, _, _)) + .WillRepeatedly(Return(BranchPrediction({false, 0x0}))); + + // Attempt to fill Loop Buffer but prevent it on a not taken outcome at the + // loopBoundaryAddress_ branch + // Tick 4 times to process all 16 bytes of fetched data + for (int i = 0; i < 4; i++) { + fetchUnit.tick(); + } + + // Set the expectation for the next block to be fetched after the Loop Buffer + // state has been reset + const memory::MemoryAccessTarget target = {0x10, blockSize}; + EXPECT_CALL(memory, getCompletedReads()).WillRepeatedly(Return(nextBlockB)); + EXPECT_CALL(memory, requestRead(target, _)).Times(1); + + // Fetch the next block of instructions from memory + fetchUnit.requestFromPC(); + + // Empty output buffer and ensure the correct instructions are fetched from + // memory + output.fill({}); + fetchUnit.tick(); + EXPECT_EQ(output.getTailSlots()[0], mOp2); + output.fill({}); + fetchUnit.tick(); + EXPECT_EQ(output.getTailSlots()[0], mOp2); + output.fill({}); fetchUnit.tick(); + EXPECT_EQ(output.getTailSlots()[0], mOp2); + output.fill({}); + fetchUnit.tick(); + EXPECT_EQ(output.getTailSlots()[0], mOp2); +} + +// Tests that a min sized instruction held at the end of the fetch buffer is +// allowed to be predecoded in the same cycle as being fetched +TEST_P(PipelineFetchUnitTest, minSizeInstructionAtEndOfBuffer) { + ON_CALL(isa, getMaxInstructionSize()).WillByDefault(Return(insnMaxSizeBytes)); + ON_CALL(isa, getMinInstructionSize()).WillByDefault(Return(insnMinSizeBytes)); + ON_CALL(memory, getCompletedReads()).WillByDefault(Return(completedReads)); + + // Buffer will contain valid min size instruction so predecode returns + // min bytes read + MacroOp mOp = {uopPtr}; + ON_CALL(isa, predecode(_, insnMinSizeBytes, 0x10 - insnMinSizeBytes, _)) + .WillByDefault(DoAll(SetArgReferee<3>(mOp), Return(insnMinSizeBytes))); + + // Fetch the data, only min bytes will be copied to fetch buffer. Should allow + // continuation to predecode + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + EXPECT_CALL(isa, getMinInstructionSize()).Times(1); + EXPECT_CALL(isa, predecode(_, _, _, _)).Times(1); + + uint64_t setPC = blockSize - insnMinSizeBytes; + // Fetch a single minimum sized instruction, buffered bytes = 0 + fetchUnit.updatePC(setPC); + // Tick. 
Fetch data and predecode + fetchUnit.tick(); + + // Buffer should now be empty as all bytes predecoded + EXPECT_EQ(fetchUnit.bufferedBytes_, 0); + EXPECT_EQ(fetchUnit.output_.getTailSlots()[0], mOp); + + // Expect a block starting at address 16 to be requested when we fetch again + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + EXPECT_CALL(memory, + requestRead(Field(&memory::MemoryAccessTarget::address, 16), _)) + .Times(1); + fetchUnit.requestFromPC(); + + // Tick again, expecting that decoding will now resume + MacroOp mOp2 = {uopPtr2}; + memory::MemoryReadResult nextBlockValue = {{16, blockSize}, 0, 1}; + span nextBlock = {&nextBlockValue, 1}; + ON_CALL(memory, getCompletedReads()).WillByDefault(Return(nextBlock)); + ON_CALL(isa, predecode(_, _, _, _)) + .WillByDefault(DoAll(SetArgReferee<3>(mOp2), Return(insnMaxSizeBytes))); + + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + // Completed reads called again as more data is requested + EXPECT_CALL(memory, getCompletedReads()).Times(1); + EXPECT_CALL(isa, getMinInstructionSize()).Times(1); + // Output of width 1 so only 1 call to predecode + EXPECT_CALL(isa, predecode(_, _, _, _)).Times(1); + + fetchUnit.tick(); + + // Initially 0 bytes, 16 bytes added, max bytes predecoded leaving (16 - max) + // bytes left + EXPECT_EQ(fetchUnit.bufferedBytes_, 16 - insnMaxSizeBytes); + EXPECT_EQ(fetchUnit.output_.getTailSlots()[0], mOp2); +} + +// Test that invalid min number of bytes held at the end of the buffer is not +// successfully predecoded and that more data is fetched subsequently allowing +// progression as a full instruction is now present in the buffer +TEST_P(PipelineFetchUnitTest, invalidMinBytesAtEndOfBuffer) { + // This is only relevant if min and max size are different. Otherwise, there + // won't be any progression as the fetch unit will be caught in an infinite + // loop + if (insnMinSizeBytes < insnMaxSizeBytes) { + ON_CALL(isa, getMaxInstructionSize()) + .WillByDefault(Return(insnMaxSizeBytes)); + ON_CALL(isa, getMinInstructionSize()) + .WillByDefault(Return(insnMinSizeBytes)); + ON_CALL(memory, getCompletedReads()).WillByDefault(Return(completedReads)); + + // Buffer will contain invalid min bytes so predecode returns 0 bytes read + ON_CALL(isa, predecode(_, insnMinSizeBytes, 0x10 - insnMinSizeBytes, _)) + .WillByDefault(Return(0)); + + // getMaxInstructionSize called for second time in assertion + if (strcmp(SIMENG_BUILD_TYPE, "Release") == 0) { + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + } else { + EXPECT_CALL(isa, getMaxInstructionSize()).Times(2); + } + EXPECT_CALL(memory, getCompletedReads()).Times(1); + EXPECT_CALL(isa, getMinInstructionSize()).Times(1); + EXPECT_CALL(isa, predecode(_, _, _, _)).Times(1); + + uint64_t setPC = blockSize - insnMinSizeBytes; + // Fetch a single minimum sized instruction, buffered bytes = 0 + fetchUnit.updatePC(setPC); + // Tick + fetchUnit.tick(); + + // No data consumed + EXPECT_EQ(fetchUnit.bufferedBytes_, insnMinSizeBytes); + EXPECT_EQ(fetchUnit.output_.getTailSlots()[0], MacroOp()); + + // Expect that memory is requested even though there is data in the buffer + // as bufferedBytes < maxInstructionSize + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + EXPECT_CALL(memory, + requestRead(Field(&memory::MemoryAccessTarget::address, 16), _)) + .Times(1); + fetchUnit.requestFromPC(); + + // Tick again expecting buffer to be filled and a word is predecoded + MacroOp mOp = {uopPtr}; + memory::MemoryReadResult nextBlockValue = {{16, blockSize}, 0, 1}; + span 
nextBlock = {&nextBlockValue, 1}; + ON_CALL(memory, getCompletedReads()).WillByDefault(Return(nextBlock)); + ON_CALL(isa, predecode(_, _, _, _)) + .WillByDefault(DoAll(SetArgReferee<3>(mOp), Return(insnMaxSizeBytes))); + + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + EXPECT_CALL(isa, getMinInstructionSize()).Times(1); + EXPECT_CALL(isa, predecode(_, _, _, _)).Times(1); + + fetchUnit.tick(); + + // Initially min bytes, 16 bytes added, max bytes predecoded + EXPECT_EQ(fetchUnit.bufferedBytes_, + (insnMinSizeBytes + 16) - insnMaxSizeBytes); + EXPECT_EQ(fetchUnit.output_.getTailSlots()[0], mOp); + } +} + +// When min and max instruction sizes are different, ensure progression with +// valid min sized instruction at end of buffer when next read doesn't complete. +TEST_P(PipelineFetchUnitTest, validMinSizeReadsDontComplete) { + // In the case that min and max are the same, memory is never requested as + // there is enough data in the buffer. In this case, the test isn't relevant + if (insnMinSizeBytes < insnMaxSizeBytes) { + ON_CALL(isa, getMaxInstructionSize()) + .WillByDefault(Return(insnMaxSizeBytes)); + ON_CALL(isa, getMinInstructionSize()) + .WillByDefault(Return(insnMinSizeBytes)); + ON_CALL(memory, getCompletedReads()).WillByDefault(Return(completedReads)); + + // Buffer will contain valid max and min sized instruction, predecode + // returns max bytes read on first tick + MacroOp mOp = {uopPtr}; + ON_CALL(isa, predecode(_, insnMaxSizeBytes + insnMinSizeBytes, + 0x10 - (insnMaxSizeBytes + insnMinSizeBytes), _)) + .WillByDefault(DoAll(SetArgReferee<3>(mOp), Return(insnMaxSizeBytes))); + + // Fetch the data, only last max + min bytes from block. Should allow + // continuation to predecode + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + EXPECT_CALL(isa, getMinInstructionSize()).Times(1); + EXPECT_CALL(isa, predecode(_, _, _, _)).Times(1); + + uint64_t setPC = blockSize - (insnMaxSizeBytes + insnMinSizeBytes); + // Fetch a minimum and maximum sized instruction, buffered bytes = 0 + fetchUnit.updatePC(setPC); + // Tick and predecode max bytes + fetchUnit.tick(); + + // Ensure max bytes consumed + EXPECT_EQ(fetchUnit.bufferedBytes_, insnMinSizeBytes); + EXPECT_EQ(fetchUnit.pc_, blockSize - insnMinSizeBytes); + EXPECT_EQ(fetchUnit.output_.getTailSlots()[0], mOp); + + // Expect that memory is requested even though there is data in the buffer + // as bufferedBytes < maxInstructionSize + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + EXPECT_CALL(memory, + requestRead(Field(&memory::MemoryAccessTarget::address, 16), _)) + .Times(1); + fetchUnit.requestFromPC(); + + EXPECT_EQ(fetchUnit.bufferedBytes_, insnMinSizeBytes); + EXPECT_EQ(fetchUnit.pc_, blockSize - insnMinSizeBytes); + + // Memory doesn't complete reads in next cycle but buffered bytes should be + // predecoded + MacroOp mOp2 = {uopPtr2}; + ON_CALL(memory, getCompletedReads()) + .WillByDefault(Return(span{nullptr, 0})); + ON_CALL(isa, predecode(_, insnMinSizeBytes, 0x10 - insnMinSizeBytes, _)) + .WillByDefault(DoAll(SetArgReferee<3>(mOp2), Return(insnMinSizeBytes))); + + // Path through fetch as follows: + // More data required as bufferedBytes_ < maxInsnSize so getCompletedReads + // Doesn't complete so buffer doesn't get added to + // Buffer still has some valid data so predecode should be called + + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + 
EXPECT_CALL(isa, getMinInstructionSize()).Times(2); + EXPECT_CALL(isa, predecode(_, _, _, _)).Times(1); + + // Tick + fetchUnit.tick(); + + // Ensure min bytes are consumed + EXPECT_EQ(fetchUnit.bufferedBytes_, 0); + EXPECT_EQ(fetchUnit.pc_, 16); + EXPECT_EQ(fetchUnit.output_.getTailSlots()[0], mOp2); + } } +// Test that minimum bytes held at the end of the buffer is not successfully +// predecoded and should be re-tried when reads don't complete +TEST_P(PipelineFetchUnitTest, invalidMinBytesreadsDontComplete) { + // In the case where min and max are the same, predecode will never return 0 + // so the test is only relevent in the case where they are different + if (insnMinSizeBytes < insnMaxSizeBytes) { + ON_CALL(isa, getMaxInstructionSize()) + .WillByDefault(Return(insnMaxSizeBytes)); + ON_CALL(isa, getMinInstructionSize()) + .WillByDefault(Return(insnMinSizeBytes)); + ON_CALL(memory, getCompletedReads()).WillByDefault(Return(completedReads)); + + // Buffer will contain invalid min bytes so predecode returns 0 bytes read + ON_CALL(isa, predecode(_, insnMinSizeBytes, 0x10 - insnMinSizeBytes, _)) + .WillByDefault(Return(0)); + + // getMaxInstructionSize called for second time in assertion + if (strcmp(SIMENG_BUILD_TYPE, "Release") == 0) { + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + } else { + EXPECT_CALL(isa, getMaxInstructionSize()).Times(2); + } + EXPECT_CALL(memory, getCompletedReads()).Times(1); + EXPECT_CALL(isa, getMinInstructionSize()).Times(1); + EXPECT_CALL(isa, predecode(_, _, _, _)).Times(1); + + uint64_t setPC = blockSize - insnMinSizeBytes; + // Fetch a minimum number of bytes, buffered bytes = 0 + fetchUnit.updatePC(setPC); + // Tick + fetchUnit.tick(); + + // No data consumed + EXPECT_EQ(fetchUnit.bufferedBytes_, insnMinSizeBytes); + EXPECT_EQ(fetchUnit.pc_, blockSize - insnMinSizeBytes); + EXPECT_EQ(fetchUnit.output_.getTailSlots()[0], MacroOp()); + + // Expect that memory is requested even though there is data in the buffer + // as bufferedBytes < maxInstructionSize + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + EXPECT_CALL(memory, + requestRead(Field(&memory::MemoryAccessTarget::address, 16), _)) + .Times(1); + fetchUnit.requestFromPC(); + + EXPECT_EQ(fetchUnit.bufferedBytes_, insnMinSizeBytes); + EXPECT_EQ(fetchUnit.pc_, blockSize - insnMinSizeBytes); + + // Memory doesn't complete reads in next cycle but buffered bytes should + // attempt to be predecoded + ON_CALL(memory, getCompletedReads()) + .WillByDefault(Return(span{nullptr, 0})); + // Predecode still returns no bytes read + ON_CALL(isa, predecode(_, insnMinSizeBytes, 0x10 - insnMinSizeBytes, _)) + .WillByDefault(Return(0)); + + // getMaxInsnSize called again in assertion + if (strcmp(SIMENG_BUILD_TYPE, "Release") == 0) { + EXPECT_CALL(isa, getMaxInstructionSize()).Times(1); + } else { + EXPECT_CALL(isa, getMaxInstructionSize()).Times(2); + } + EXPECT_CALL(memory, getCompletedReads()).Times(1); + EXPECT_CALL(isa, getMinInstructionSize()).Times(2); + EXPECT_CALL(isa, predecode(_, _, _, _)).Times(1); + + // Tick + fetchUnit.tick(); + + // Ensure min bytes are not consumed + EXPECT_EQ(fetchUnit.bufferedBytes_, insnMinSizeBytes); + EXPECT_EQ(fetchUnit.pc_, blockSize - insnMinSizeBytes); + EXPECT_EQ(fetchUnit.output_.getTailSlots()[0], MacroOp()); + } +} + +// Test that the Fetch unit is correctly tallying the number of branch +// instructions fetched, and that the getBranchFetchedCount getter function +// returns the correct value +TEST_P(PipelineFetchUnitTest, branchesFetchedCountedIncorrectly) { 
+ // Set instructions to be fetched from memory + memory::MemoryReadResult memReadResultA = { + {0x0, blockSize}, RegisterValue(0xFFFF, blockSize), 1}; + span nextBlockA = {&memReadResultA, 1}; + memory::MemoryReadResult memReadResultB = { + {0x10, blockSize}, RegisterValue(0xFFFF, blockSize), 1}; + span nextBlockB = {&memReadResultB, 1}; + EXPECT_CALL(memory, getCompletedReads()).WillRepeatedly(Return(nextBlockA)); + + ON_CALL(isa, getMaxInstructionSize()).WillByDefault(Return(insnMaxSizeBytes)); + + // Set the instructions to be returned from predecode + MacroOp mOp2 = {uopPtr2}; + ON_CALL(isa, predecode(_, _, Gt(0x8), _)) + .WillByDefault(DoAll(SetArgReferee<3>(mOp2), Return(4))); + ON_CALL(*uop2, isBranch()).WillByDefault(Return(true)); + MacroOp mOp = {uopPtr}; + ON_CALL(isa, predecode(_, _, Lt(0xC), _)) + .WillByDefault(DoAll(SetArgReferee<3>(mOp), Return(4))); + ON_CALL(*uop, isBranch()).WillByDefault(Return(false)); + EXPECT_CALL(predictor, predict(_, _, _)) + .WillOnce(Return(BranchPrediction({true, 0x0}))); + + // Fetch instructions from data block -- one branch instruction + for (int i = 0; i < 4; i++) { + fetchUnit.tick(); + } + + // Confirm that the correct number of fetched branches has been recorded by + // the Fetch Unit + EXPECT_EQ(fetchUnit.getBranchFetchedCount(), 1); + + // Fetch the next block of instructions from memory and change the expected + // outcome of the branch predictor + fetchUnit.requestFromPC(); + EXPECT_CALL(predictor, predict(_, _, _)) + .WillRepeatedly(Return(BranchPrediction({false, 0x0}))); + + // Fetch instructions from data block -- one branch instruction + for (int i = 0; i < 4; i++) { + fetchUnit.tick(); + } + + // Confirm that the correct number of fetched branches has been recorded by + // the Fetch Unit + EXPECT_EQ(fetchUnit.getBranchFetchedCount(), 2); + + const memory::MemoryAccessTarget target = {0x10, blockSize}; + EXPECT_CALL(memory, getCompletedReads()).WillRepeatedly(Return(nextBlockB)); + EXPECT_CALL(memory, requestRead(target, _)).Times(1); + + // Fetch instructions from data block -- four branch instructions + fetchUnit.requestFromPC(); + for (int i = 0; i < 4; i++) { + fetchUnit.tick(); + } + + // Confirm that the correct number of fetched branches has been recorded by + // the Fetch Unit + EXPECT_EQ(fetchUnit.getBranchFetchedCount(), 6); +} + +INSTANTIATE_TEST_SUITE_P(PipelineFetchUnitTests, PipelineFetchUnitTest, + ::testing::Values(std::pair(2, 4), std::pair(4, 4))); + } // namespace pipeline } // namespace simeng diff --git a/test/unit/pipeline/LoadStoreQueueTest.cc b/test/unit/pipeline/LoadStoreQueueTest.cc index a6519d6632..2dca391ceb 100644 --- a/test/unit/pipeline/LoadStoreQueueTest.cc +++ b/test/unit/pipeline/LoadStoreQueueTest.cc @@ -17,6 +17,10 @@ const uint8_t MAX_LOADS = 32; const uint8_t MAX_STORES = 32; const uint8_t MAX_COMBINED = 64; +// TODO: When the associated requestWrite(...) gets moved into the LSQ's tick() +// functionality, we need to check the state of requestStoreQueue_ and calling +// of requestWrite(...) 
in a vareity of tests + class MockForwardOperandsHandler { public: MOCK_METHOD2(forwardOperands, @@ -36,10 +40,12 @@ class LoadStoreQueueTest : public ::testing::TestWithParam { loadUop2(new MockInstruction), storeUop(new MockInstruction), storeUop2(new MockInstruction), + loadStoreUop(new MockInstruction), loadUopPtr(loadUop), loadUopPtr2(loadUop2), storeUopPtr(storeUop), - storeUopPtr2(storeUop2) { + storeUopPtr2(storeUop2), + loadStoreUopPtr(loadStoreUop) { // Set up sensible return values for the load uop ON_CALL(*loadUop, isLoad()).WillByDefault(Return(true)); ON_CALL(*loadUop, getGeneratedAddresses()) @@ -54,23 +60,32 @@ class LoadStoreQueueTest : public ::testing::TestWithParam { } protected: - LoadStoreQueue getQueue() { + LoadStoreQueue getQueue(bool exclusive = false, + uint16_t loadBandwidth = UINT16_MAX, + uint16_t storeBandwidth = UINT16_MAX, + uint16_t permittedRequests = UINT16_MAX, + uint16_t permittedLoads = UINT16_MAX, + uint16_t permittedStores = UINT16_MAX) { if (GetParam()) { // Combined queue - return LoadStoreQueue(MAX_COMBINED, dataMemory, - {completionSlots.data(), completionSlots.size()}, - [this](auto registers, auto values) { - forwardOperandsHandler.forwardOperands(registers, - values); - }); + return LoadStoreQueue( + MAX_COMBINED, dataMemory, + {completionSlots.data(), completionSlots.size()}, + [this](auto registers, auto values) { + forwardOperandsHandler.forwardOperands(registers, values); + }, + [](auto uop) {}, exclusive, loadBandwidth, storeBandwidth, + permittedRequests, permittedLoads, permittedStores); } else { // Split queue - return LoadStoreQueue(MAX_LOADS, MAX_STORES, dataMemory, - {completionSlots.data(), completionSlots.size()}, - [this](auto registers, auto values) { - forwardOperandsHandler.forwardOperands(registers, - values); - }); + return LoadStoreQueue( + MAX_LOADS, MAX_STORES, dataMemory, + {completionSlots.data(), completionSlots.size()}, + [this](auto registers, auto values) { + forwardOperandsHandler.forwardOperands(registers, values); + }, + [](auto uop) {}, exclusive, loadBandwidth, storeBandwidth, + permittedRequests, permittedLoads, permittedStores); } } @@ -108,8 +123,8 @@ class LoadStoreQueueTest : public ::testing::TestWithParam { std::vector>> completionSlots; - std::vector addresses; - span addressesSpan; + std::vector addresses; + span addressesSpan; std::vector data; span dataSpan; @@ -120,11 +135,13 @@ class LoadStoreQueueTest : public ::testing::TestWithParam { MockInstruction* loadUop2; MockInstruction* storeUop; MockInstruction* storeUop2; + MockInstruction* loadStoreUop; std::shared_ptr loadUopPtr; std::shared_ptr loadUopPtr2; std::shared_ptr storeUopPtr; std::shared_ptr storeUopPtr2; + std::shared_ptr loadStoreUopPtr; MockForwardOperandsHandler forwardOperandsHandler; @@ -133,9 +150,9 @@ class LoadStoreQueueTest : public ::testing::TestWithParam { // Test that a split queue can be constructed correctly TEST_F(LoadStoreQueueTest, SplitQueue) { - LoadStoreQueue queue = - LoadStoreQueue(MAX_LOADS, MAX_STORES, dataMemory, {nullptr, 0}, - [](auto registers, auto values) {}); + LoadStoreQueue queue = LoadStoreQueue( + MAX_LOADS, MAX_STORES, dataMemory, {nullptr, 0}, + [](auto registers, auto values) {}, [](auto uop) {}); EXPECT_EQ(queue.isCombined(), false); EXPECT_EQ(queue.getLoadQueueSpace(), MAX_LOADS); @@ -145,8 +162,9 @@ TEST_F(LoadStoreQueueTest, SplitQueue) { // Test that a combined queue can be constructed correctly TEST_F(LoadStoreQueueTest, CombinedQueue) { - LoadStoreQueue queue = LoadStoreQueue(MAX_COMBINED, 
dataMemory, {nullptr, 0}, - [](auto registers, auto values) {}); + LoadStoreQueue queue = LoadStoreQueue( + MAX_COMBINED, dataMemory, {nullptr, 0}, + [](auto registers, auto values) {}, [](auto uop) {}); EXPECT_EQ(queue.isCombined(), true); EXPECT_EQ(queue.getLoadQueueSpace(), MAX_COMBINED); @@ -200,11 +218,49 @@ TEST_P(LoadStoreQueueTest, AddStore) { TEST_P(LoadStoreQueueTest, PurgeFlushedLoad) { auto queue = getQueue(); auto initialLoadSpace = queue.getLoadQueueSpace(); + memory::MemoryReadResult completedRead = {addresses[0], data[0], 1}; + span completedReads = {&completedRead, 1}; + + // Set load instruction attributes + loadUop->setSequenceId(0); + loadUop->setInstructionId(0); + loadUop2->setSequenceId(1); + loadUop2->setInstructionId(1); + + EXPECT_CALL(*loadUop, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(addressesSpan)); + EXPECT_CALL(*loadUop2, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(addressesSpan)); + + // Add loads to LSQ queue.addLoad(loadUopPtr); + queue.addLoad(loadUopPtr2); + // Start the first load so that its accesses can be added to + // requestLoadQueue_/requestedLoads_ and expect a memory access to be + // performed + queue.startLoad(loadUopPtr); + EXPECT_CALL(dataMemory, requestRead(addresses[0], 0)).Times(1); + queue.tick(); + + // Start the second load so that its accesses can be added to + // requestLoadQueue_/requestedLoads_ but flush it before it can perform a + // memory access + queue.startLoad(loadUopPtr2); loadUop->setFlushed(); + loadUop2->setFlushed(); queue.purgeFlushed(); + // Expect no activity regarding memory accesses or the passing of the load + // instruction to the output buffer + EXPECT_CALL(dataMemory, requestRead(_, _)).Times(0); + EXPECT_CALL(dataMemory, getCompletedReads()) + .WillRepeatedly(Return(completedReads)); + queue.tick(); + + EXPECT_EQ(completionSlots[0].getTailSlots()[0], nullptr); EXPECT_EQ(queue.getLoadQueueSpace(), initialLoadSpace); } @@ -225,14 +281,24 @@ TEST_P(LoadStoreQueueTest, Load) { loadUop->setSequenceId(1); auto queue = getQueue(); - MemoryReadResult completedRead = {addresses[0], data[0], 1}; - span completedReads = {&completedRead, 1}; - - EXPECT_CALL(*loadUop, getGeneratedAddresses()).Times(AtLeast(1)); + memory::MemoryReadResult completedRead = {addresses[0], data[0], 1}; + span completedReads = {&completedRead, 1}; - loadUop->setDataPending(addresses.size()); + // Set load instruction attributes + EXPECT_CALL(*loadUop, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(addressesSpan)); + loadUop->setLSQLatency(3); + // Begin load in LSQ queue.addLoad(loadUopPtr); + queue.startLoad(loadUopPtr); + + // Given 3 cycle latency, no requests should occur in the first two ticks of + // the LSQ + EXPECT_CALL(dataMemory, requestRead(_, _)).Times(0); + queue.tick(); + queue.tick(); // Check that a read request is made to the memory interface EXPECT_CALL(dataMemory, requestRead(addresses[0], _)).Times(1); @@ -242,15 +308,40 @@ TEST_P(LoadStoreQueueTest, Load) { .WillRepeatedly(Return(completedReads)); // Check that the LSQ supplies the right data to the instruction - // TODO: Replace with check for call over memory interface in future? 
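For orientation, here is a minimal usage sketch (illustrative only, not part of this patch) of the extended getQueue() helper introduced above; the argument order follows its parameter list, and leaving a value at UINT16_MAX effectively disables that particular limit, which is how the bandwidth and request-limit tests later in this file configure their queues.
```cpp
// getQueue(exclusive, loadBandwidth, storeBandwidth,
//          permittedRequests, permittedLoads, permittedStores)

auto unrestricted = getQueue();             // all limits left at UINT16_MAX
auto bandwidthLtd = getQueue(false, 3, 3);  // 3 bytes of load and store bandwidth per cycle
auto requestLtd = getQueue(true, UINT16_MAX, UINT16_MAX, 3, 2, 1);
// ^ exclusive queue: at most 3 requests per cycle, of which 2 loads and 1 store
```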
EXPECT_CALL(*loadUop, - supplyData(0, Property(&RegisterValue::get, data[0]))) + supplyData(addresses[0].address, + Property(&RegisterValue::get, data[0]))) .Times(1); + // Tick the queue to complete the load + queue.tick(); + + EXPECT_EQ(completionSlots[0].getTailSlots()[0].get(), loadUop); +} + +// Tests that a queue can perform a load with no addresses +TEST_P(LoadStoreQueueTest, LoadWithNoAddresses) { + loadUop->setSequenceId(1); + auto queue = getQueue(); + + span emptyAddressesSpan = {}; + + EXPECT_CALL(*loadUop, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(emptyAddressesSpan)); + + // Check that a read request isn't made to the memory interface but the load + // completes in the LSQ + EXPECT_CALL(dataMemory, requestRead(_, _)).Times(0); + EXPECT_CALL(*loadUop, execute()).Times(1); + + queue.addLoad(loadUopPtr); queue.startLoad(loadUopPtr); // Tick the queue to complete the load queue.tick(); + + EXPECT_EQ(completionSlots[0].getTailSlots()[0].get(), loadUop); } // Tests that a queue can commit a load @@ -272,14 +363,18 @@ TEST_P(LoadStoreQueueTest, Store) { auto queue = getQueue(); auto initialStoreSpace = queue.getStoreQueueSpace(); - EXPECT_CALL(*storeUop, getGeneratedAddresses()).Times(AtLeast(1)); - EXPECT_CALL(*storeUop, getData()).Times(AtLeast(1)); - + // Set store instruction attributes storeUop->setSequenceId(1); storeUop->setInstructionId(1); + EXPECT_CALL(*storeUop, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(addressesSpan)); + EXPECT_CALL(*storeUop, getData()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(dataSpan)); + queue.addStore(storeUopPtr); - storeUopPtr->setCommitReady(); queue.supplyStoreData(storeUopPtr); // Check that a write request is sent to the memory interface @@ -296,6 +391,298 @@ TEST_P(LoadStoreQueueTest, Store) { EXPECT_EQ(queue.getStoreQueueSpace(), initialStoreSpace); } +// Tests that a queue can perform a load-store operation +TEST_P(LoadStoreQueueTest, LoadStore) { + auto queue = getQueue(); + auto initialLoadSpace = queue.getLoadQueueSpace(); + auto initialStoreSpace = queue.getStoreQueueSpace(); + + memory::MemoryReadResult completedRead = {addresses[0], data[0], 1}; + span completedReads = {&completedRead, 1}; + + // Set load-store instruction attributes + loadStoreUop->setSequenceId(1); + loadStoreUop->setInstructionId(1); + + EXPECT_CALL(*loadStoreUop, isLoad()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(true)); + EXPECT_CALL(*loadStoreUop, isStoreData()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(true)); + + EXPECT_CALL(*loadStoreUop, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(addressesSpan)); + EXPECT_CALL(*loadStoreUop, getData()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(dataSpan)); + + // Register load-store operation and start load portion + queue.addLoad(loadStoreUopPtr); + queue.addStore(loadStoreUopPtr); + queue.startLoad(loadStoreUopPtr); + + // Check that a read request is made to the memory interface + EXPECT_CALL(dataMemory, requestRead(addresses[0], _)).Times(1); + + // Expect a check against finished reads and return the result + EXPECT_CALL(dataMemory, getCompletedReads()) + .WillRepeatedly(Return(completedReads)); + + // Check that the LSQ supplies the right data to the instruction + EXPECT_CALL(*loadStoreUop, + supplyData(addresses[0].address, + Property(&RegisterValue::get, data[0]))) + .Times(1); + + // Tick the queue to complete the load portion of the load-store + queue.tick(); + EXPECT_EQ(completionSlots[0].getTailSlots()[0].get(), 
loadStoreUop); + + // Check that a write request is sent to the memory interface + EXPECT_CALL(dataMemory, + requestWrite(addresses[0], + Property(&RegisterValue::get, data[0]))) + .Times(1); + + // Commit both portions of the load-store + queue.commitLoad(loadStoreUopPtr); + queue.commitStore(loadStoreUopPtr); + + // Check the load-store was removed + EXPECT_EQ(queue.getLoadQueueSpace(), initialLoadSpace); + EXPECT_EQ(queue.getStoreQueueSpace(), initialStoreSpace); +} + +// Tests that bandwidth restrictions are adhered to in a non-exclusive LSQ +TEST_P(LoadStoreQueueTest, NonExclusiveBandwidthRestriction) { + auto queue = getQueue(false, 3, 3); + + // Set instruction attributes + loadUop->setSequenceId(0); + loadUop->setInstructionId(0); + storeUop->setSequenceId(1); + storeUop->setInstructionId(1); + loadUop2->setSequenceId(2); + loadUop2->setInstructionId(2); + + std::vector multipleAddresses = {{1, 2}, {2, 2}}; + span multipleAddressesSpan = { + multipleAddresses.data(), multipleAddresses.size()}; + std::vector storeData = {static_cast(0x01), + static_cast(0x10)}; + span storeDataSpan = {storeData.data(), + storeData.size()}; + + EXPECT_CALL(*loadUop, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(multipleAddressesSpan)); + EXPECT_CALL(*storeUop, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(multipleAddressesSpan)); + EXPECT_CALL(*loadUop2, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(multipleAddressesSpan)); + EXPECT_CALL(*storeUop, getData()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(storeDataSpan)); + + // Add instructions to LSQ and register their accesses to be processed in the + // tick() function + queue.addLoad(loadUopPtr); + queue.addLoad(loadUopPtr2); + queue.startLoad(loadUopPtr); + queue.startLoad(loadUopPtr2); + queue.addStore(storeUopPtr); + queue.supplyStoreData(storeUopPtr); + queue.commitStore(storeUopPtr); + + // Set expectations for tick logic based on set restrictions. Only 2 bytes of + // read and 2 bytes of write accesses should be processed per cycle (in this + // case that translates to one of the two addresses each uop has to handle).
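To make the requestRead cardinalities that follow easier to read, here is a rough sketch of the per-cycle byte accounting described in the comment above (hypothetical names, illustrative only; this is not the LoadStoreQueue's actual implementation).
```cpp
// With loadBandwidth = 3 and every generated address covering 2 bytes,
// a second 2-byte read would exceed the budget, so each uop has only one
// of its two reads serviced per tick.
uint16_t remainingBytes = 3;  // per-cycle load bandwidth used in this test
for (const auto& target : pendingLoadTargets) {  // hypothetical request queue
  if (target.size > remainingBytes) break;       // defer the rest to the next tick
  remainingBytes -= target.size;
  dataMemory.requestRead(target, requestId);     // requestId: hypothetical identifier
}
```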
+ EXPECT_CALL(dataMemory, requestRead(_, 0)).Times(1); + queue.tick(); + EXPECT_CALL(dataMemory, requestRead(_, 0)).Times(1); + queue.tick(); + EXPECT_CALL(dataMemory, requestRead(_, 2)).Times(1); + queue.tick(); + EXPECT_CALL(dataMemory, requestRead(_, 2)).Times(1); + queue.tick(); +} + +// Tests that bandwidth restrictions are adhered to in an exclusive LSQ +TEST_P(LoadStoreQueueTest, ExclusiveBandwidthRestriction) { + auto queue = getQueue(true, 3, 3); + + // Set instruction attributes + loadUop->setSequenceId(0); + loadUop->setInstructionId(0); + storeUop->setSequenceId(1); + storeUop->setInstructionId(1); + loadUop2->setSequenceId(2); + loadUop2->setInstructionId(2); + + std::vector multipleAddresses = {{1, 2}, {2, 2}}; + span multipleAddressesSpan = { + multipleAddresses.data(), multipleAddresses.size()}; + std::vector storeData = {static_cast(0x01), + static_cast(0x10)}; + span storeDataSpan = {storeData.data(), + storeData.size()}; + + EXPECT_CALL(*loadUop, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(multipleAddressesSpan)); + EXPECT_CALL(*storeUop, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(multipleAddressesSpan)); + EXPECT_CALL(*loadUop2, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(multipleAddressesSpan)); + EXPECT_CALL(*storeUop, getData()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(storeDataSpan)); + + // Add instructions to LSQ and register their accesses to be processed in the + // tick() function + queue.addLoad(loadUopPtr); + queue.addLoad(loadUopPtr2); + queue.startLoad(loadUopPtr); + queue.startLoad(loadUopPtr2); + queue.addStore(storeUopPtr); + queue.supplyStoreData(storeUopPtr); + queue.commitStore(storeUopPtr); + + // Set expectations for tick logic based on set restrictions. Only 2 bytes of + // read and 2 bytes of write accesses should be processed per cycle (in this + // case that translates to one of the two addresses each uop has to handle). 
+ // However, there cannot be an overlap between load and store bandwidth usage + // per cycle due to the LSQ being exclusive + EXPECT_CALL(dataMemory, requestRead(_, _)).Times(0); + queue.tick(); + EXPECT_CALL(dataMemory, requestRead(_, _)).Times(0); + queue.tick(); + EXPECT_CALL(dataMemory, requestRead(_, 0)).Times(1); + queue.tick(); + EXPECT_CALL(dataMemory, requestRead(_, 0)).Times(1); + queue.tick(); + EXPECT_CALL(dataMemory, requestRead(_, 2)).Times(1); + queue.tick(); + EXPECT_CALL(dataMemory, requestRead(_, 2)).Times(1); + queue.tick(); +} + +// Tests that request restrictions are adhered to in a non-exclusive LSQ +TEST_P(LoadStoreQueueTest, NonExclusiveRequestsRestriction) { + auto queue = getQueue(false, UINT16_MAX, UINT16_MAX, 2, 2, 1); + + // Set instruction attributes + loadUop->setSequenceId(0); + loadUop->setInstructionId(0); + storeUop->setSequenceId(1); + storeUop->setInstructionId(1); + loadUop2->setSequenceId(2); + loadUop2->setInstructionId(2); + + std::vector multipleAddresses = {{1, 2}, {2, 2}}; + span multipleAddressesSpan = { + multipleAddresses.data(), multipleAddresses.size()}; + std::vector storeData = {static_cast(0x01), + static_cast(0x10)}; + span storeDataSpan = {storeData.data(), + storeData.size()}; + + EXPECT_CALL(*loadUop, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(multipleAddressesSpan)); + EXPECT_CALL(*storeUop, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(multipleAddressesSpan)); + EXPECT_CALL(*loadUop2, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(multipleAddressesSpan)); + EXPECT_CALL(*storeUop, getData()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(storeDataSpan)); + + // Add instructions to LSQ and register their accesses to be processed in the + // tick() function + queue.addLoad(loadUopPtr); + queue.addLoad(loadUopPtr2); + queue.startLoad(loadUopPtr); + queue.startLoad(loadUopPtr2); + queue.addStore(storeUopPtr); + queue.supplyStoreData(storeUopPtr); + queue.commitStore(storeUopPtr); + + // Set expectations for tick logic based on set restrictions. 
Either 2 reads + // or 1 read and 1 write should be processed per cycle + EXPECT_CALL(dataMemory, requestRead(_, 0)).Times(1); + queue.tick(); + EXPECT_CALL(dataMemory, requestRead(_, 0)).Times(1); + queue.tick(); + EXPECT_CALL(dataMemory, requestRead(_, 2)).Times(2); + queue.tick(); +} + +// Tests that request restrictions are adhered to in an exclusive LSQ +TEST_P(LoadStoreQueueTest, ExclusiveRequestsRestriction) { + auto queue = getQueue(true, UINT16_MAX, UINT16_MAX, 3, 2, 1); + + // Set instruction attributes + loadUop->setSequenceId(0); + loadUop->setInstructionId(0); + storeUop->setSequenceId(1); + storeUop->setInstructionId(1); + loadUop2->setSequenceId(2); + loadUop2->setInstructionId(2); + + std::vector multipleAddresses = {{1, 2}, {2, 2}}; + span multipleAddressesSpan = { + multipleAddresses.data(), multipleAddresses.size()}; + std::vector storeData = {static_cast(0x01), + static_cast(0x10)}; + span storeDataSpan = {storeData.data(), + storeData.size()}; + + EXPECT_CALL(*loadUop, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(multipleAddressesSpan)); + EXPECT_CALL(*storeUop, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(multipleAddressesSpan)); + EXPECT_CALL(*loadUop2, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(multipleAddressesSpan)); + EXPECT_CALL(*storeUop, getData()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(storeDataSpan)); + + // Add instructions to LSQ and register their accesses to be processed in the + // tick() function + queue.addLoad(loadUopPtr); + queue.addLoad(loadUopPtr2); + queue.startLoad(loadUopPtr); + queue.startLoad(loadUopPtr2); + queue.addStore(storeUopPtr); + queue.supplyStoreData(storeUopPtr); + queue.commitStore(storeUopPtr); + + // Set expectations for tick logic based on set restrictions. Only 2 reads and + // 1 write should be processed per cycle. However, there cannot be an overlap + // between load and store requests being processed in a single cycle due to + // the LSQ being exclusive. 
+ EXPECT_CALL(dataMemory, requestRead(_, _)).Times(0); + queue.tick(); + EXPECT_CALL(dataMemory, requestRead(_, _)).Times(0); + queue.tick(); + EXPECT_CALL(dataMemory, requestRead(_, 0)).Times(2); + queue.tick(); + EXPECT_CALL(dataMemory, requestRead(_, 2)).Times(2); + queue.tick(); +} + // Tests that committing a store will correctly detect a direct memory order // violation TEST_P(LoadStoreQueueTest, Violation) { @@ -335,19 +722,19 @@ TEST_P(LoadStoreQueueTest, ViolationOverlap) { auto queue = getQueue(); // The store will write the byte `0x01` at addresses 0 and 1 - std::vector storeAddresses = {{0, 2}}; + std::vector storeAddresses = {{0, 2}}; std::vector storeData = {static_cast(0x0101)}; - span storeAddressesSpan = {storeAddresses.data(), - storeAddresses.size()}; + span storeAddressesSpan = { + storeAddresses.data(), storeAddresses.size()}; span storeDataSpan = {storeData.data(), storeData.size()}; // The load will read two bytes, at addresses 1 and 2; this will overlap with // the written data at address 1 - std::vector loadAddresses = {{1, 2}}; - span loadAddressesSpan = {loadAddresses.data(), - loadAddresses.size()}; + std::vector loadAddresses = {{1, 2}}; + span loadAddressesSpan = { + loadAddresses.data(), loadAddresses.size()}; EXPECT_CALL(*storeUop, getGeneratedAddresses()) .Times(AtLeast(1)) @@ -372,9 +759,9 @@ TEST_P(LoadStoreQueueTest, NoViolation) { auto queue = getQueue(); // A different address to the one being stored to - std::vector loadAddresses = {{1, 1}}; - span loadAddressesSpan = {loadAddresses.data(), - loadAddresses.size()}; + std::vector loadAddresses = {{1, 1}}; + span loadAddressesSpan = { + loadAddresses.data(), loadAddresses.size()}; EXPECT_CALL(*loadUop, getGeneratedAddresses()) .Times(AtLeast(1)) @@ -399,9 +786,9 @@ TEST_P(LoadStoreQueueTest, FlushDuringConfliction) { loadUop2->setFlushed(); // Set store addresses and data - std::vector storeAddresses = {{1, 1}, {2, 1}}; - span storeAddressesSpan = {storeAddresses.data(), - storeAddresses.size()}; + std::vector storeAddresses = {{1, 1}, {2, 1}}; + span storeAddressesSpan = { + storeAddresses.data(), storeAddresses.size()}; std::vector storeData = {static_cast(0x01), static_cast(0x10)}; span storeDataSpan = {storeData.data(), @@ -414,17 +801,17 @@ TEST_P(LoadStoreQueueTest, FlushDuringConfliction) { .WillRepeatedly(Return(storeDataSpan)); // Set load address which overlaps on first store address - std::vector loadAddresses = {{1, 1}}; - span loadAddressesSpan = {loadAddresses.data(), - loadAddresses.size()}; + std::vector loadAddresses = {{1, 1}}; + span loadAddressesSpan = { + loadAddresses.data(), loadAddresses.size()}; EXPECT_CALL(*loadUop, getGeneratedAddresses()) .Times(AtLeast(1)) .WillRepeatedly(Return(loadAddressesSpan)); // Set load address which overlaps on second store address - std::vector loadAddresses2 = {{2, 1}}; - span loadAddressesSpan2 = {loadAddresses2.data(), - loadAddresses2.size()}; + std::vector loadAddresses2 = {{2, 1}}; + span loadAddressesSpan2 = { + loadAddresses2.data(), loadAddresses2.size()}; EXPECT_CALL(*loadUop2, getGeneratedAddresses()) .Times(AtLeast(1)) .WillRepeatedly(Return(loadAddressesSpan2)); @@ -453,6 +840,67 @@ TEST_P(LoadStoreQueueTest, FlushDuringConfliction) { queue.tick(); } +// Test that a load access exactly conflicting on a store access (matching +// address and access size no larger) gets its data supplied when the store +// commits +TEST_P(LoadStoreQueueTest, SupplyDataToConfliction) { + auto queue = getQueue(); + + // Set instruction attributes + 
storeUop->setSequenceId(0); + storeUop->setInstructionId(0); + loadUop->setSequenceId(1); + loadUop->setInstructionId(1); + + std::vector storeAddresses = {{1, 1}, {2, 1}}; + span storeAddressesSpan = { + storeAddresses.data(), storeAddresses.size()}; + std::vector storeData = {static_cast(0x01), + static_cast(0x10)}; + span storeDataSpan = {storeData.data(), + storeData.size()}; + EXPECT_CALL(*storeUop, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(storeAddressesSpan)); + EXPECT_CALL(*storeUop, getData()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(storeDataSpan)); + + // Set load addresses which exactly and partially overlaps on first and second + // store addresses respectively + std::vector loadAddresses = { + {1, 1}, {2, 2}, {3, 1}}; + span loadAddressesSpan = { + loadAddresses.data(), loadAddresses.size()}; + EXPECT_CALL(*loadUop, getGeneratedAddresses()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(loadAddressesSpan)); + + // Add instructions to LSQ + queue.addStore(storeUopPtr); + queue.addLoad(loadUopPtr); + + // Supply store data so the store can commit + queue.supplyStoreData(storeUopPtr); + + // Start the load so the confliction can be registered + queue.startLoad(loadUopPtr); + + // Two of the accesses don't exactly conflict so they should generate memory + // accesses + EXPECT_CALL(dataMemory, requestRead(loadAddresses[1], 1)).Times(1); + EXPECT_CALL(dataMemory, requestRead(loadAddresses[2], 1)).Times(1); + queue.tick(); + + // The one access which does exactly conflict with a store access should get + // its data supplied on the store's commitment + EXPECT_CALL(*loadUop, + supplyData(loadAddresses[0].address, + Property(&RegisterValue::get, storeData[0]))) + .Times(1); + queue.commitStore(storeUopPtr); +} + INSTANTIATE_TEST_SUITE_P(LoadStoreQueueTests, LoadStoreQueueTest, ::testing::Values(false, true)); diff --git a/test/unit/pipeline/M1PortAllocatorTest.cc b/test/unit/pipeline/M1PortAllocatorTest.cc new file mode 100644 index 0000000000..69786bfed9 --- /dev/null +++ b/test/unit/pipeline/M1PortAllocatorTest.cc @@ -0,0 +1,157 @@ +#include "gtest/gtest.h" +#include "simeng/pipeline/M1PortAllocator.hh" + +namespace simeng { +namespace pipeline { + +class M1PortAllocatorTest : public testing::Test { + public: + M1PortAllocatorTest() : portAllocator(portArrangement, rsArrangement) { + portAllocator.setRSSizeGetter( + [this](std::vector& sizeVec) { rsSizes(sizeVec); }); + } + + void rsSizes(std::vector& sizeVec) const { + sizeVec = rsFreeEntries; + } + + protected: + // Representation of the M1 Firestorm reservation station layout + std::vector rsFreeEntries = {24, 26, 16, 12, 28, 28, 12, + 12, 12, 12, 36, 36, 36, 36}; + // Representation of the M1 Firestorm port layout + const std::vector> portArrangement = { + {0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11}, {12}, {13}}; + // Representation of the M1 Firestorm Reservation Station Arrangement + // std::pair = + std::vector> rsArrangement = { + {0, 24}, {1, 26}, {2, 16}, {3, 12}, {4, 28}, {5, 28}, {6, 12}, + {7, 12}, {8, 12}, {9, 12}, {10, 36}, {11, 36}, {12, 36}, {13, 36}}; + + M1PortAllocator portAllocator; +}; + +// Tests correct allocation for single port groups (i.e. 
INT_DIV_OR_SQRT) +TEST_F(M1PortAllocatorTest, singlePortAllocation) { + std::vector ports = {4}; + EXPECT_EQ(portAllocator.allocate(ports), 4); +} + +// Tests correct allocation of multiple INT_SIMPLE instructions +TEST_F(M1PortAllocatorTest, allocationIntSimple) { + std::vector ports = {0, 1, 2, 3, 4, 5}; + EXPECT_EQ(portAllocator.allocate(ports), 0); + rsFreeEntries[0]--; + EXPECT_EQ(portAllocator.allocate(ports), 1); + rsFreeEntries[1]--; + EXPECT_EQ(portAllocator.allocate(ports), 2); + rsFreeEntries[2]--; + EXPECT_EQ(portAllocator.allocate(ports), 3); + rsFreeEntries[3]--; + EXPECT_EQ(portAllocator.allocate(ports), 4); + rsFreeEntries[4]--; + EXPECT_EQ(portAllocator.allocate(ports), 5); + rsFreeEntries[5]--; + EXPECT_EQ(portAllocator.allocate(ports), 0); + rsFreeEntries[0]--; + + // Ensure `issued()` logic works as expected + portAllocator.issued(3); + rsFreeEntries[3]++; + EXPECT_EQ(portAllocator.allocate(ports), 3); + rsFreeEntries[3]--; +} + +// Tests correct allocation of multiple BRANCH instructions +TEST_F(M1PortAllocatorTest, allocationBranch) { + std::vector ports = {0, 1}; + EXPECT_EQ(portAllocator.allocate(ports), 0); + rsFreeEntries[0]--; + EXPECT_EQ(portAllocator.allocate(ports), 1); + rsFreeEntries[1]--; + EXPECT_EQ(portAllocator.allocate(ports), 0); + rsFreeEntries[0]--; + + // Ensure `issued()` logic works as expected + portAllocator.issued(0); + rsFreeEntries[0]++; + EXPECT_EQ(portAllocator.allocate(ports), 0); + rsFreeEntries[0]--; +} + +// Tests correct allocation of multiple INT_MUL instructions +TEST_F(M1PortAllocatorTest, allocationIntMul) { + std::vector ports = {4, 5}; + EXPECT_EQ(portAllocator.allocate(ports), 4); + rsFreeEntries[4]--; + EXPECT_EQ(portAllocator.allocate(ports), 5); + rsFreeEntries[5]--; + EXPECT_EQ(portAllocator.allocate(ports), 4); + rsFreeEntries[4]--; + + // Ensure `issued()` logic works as expected + portAllocator.issued(4); + rsFreeEntries[4]++; + EXPECT_EQ(portAllocator.allocate(ports), 4); + rsFreeEntries[4]--; +} + +// Tests correct allocation of multiple LOAD instructions +TEST_F(M1PortAllocatorTest, allocationLoad) { + std::vector ports = {7, 8, 9}; + EXPECT_EQ(portAllocator.allocate(ports), 7); + rsFreeEntries[7]--; + EXPECT_EQ(portAllocator.allocate(ports), 8); + rsFreeEntries[8]--; + EXPECT_EQ(portAllocator.allocate(ports), 9); + rsFreeEntries[9]--; + EXPECT_EQ(portAllocator.allocate(ports), 7); + rsFreeEntries[7]--; + + // Ensure `issued()` logic works as expected + portAllocator.issued(9); + rsFreeEntries[9]++; + EXPECT_EQ(portAllocator.allocate(ports), 9); + rsFreeEntries[9]--; +} + +// Tests correct allocation of multiple STORE instructions +TEST_F(M1PortAllocatorTest, allocationStore) { + std::vector ports = {6, 7}; + EXPECT_EQ(portAllocator.allocate(ports), 6); + rsFreeEntries[6]--; + EXPECT_EQ(portAllocator.allocate(ports), 7); + rsFreeEntries[7]--; + EXPECT_EQ(portAllocator.allocate(ports), 6); + rsFreeEntries[6]--; + + // Ensure `issued()` logic works as expected + portAllocator.issued(6); + rsFreeEntries[6]++; + EXPECT_EQ(portAllocator.allocate(ports), 6); + rsFreeEntries[6]--; +} + +// Tests correct allocation of multiple FP / VECTOR instructions +TEST_F(M1PortAllocatorTest, allocationFpVec) { + std::vector ports = {10, 11, 12, 13}; + EXPECT_EQ(portAllocator.allocate(ports), 10); + rsFreeEntries[10]--; + EXPECT_EQ(portAllocator.allocate(ports), 11); + rsFreeEntries[11]--; + EXPECT_EQ(portAllocator.allocate(ports), 12); + rsFreeEntries[12]--; + EXPECT_EQ(portAllocator.allocate(ports), 13); + rsFreeEntries[13]--; + 
EXPECT_EQ(portAllocator.allocate(ports), 10); + rsFreeEntries[10]--; + + // Ensure `issued()` logic works as expected + portAllocator.issued(12); + rsFreeEntries[12]++; + EXPECT_EQ(portAllocator.allocate(ports), 12); + rsFreeEntries[12]--; +} + +} // namespace pipeline +} // namespace simeng \ No newline at end of file diff --git a/test/unit/pipeline/MappedRegisterFileSetTest.cc b/test/unit/pipeline/MappedRegisterFileSetTest.cc new file mode 100644 index 0000000000..fc63657779 --- /dev/null +++ b/test/unit/pipeline/MappedRegisterFileSetTest.cc @@ -0,0 +1,56 @@ +#include "gtest/gtest.h" +#include "simeng/pipeline/MappedRegisterFileSet.hh" + +namespace simeng { +namespace pipeline { + +class MappedRegisterFileSetTest : public ::testing::Test { + public: + MappedRegisterFileSetTest() + : regFileSet(physRegFileStruct), + rat(archRegFileStruct, physRegCounts), + mappedRegFile(regFileSet, rat) {} + + protected: + const std::vector archRegFileStruct = { + {8, 10}, {24, 15}, {256, 31}}; + const std::vector physRegFileStruct = { + {8, 20}, {24, 30}, {256, 62}}; + const std::vector physRegCounts = {20, 30, 62}; + + RegisterFileSet regFileSet; + RegisterAliasTable rat; + + MappedRegisterFileSet mappedRegFile; +}; + +// Ensure that with continually changing physical-architectural register mapping +// changes, the correct register is being updated with set(). +TEST_F(MappedRegisterFileSetTest, getSet) { + // Loop through all register types + for (uint8_t i = 0; i < archRegFileStruct.size(); i++) { + // Keep allocating the same register to a) keep past values and b) more + // easily verify correct functionality + const uint16_t maxRegTag = archRegFileStruct[i].quantity - 1; + const uint16_t regSize = archRegFileStruct[i].bytes; + const Register rMax = {i, maxRegTag}; + + std::vector physRegs; + for (int j = 2; j < 12; j++) { + physRegs.push_back(rat.allocate(rMax)); + RegisterValue regVal = RegisterValue(j, regSize); + mappedRegFile.set(rMax, regVal); + EXPECT_EQ(mappedRegFile.get(rMax), regVal); + } + + for (int k = 0; k < 10; k++) { + // RAT constructed where Arch-Phys mapping is 1:1. 
So, first re-mapped + // value will be to maxArchRegTag + 1 + EXPECT_EQ(physRegs[k].tag, maxRegTag + k + 1); + EXPECT_EQ(physRegs[k].type, i); + EXPECT_EQ(regFileSet.get(physRegs[k]), RegisterValue(k + 2, regSize)); + } + } +} +} // namespace pipeline +} // namespace simeng \ No newline at end of file diff --git a/test/unit/pipeline/RegisterAliasTableTest.cc b/test/unit/pipeline/RegisterAliasTableTest.cc index 99b3daf059..6b6f1d9985 100644 --- a/test/unit/pipeline/RegisterAliasTableTest.cc +++ b/test/unit/pipeline/RegisterAliasTableTest.cc @@ -62,11 +62,15 @@ TEST_F(RegisterAliasTableTest, AllocateIndependent) { auto multiRAT = RegisterAliasTable({{8, architecturalCount}, {8, architecturalCount}}, {physicalCount, physicalCount}); + auto initialFreeRegisters0 = multiRAT.freeRegistersAvailable(0); auto initialFreeRegisters1 = multiRAT.freeRegistersAvailable(1); multiRAT.allocate(reg); - // Check that the same number of physical registers are still available + // Check that 1 fewer physical register is now available for regFile 0 + EXPECT_EQ(multiRAT.freeRegistersAvailable(0), initialFreeRegisters0 - 1); + // Check that the same number of physical registers are still available for + // regFile 1 EXPECT_EQ(multiRAT.freeRegistersAvailable(1), initialFreeRegisters1); } diff --git a/test/unit/pipeline/RenameUnitTest.cc b/test/unit/pipeline/RenameUnitTest.cc new file mode 100644 index 0000000000..3f3013adf6 --- /dev/null +++ b/test/unit/pipeline/RenameUnitTest.cc @@ -0,0 +1,461 @@ +#include "../MockBranchPredictor.hh" +#include "../MockInstruction.hh" +#include "../MockMemoryInterface.hh" +#include "gtest/gtest.h" +#include "simeng/pipeline/RenameUnit.hh" + +namespace simeng { + +namespace pipeline { + +using ::testing::_; +using ::testing::Return; + +class RenameUnitTest : public testing::Test { + public: + RenameUnitTest() + : input(1, nullptr), + output(1, nullptr), + rat(archRegFileStruct, physRegCounts), + lsq( + lsqQueueSize, lsqQueueSize, memory, completionSlots, + [](auto registers, auto values) {}, [](auto insn) {}), + rob( + robSize, rat, lsq, [](auto insn) {}, [](auto branchAddr) {}, + predictor, 16, 4), + renameUnit(input, output, rob, rat, lsq, physRegCounts.size()), + uop(new MockInstruction), + uop2(new MockInstruction), + uop3(new MockInstruction), + uopPtr(uop), + uop2Ptr(uop2), + uop3Ptr(uop3) {} + + protected: + // 3rd register type has same arch & physical counts, meaning renaming is not + // permitted.
+ const std::vector archRegFileStruct = { + {8, 10}, {24, 15}, {256, 31}}; + const std::vector physRegFileStruct = { + {8, 20}, {24, 30}, {256, 31}}; + const std::vector physRegCounts = {20, 30, 31}; + + const Register r0 = {0, 0}; + const Register r1 = {1, 2}; + const Register r2 = {2, 4}; + + const uint64_t robSize = 8; + const uint64_t lsqQueueSize = 10; + + PipelineBuffer> input; + PipelineBuffer> output; + + MockMemoryInterface memory; + MockBranchPredictor predictor; + span>> completionSlots; + + RegisterAliasTable rat; + LoadStoreQueue lsq; + ReorderBuffer rob; + + RenameUnit renameUnit; + + MockInstruction* uop; + MockInstruction* uop2; + MockInstruction* uop3; + + std::shared_ptr uopPtr; + std::shared_ptr uop2Ptr; + std::shared_ptr uop3Ptr; +}; + +// Test the correct functionality when input buffer and unit is empty +TEST_F(RenameUnitTest, emptyTick) { + EXPECT_EQ(input.getHeadSlots()[0], nullptr); + EXPECT_EQ(output.getTailSlots()[0], nullptr); + + renameUnit.tick(); + + // Check output buffers and statistics are as expected + EXPECT_EQ(input.getHeadSlots()[0], nullptr); + EXPECT_EQ(output.getTailSlots()[0], nullptr); + EXPECT_EQ(renameUnit.getAllocationStalls(), 0); + EXPECT_EQ(renameUnit.getROBStalls(), 0); + EXPECT_EQ(renameUnit.getLoadQueueStalls(), 0); + EXPECT_EQ(renameUnit.getStoreQueueStalls(), 0); +} + +// Test the normal functionality of an instruction passing through the unit +TEST_F(RenameUnitTest, tick) { + input.getHeadSlots()[0] = uopPtr; + + std::array destRegs = {r0}; + std::array srcRegs = {r0, r1}; + ON_CALL(*uop, getDestinationRegisters()) + .WillByDefault(Return(span(destRegs))); + ON_CALL(*uop, getSourceRegisters()) + .WillByDefault(Return(span(srcRegs))); + ON_CALL(*uop, isOperandReady(_)).WillByDefault(Return(false)); + ON_CALL(*uop, isLoad()).WillByDefault(Return(false)); + ON_CALL(*uop, isStoreAddress()).WillByDefault(Return(false)); + + // Setup expected calls to MockInstruction + EXPECT_CALL(*uop, isLoad()).Times(1); + EXPECT_CALL(*uop, isStoreAddress()).Times(1); + EXPECT_CALL(*uop, getDestinationRegisters()).Times(1); + EXPECT_CALL(*uop, getSourceRegisters()).Times(1); + EXPECT_CALL(*uop, isOperandReady(_)).Times(2); + EXPECT_CALL(*uop, renameSource(_, _)).Times(2); + EXPECT_CALL(*uop, renameDestination(0, _)).Times(1); + renameUnit.tick(); + + // Check output buffers and statistics are as expected + EXPECT_EQ(input.getHeadSlots()[0], nullptr); + EXPECT_EQ(output.getTailSlots()[0].get(), uop); + EXPECT_EQ(renameUnit.getAllocationStalls(), 0); + EXPECT_EQ(renameUnit.getROBStalls(), 0); + EXPECT_EQ(renameUnit.getLoadQueueStalls(), 0); + EXPECT_EQ(renameUnit.getStoreQueueStalls(), 0); + + // Check ROB, LSQ, and RAT mappings have been changed accordingly + EXPECT_EQ(rob.size(), 1); + EXPECT_EQ(rob.getFreeSpace(), robSize - 1); + EXPECT_EQ(lsq.getTotalSpace(), lsqQueueSize * 2); + const Register mappedReg = {0, archRegFileStruct[0].quantity}; + EXPECT_EQ(rat.getMapping(r0), mappedReg); + EXPECT_EQ(rat.getMapping(r1), r1); +} + +// Ensure input buffer is stalled when output buffer is stalled +TEST_F(RenameUnitTest, outputStall) { + output.stall(true); + renameUnit.tick(); + EXPECT_TRUE(input.isStalled()); +} + +// Test that an instruction exception is properly dealt with +TEST_F(RenameUnitTest, uopException) { + input.getHeadSlots()[0] = uopPtr; + uop->setExceptionEncountered(true); + + renameUnit.tick(); + + EXPECT_TRUE(uopPtr->canCommit()); + + EXPECT_EQ(rob.size(), 1); + EXPECT_EQ(rob.getFreeSpace(), robSize - 1); + EXPECT_EQ(lsq.getTotalSpace(), 
lsqQueueSize * 2); + EXPECT_EQ(input.getHeadSlots()[0], nullptr); + EXPECT_EQ(output.getTailSlots()[0], nullptr); + + EXPECT_EQ(renameUnit.getAllocationStalls(), 0); + EXPECT_EQ(renameUnit.getROBStalls(), 0); + EXPECT_EQ(renameUnit.getLoadQueueStalls(), 0); + EXPECT_EQ(renameUnit.getStoreQueueStalls(), 0); +} + +// Test for when no physical registers are available +TEST_F(RenameUnitTest, noFreeRegs) { + // Take up all type-0 physical registers + // All arch regs originally mapped to phys reg, meaning remaing + // regs = physCount - archCount + for (int i = 0; i < physRegCounts[0] - archRegFileStruct[0].quantity; i++) { + rat.allocate(r0); + } + EXPECT_EQ(rat.freeRegistersAvailable(0), 0); + + input.getHeadSlots()[0] = uopPtr; + + std::array destRegs = {r0}; + ON_CALL(*uop, getDestinationRegisters()) + .WillByDefault(Return(span(destRegs))); + ON_CALL(*uop, isOperandReady(_)).WillByDefault(Return(false)); + ON_CALL(*uop, isLoad()).WillByDefault(Return(false)); + ON_CALL(*uop, isStoreAddress()).WillByDefault(Return(false)); + + // Setup expected calls to MockInstruction + EXPECT_CALL(*uop, isLoad()).Times(1); + EXPECT_CALL(*uop, isStoreAddress()).Times(1); + EXPECT_CALL(*uop, getDestinationRegisters()).Times(1); + renameUnit.tick(); + + EXPECT_TRUE(input.isStalled()); + + EXPECT_EQ(rob.size(), 0); + EXPECT_EQ(rob.getFreeSpace(), robSize); + EXPECT_EQ(lsq.getTotalSpace(), lsqQueueSize * 2); + EXPECT_EQ(input.getHeadSlots()[0], uopPtr); + EXPECT_EQ(output.getTailSlots()[0], nullptr); + + EXPECT_EQ(renameUnit.getAllocationStalls(), 1); + EXPECT_EQ(renameUnit.getROBStalls(), 0); + EXPECT_EQ(renameUnit.getLoadQueueStalls(), 0); + EXPECT_EQ(renameUnit.getStoreQueueStalls(), 0); +} + +// Tests that when ROB is full, no renaming occurs +TEST_F(RenameUnitTest, fullROB) { + // Pre-fill ROB + for (uint64_t i = 0; i < robSize; i++) { + rob.reserve(uopPtr); + } + EXPECT_EQ(rob.getFreeSpace(), 0); + + input.getHeadSlots()[0] = uopPtr; + renameUnit.tick(); + + EXPECT_TRUE(input.isStalled()); + + EXPECT_EQ(rob.size(), robSize); + EXPECT_EQ(rob.getFreeSpace(), 0); + EXPECT_EQ(lsq.getTotalSpace(), lsqQueueSize * 2); + EXPECT_EQ(input.getHeadSlots()[0], uopPtr); + EXPECT_EQ(output.getTailSlots()[0], nullptr); + + EXPECT_EQ(renameUnit.getAllocationStalls(), 0); + EXPECT_EQ(renameUnit.getROBStalls(), 1); + EXPECT_EQ(renameUnit.getLoadQueueStalls(), 0); + EXPECT_EQ(renameUnit.getStoreQueueStalls(), 0); +} + +// Test a LOAD instruction is handled correctly +TEST_F(RenameUnitTest, loadUop) { + input.getHeadSlots()[0] = uopPtr; + + std::array destRegs = {r0}; + std::array srcRegs = {r0, r1}; + ON_CALL(*uop, getDestinationRegisters()) + .WillByDefault(Return(span(destRegs))); + ON_CALL(*uop, getSourceRegisters()) + .WillByDefault(Return(span(srcRegs))); + ON_CALL(*uop, isOperandReady(_)).WillByDefault(Return(false)); + ON_CALL(*uop, isLoad()).WillByDefault(Return(true)); + ON_CALL(*uop, isStoreAddress()).WillByDefault(Return(false)); + + // Setup expected calls to MockInstruction + EXPECT_CALL(*uop, isLoad()).Times(1); + EXPECT_CALL(*uop, isStoreAddress()).Times(1); + EXPECT_CALL(*uop, getDestinationRegisters()).Times(1); + EXPECT_CALL(*uop, getSourceRegisters()).Times(1); + EXPECT_CALL(*uop, isOperandReady(_)).Times(2); + EXPECT_CALL(*uop, renameSource(_, _)).Times(2); + EXPECT_CALL(*uop, renameDestination(0, _)).Times(1); + renameUnit.tick(); + + // Check output buffers and statistics are as expected + EXPECT_EQ(input.getHeadSlots()[0], nullptr); + EXPECT_EQ(output.getTailSlots()[0].get(), uop); + 
EXPECT_EQ(renameUnit.getAllocationStalls(), 0); + EXPECT_EQ(renameUnit.getROBStalls(), 0); + EXPECT_EQ(renameUnit.getLoadQueueStalls(), 0); + EXPECT_EQ(renameUnit.getStoreQueueStalls(), 0); + + // Check ROB, LSQ, and RAT mappings have been changed accordingly + EXPECT_EQ(rob.size(), 1); + EXPECT_EQ(rob.getFreeSpace(), robSize - 1); + EXPECT_EQ(lsq.getLoadQueueSpace(), lsqQueueSize - 1); + EXPECT_EQ(lsq.getStoreQueueSpace(), lsqQueueSize); + EXPECT_EQ(lsq.getTotalSpace(), (lsqQueueSize * 2) - 1); + const Register mappedReg = {0, archRegFileStruct[0].quantity}; + EXPECT_EQ(rat.getMapping(r0), mappedReg); + EXPECT_EQ(rat.getMapping(r1), r1); +} + +// Test a LOAD instruction is handled correctly when Load queue is full +TEST_F(RenameUnitTest, loadUopQueueFull) { + // pre-fill Load Queue + for (uint64_t i = 0; i < lsqQueueSize; i++) { + lsq.addLoad(uopPtr); + } + EXPECT_EQ(lsq.getLoadQueueSpace(), 0); + + input.getHeadSlots()[0] = uopPtr; + + std::array destRegs = {r0}; + std::array srcRegs = {r0, r1}; + ON_CALL(*uop, getDestinationRegisters()) + .WillByDefault(Return(span(destRegs))); + ON_CALL(*uop, getSourceRegisters()) + .WillByDefault(Return(span(srcRegs))); + ON_CALL(*uop, isOperandReady(_)).WillByDefault(Return(false)); + ON_CALL(*uop, isLoad()).WillByDefault(Return(true)); + ON_CALL(*uop, isStoreAddress()).WillByDefault(Return(false)); + + // Setup expected calls to MockInstruction + EXPECT_CALL(*uop, isLoad()).Times(1); + EXPECT_CALL(*uop, isStoreAddress()).Times(1); + renameUnit.tick(); + + EXPECT_TRUE(input.isStalled()); + + // Check output buffers and statistics are as expected + EXPECT_EQ(input.getHeadSlots()[0], uopPtr); + EXPECT_EQ(output.getTailSlots()[0], nullptr); + EXPECT_EQ(renameUnit.getAllocationStalls(), 0); + EXPECT_EQ(renameUnit.getROBStalls(), 0); + EXPECT_EQ(renameUnit.getLoadQueueStalls(), 1); + EXPECT_EQ(renameUnit.getStoreQueueStalls(), 0); + + // Check ROB, LSQ, and RAT mappings have been changed accordingly + EXPECT_EQ(rob.size(), 0); + EXPECT_EQ(rob.getFreeSpace(), robSize); + EXPECT_EQ(lsq.getLoadQueueSpace(), 0); + EXPECT_EQ(lsq.getStoreQueueSpace(), lsqQueueSize); + EXPECT_EQ(lsq.getTotalSpace(), lsqQueueSize); +} + +// Test a STORE instruction is handled correctly +TEST_F(RenameUnitTest, storeUop) { + input.getHeadSlots()[0] = uopPtr; + + std::array destRegs = {r0}; + std::array srcRegs = {r0, r1}; + ON_CALL(*uop, getDestinationRegisters()) + .WillByDefault(Return(span(destRegs))); + ON_CALL(*uop, getSourceRegisters()) + .WillByDefault(Return(span(srcRegs))); + ON_CALL(*uop, isOperandReady(_)).WillByDefault(Return(false)); + ON_CALL(*uop, isLoad()).WillByDefault(Return(false)); + ON_CALL(*uop, isStoreAddress()).WillByDefault(Return(true)); + + // Setup expected calls to MockInstruction + EXPECT_CALL(*uop, isLoad()).Times(1); + EXPECT_CALL(*uop, isStoreAddress()).Times(1); + EXPECT_CALL(*uop, getDestinationRegisters()).Times(1); + EXPECT_CALL(*uop, getSourceRegisters()).Times(1); + EXPECT_CALL(*uop, isOperandReady(_)).Times(2); + EXPECT_CALL(*uop, renameSource(_, _)).Times(2); + EXPECT_CALL(*uop, renameDestination(0, _)).Times(1); + renameUnit.tick(); + + // Check output buffers and statistics are as expected + EXPECT_EQ(input.getHeadSlots()[0], nullptr); + EXPECT_EQ(output.getTailSlots()[0].get(), uop); + EXPECT_EQ(renameUnit.getAllocationStalls(), 0); + EXPECT_EQ(renameUnit.getROBStalls(), 0); + EXPECT_EQ(renameUnit.getLoadQueueStalls(), 0); + EXPECT_EQ(renameUnit.getStoreQueueStalls(), 0); + + // Check ROB, LSQ, and RAT mappings have been changed 
accordingly + EXPECT_EQ(rob.size(), 1); + EXPECT_EQ(rob.getFreeSpace(), robSize - 1); + EXPECT_EQ(lsq.getLoadQueueSpace(), lsqQueueSize); + EXPECT_EQ(lsq.getStoreQueueSpace(), lsqQueueSize - 1); + EXPECT_EQ(lsq.getTotalSpace(), (lsqQueueSize * 2) - 1); + const Register mappedReg = {0, archRegFileStruct[0].quantity}; + EXPECT_EQ(rat.getMapping(r0), mappedReg); + EXPECT_EQ(rat.getMapping(r1), r1); +} + +// Test a STORE instruction is handled correctly when Store queue is full +TEST_F(RenameUnitTest, storeUopQueueFull) { + // pre-fill Store Queue + for (uint64_t i = 0; i < lsqQueueSize; i++) { + lsq.addStore(uopPtr); + } + EXPECT_EQ(lsq.getStoreQueueSpace(), 0); + + input.getHeadSlots()[0] = uopPtr; + + std::array destRegs = {r0}; + std::array srcRegs = {r0, r1}; + ON_CALL(*uop, getDestinationRegisters()) + .WillByDefault(Return(span(destRegs))); + ON_CALL(*uop, getSourceRegisters()) + .WillByDefault(Return(span(srcRegs))); + ON_CALL(*uop, isOperandReady(_)).WillByDefault(Return(false)); + ON_CALL(*uop, isLoad()).WillByDefault(Return(false)); + ON_CALL(*uop, isStoreAddress()).WillByDefault(Return(true)); + + // Setup expected calls to MockInstruction + EXPECT_CALL(*uop, isLoad()).Times(1); + EXPECT_CALL(*uop, isStoreAddress()).Times(1); + renameUnit.tick(); + + EXPECT_TRUE(input.isStalled()); + + // Check output buffers and statistics are as expected + EXPECT_EQ(input.getHeadSlots()[0], uopPtr); + EXPECT_EQ(output.getTailSlots()[0], nullptr); + EXPECT_EQ(renameUnit.getAllocationStalls(), 0); + EXPECT_EQ(renameUnit.getROBStalls(), 0); + EXPECT_EQ(renameUnit.getLoadQueueStalls(), 0); + EXPECT_EQ(renameUnit.getStoreQueueStalls(), 1); + + // Check ROB, LSQ, and RAT mappings have been changed accordingly + EXPECT_EQ(rob.size(), 0); + EXPECT_EQ(rob.getFreeSpace(), robSize); + EXPECT_EQ(lsq.getLoadQueueSpace(), lsqQueueSize); + EXPECT_EQ(lsq.getStoreQueueSpace(), 0); + EXPECT_EQ(lsq.getTotalSpace(), lsqQueueSize); +} + +// Test to ensure serialized destinations work correctly +TEST_F(RenameUnitTest, serializedDest) { + // A serialized uop can only proceed when the ROB is empty. Pre-add an + // instruction to ensure the uop stalls correctly in renameUnit + rob.reserve(uop2Ptr); + EXPECT_EQ(rob.size(), 1); + + // A serialized uop is caused when the destination register cannot be renamed + // - i.e.
the number of archRegs is the same as physRegs + input.getHeadSlots()[0] = uopPtr; + std::array destRegs = {r2}; + std::array srcRegs = {r0, r1}; + ON_CALL(*uop, getDestinationRegisters()) + .WillByDefault(Return(span(destRegs))); + ON_CALL(*uop, getSourceRegisters()) + .WillByDefault(Return(span(srcRegs))); + ON_CALL(*uop, isOperandReady(_)).WillByDefault(Return(false)); + ON_CALL(*uop, isLoad()).WillByDefault(Return(false)); + ON_CALL(*uop, isStoreAddress()).WillByDefault(Return(false)); + + // On first tick, input should stall and uop should not proceed through + // renameUnit + EXPECT_CALL(*uop, isLoad()).Times(1); + EXPECT_CALL(*uop, isStoreAddress()).Times(1); + EXPECT_CALL(*uop, getDestinationRegisters()).Times(1); + renameUnit.tick(); + + EXPECT_TRUE(input.isStalled()); + EXPECT_EQ(input.getHeadSlots()[0], uopPtr); + EXPECT_EQ(output.getTailSlots()[0], nullptr); + + // Empty item in ROB + EXPECT_EQ(rob.size(), 1); + uop2Ptr->setCommitReady(); + EXPECT_CALL(*uop2, getDestinationRegisters()).Times(1); + EXPECT_CALL(*uop2, isLoad()).WillOnce(Return(false)); + EXPECT_CALL(*uop2, isStoreAddress()).WillOnce(Return(false)); + EXPECT_CALL(*uop2, isBranch()).Times(2).WillRepeatedly(Return(false)); + rob.commit(1); + EXPECT_EQ(rob.size(), 0); + + // Try tick again + EXPECT_CALL(*uop, isLoad()).Times(1); + EXPECT_CALL(*uop, isStoreAddress()).Times(1); + EXPECT_CALL(*uop, getDestinationRegisters()).Times(1); + EXPECT_CALL(*uop, getSourceRegisters()).Times(1); + EXPECT_CALL(*uop, isOperandReady(_)).Times(2); + EXPECT_CALL(*uop, renameSource(_, _)).Times(2); + renameUnit.tick(); + + // Check output buffers and statistics are as expected + EXPECT_EQ(input.getHeadSlots()[0], nullptr); + EXPECT_EQ(output.getTailSlots()[0].get(), uop); + EXPECT_EQ(renameUnit.getAllocationStalls(), 0); + EXPECT_EQ(renameUnit.getROBStalls(), 0); + EXPECT_EQ(renameUnit.getLoadQueueStalls(), 0); + EXPECT_EQ(renameUnit.getStoreQueueStalls(), 0); + + // Check ROB, LSQ, and RAT mappings have been changed accordingly + EXPECT_EQ(rob.size(), 1); + EXPECT_EQ(rob.getFreeSpace(), robSize - 1); + EXPECT_EQ(lsq.getTotalSpace(), lsqQueueSize * 2); + EXPECT_EQ(rat.getMapping(r0), r0); + EXPECT_EQ(rat.getMapping(r1), r1); + EXPECT_EQ(rat.getMapping(r2), r2); +} + +} // namespace pipeline +} // namespace simeng \ No newline at end of file diff --git a/test/unit/pipeline/ReorderBufferTest.cc b/test/unit/pipeline/ReorderBufferTest.cc index 66b8f93dc4..ff3b63756d 100644 --- a/test/unit/pipeline/ReorderBufferTest.cc +++ b/test/unit/pipeline/ReorderBufferTest.cc @@ -1,11 +1,9 @@ #include "../MockBranchPredictor.hh" #include "../MockInstruction.hh" #include "../MockMemoryInterface.hh" -#include "gmock/gmock.h" #include "gtest/gtest.h" #include "simeng/Instruction.hh" #include "simeng/pipeline/LoadStoreQueue.hh" -#include "simeng/pipeline/RegisterAliasTable.hh" #include "simeng/pipeline/ReorderBuffer.hh" using ::testing::_; @@ -26,16 +24,20 @@ class ReorderBufferTest : public testing::Test { ReorderBufferTest() : memory{}, rat({{8, 32}}, {64}), - lsq(maxLSQLoads, maxLSQStores, dataMemory, {nullptr, 0}, - [](auto registers, auto values) {}), + lsq( + maxLSQLoads, maxLSQStores, dataMemory, {nullptr, 0}, + [](auto registers, auto values) {}, [](auto uop) {}), uop(new MockInstruction), uop2(new MockInstruction), + uop3(new MockInstruction), uopPtr(uop), uopPtr2(uop2), + uopPtr3(uop3), reorderBuffer( maxROBSize, rat, lsq, [this](auto insn) { exceptionHandler.raiseException(insn); }, - [](auto branchAddress) {}, predictor, 0, 0) {} + [this](auto 
branchAddress) { loopBoundaryAddr = branchAddress; }, + predictor, 4, 2) {} protected: const uint8_t maxLSQLoads = 32; @@ -51,13 +53,17 @@ class ReorderBufferTest : public testing::Test { MockInstruction* uop; MockInstruction* uop2; + MockInstruction* uop3; std::shared_ptr uopPtr; - std::shared_ptr uopPtr2; + std::shared_ptr uopPtr2; + std::shared_ptr uopPtr3; MockMemoryInterface dataMemory; ReorderBuffer reorderBuffer; + + uint64_t loopBoundaryAddr = 0; }; // Tests that an instruction can have a slot reserved in the ROB and be @@ -99,6 +105,7 @@ TEST_F(ReorderBufferTest, Commit) { EXPECT_EQ(committed, 1); EXPECT_EQ(reorderBuffer.size(), 0); + EXPECT_EQ(reorderBuffer.getInstructionsCommittedCount(), 1); } // Tests that the reorder buffer won't commit an instruction if it's not ready @@ -109,6 +116,7 @@ TEST_F(ReorderBufferTest, CommitNotReady) { EXPECT_EQ(committed, 0); EXPECT_EQ(reorderBuffer.size(), 1); + EXPECT_EQ(reorderBuffer.getInstructionsCommittedCount(), 0); } // Tests that the reorder buffer won't commit a ready instruction if it's not at @@ -123,6 +131,7 @@ TEST_F(ReorderBufferTest, CommitHeadNotReady) { EXPECT_EQ(committed, 0); EXPECT_EQ(reorderBuffer.size(), 2); + EXPECT_EQ(reorderBuffer.getInstructionsCommittedCount(), 0); } // Tests that the reorder buffer can commit multiple ready instructions @@ -137,6 +146,7 @@ TEST_F(ReorderBufferTest, CommitMultiple) { EXPECT_EQ(committed, 2); EXPECT_EQ(reorderBuffer.size(), 0); + EXPECT_EQ(reorderBuffer.getInstructionsCommittedCount(), 2); } // Tests that the reorder buffer correctly informs the LSQ when committing a @@ -152,13 +162,14 @@ TEST_F(ReorderBufferTest, CommitLoad) { // Check that the load was removed from the LSQ EXPECT_EQ(lsq.getLoadQueueSpace(), maxLSQLoads); + EXPECT_EQ(reorderBuffer.getInstructionsCommittedCount(), 1); } // Tests that the reorder buffer correctly triggers a store upon commit TEST_F(ReorderBufferTest, CommitStore) { - std::vector addresses = {{0, 1}}; - span addressesSpan = {addresses.data(), - addresses.size()}; + std::vector addresses = {{0, 1}}; + span addressesSpan = {addresses.data(), + addresses.size()}; std::vector data = {static_cast(1)}; span dataSpan = {data.data(), data.size()}; @@ -189,6 +200,7 @@ TEST_F(ReorderBufferTest, CommitStore) { // Check that the store was committed and removed from the LSQ EXPECT_EQ(lsq.getStoreQueueSpace(), maxLSQStores); + EXPECT_EQ(reorderBuffer.getInstructionsCommittedCount(), 1); // Tick lsq to complete store lsq.tick(); @@ -220,6 +232,259 @@ TEST_F(ReorderBufferTest, Exception) { auto committed = reorderBuffer.commit(1); EXPECT_EQ(committed, 1); + EXPECT_EQ(reorderBuffer.getInstructionsCommittedCount(), 1); +} + +// Test the reorder buffer correctly sets a macro-op to commitReady when all of +// its associated micro-ops have been +TEST_F(ReorderBufferTest, commitMicroOps) { + // Reserve all microOps + uop->setIsMicroOp(true); + uop->setIsLastMicroOp(false); + uop2->setIsMicroOp(true); + uop2->setIsLastMicroOp(false); + uop3->setIsMicroOp(true); + uop3->setIsLastMicroOp(true); + reorderBuffer.reserve(uopPtr); + reorderBuffer.reserve(uopPtr2); + reorderBuffer.reserve(uopPtr3); + EXPECT_EQ(reorderBuffer.size(), 3); + + EXPECT_EQ(uopPtr->getInstructionId(), 0); + EXPECT_EQ(uopPtr2->getInstructionId(), 0); + EXPECT_EQ(uopPtr3->getInstructionId(), 0); + + // No micro-ops are waiting commit. 
Make sure they're not commit ready after + // call to `commitMicroOps` + reorderBuffer.commitMicroOps(0); + EXPECT_FALSE(uopPtr->canCommit()); + EXPECT_FALSE(uopPtr2->canCommit()); + EXPECT_FALSE(uopPtr3->canCommit()); + + // Set middle instruction as waitingCommit - ensure still not set commit ready + uop->setWaitingCommit(); + reorderBuffer.commitMicroOps(0); + EXPECT_FALSE(uopPtr->canCommit()); + EXPECT_FALSE(uopPtr2->canCommit()); + EXPECT_FALSE(uopPtr3->canCommit()); + + // Set last instruction as waitingCommit - ensure still not set commit ready + uop3->setWaitingCommit(); + reorderBuffer.commitMicroOps(0); + EXPECT_FALSE(uopPtr->canCommit()); + EXPECT_FALSE(uopPtr2->canCommit()); + EXPECT_FALSE(uopPtr3->canCommit()); + + // Set first instruction as waitingCommit - ensure still they are set commit + // ready now all micro-ops are done + uop2->setWaitingCommit(); + reorderBuffer.commitMicroOps(0); + EXPECT_TRUE(uopPtr->canCommit()); + EXPECT_TRUE(uopPtr2->canCommit()); + EXPECT_TRUE(uopPtr3->canCommit()); + + // Now call commit in ROB and make sure micro-ops are committed properly + unsigned int committed = reorderBuffer.commit(3); + EXPECT_EQ(committed, 3); + EXPECT_EQ(reorderBuffer.getInstructionsCommittedCount(), 1); + EXPECT_EQ(reorderBuffer.size(), 0); +} + +// Test that a detected violating load in the lsq leads to a flush +TEST_F(ReorderBufferTest, violatingLoad) { + const uint64_t strAddr = 16; + const uint64_t strSize = 4; + const uint64_t ldAddr = 18; + const uint64_t ldSize = 4; + + // Init Store + const memory::MemoryAccessTarget strTarget = {strAddr, strSize}; + span strTargetSpan = {&strTarget, 1}; + ON_CALL(*uop, getGeneratedAddresses()).WillByDefault(Return(strTargetSpan)); + ON_CALL(*uop, isStoreAddress()).WillByDefault(Return(true)); + ON_CALL(*uop, isStoreData()).WillByDefault(Return(true)); + uopPtr->setSequenceId(0); + uopPtr->setInstructionId(0); + lsq.addStore(uopPtr); + reorderBuffer.reserve(uopPtr); + // Init load + const memory::MemoryAccessTarget ldTarget = {ldAddr, ldSize}; + span ldTargetSpan = {&ldTarget, 1}; + ON_CALL(*uop2, getGeneratedAddresses()).WillByDefault(Return(ldTargetSpan)); + ON_CALL(*uop2, isLoad()).WillByDefault(Return(true)); + uopPtr2->setSequenceId(1); + uopPtr2->setInstructionId(1); + uopPtr2->setInstructionAddress(4096); + lsq.addLoad(uopPtr2); + reorderBuffer.reserve(uopPtr2); + + EXPECT_EQ(reorderBuffer.size(), 2); + + // Start load "Out of order" + EXPECT_CALL(*uop2, getGeneratedAddresses()).Times(1); + EXPECT_CALL(*uop, getGeneratedAddresses()).Times(1); + lsq.startLoad(uopPtr2); + + // Set store "ready to commit" so that violation gets detected + uopPtr->setCommitReady(); + // Supply Store's data + RegisterValue strData = RegisterValue(0xABCD, strSize); + span strDataSpan = {&strData, 1}; + ON_CALL(*uop, getData()).WillByDefault(Return(strDataSpan)); + EXPECT_CALL(*uop, getData()).Times(1); + lsq.supplyStoreData(uopPtr); + + EXPECT_CALL(*uop, isStoreAddress()).WillOnce(Return(true)); + EXPECT_CALL(*uop, getGeneratedAddresses()).Times(1); // in LSQ + EXPECT_CALL(dataMemory, requestWrite(strTarget, strData)); // in LSQ + EXPECT_CALL(*uop2, getGeneratedAddresses()).Times(1); // in LSQ + unsigned int committed = reorderBuffer.commit(4); + + EXPECT_EQ(committed, 1); + EXPECT_EQ(reorderBuffer.size(), 1); + EXPECT_TRUE(reorderBuffer.shouldFlush()); + EXPECT_EQ(reorderBuffer.getViolatingLoadsCount(), 1); + EXPECT_EQ(lsq.getViolatingLoad(), uopPtr2); + EXPECT_EQ(reorderBuffer.getFlushAddress(), 4096); + 
EXPECT_EQ(reorderBuffer.getFlushInsnId(), 0); +} + +// Test that a branch is treated as expected, and will trigger the loop buffer when +// seen enough times (loop detection threshold set to 2) +TEST_F(ReorderBufferTest, branch) { + // Set up branch instruction + const uint64_t insnAddr = 4096; + const uint64_t branchAddr = 1024; + BranchPrediction pred = {true, branchAddr}; + ON_CALL(*uop, isBranch()).WillByDefault(Return(true)); + uopPtr->setSequenceId(0); + uopPtr->setInstructionId(0); + uopPtr->setInstructionAddress(insnAddr); + uopPtr->setBranchPrediction(pred); + uop->setExecuted(true); + uopPtr->setCommitReady(); + + // First pass through ROB -- seen count reset to 0 as new branch + reorderBuffer.reserve(uopPtr); + EXPECT_CALL(*uop, isBranch()).Times(2); + EXPECT_CALL(predictor, + update(4096, uop->wasBranchTaken(), uop->getBranchAddress(), + uop->getBranchType(), uop->getInstructionId())); + reorderBuffer.commit(1); + EXPECT_NE(loopBoundaryAddr, insnAddr); + + // Second pass through ROB -- seen count = 1 + reorderBuffer.reserve(uopPtr); + EXPECT_CALL(*uop, isBranch()).Times(2); + EXPECT_CALL(predictor, + update(4096, uop->wasBranchTaken(), uop->getBranchAddress(), + uop->getBranchType(), uop->getInstructionId())); + reorderBuffer.commit(1); + EXPECT_NE(loopBoundaryAddr, insnAddr); + + // Third pass through ROB -- seen count = 2 + reorderBuffer.reserve(uopPtr); + EXPECT_CALL(*uop, isBranch()).Times(2); + EXPECT_CALL(predictor, + update(4096, uop->wasBranchTaken(), uop->getBranchAddress(), + uop->getBranchType(), uop->getInstructionId())); + reorderBuffer.commit(1); + EXPECT_NE(loopBoundaryAddr, insnAddr); + + // Fourth pass through ROB -- seen count = 3; exceeds detection threshold, + // loopBoundaryAddr updated + reorderBuffer.reserve(uopPtr); + EXPECT_CALL(*uop, isBranch()).Times(2); + EXPECT_CALL(predictor, + update(4096, uop->wasBranchTaken(), uop->getBranchAddress(), + uop->getBranchType(), uop->getInstructionId())); + reorderBuffer.commit(1); + EXPECT_EQ(loopBoundaryAddr, insnAddr); + + // Update prediction & reset loopBoundaryAddr.
Flush ROB to reset loopDetected + pred = {false, branchAddr + 64}; + uopPtr->setBranchPrediction(pred); + loopBoundaryAddr = 0; + reorderBuffer.flush(0); + + // Re-do loop detection + // First pass through ROB -- seen count reset to 0 as new branch + reorderBuffer.reserve(uopPtr); + EXPECT_CALL(*uop, isBranch()).Times(2); + EXPECT_CALL(predictor, + update(4096, uop->wasBranchTaken(), uop->getBranchAddress(), + uop->getBranchType(), uop->getInstructionId())); + reorderBuffer.commit(1); + EXPECT_NE(loopBoundaryAddr, insnAddr); + + // Second pass through ROB -- seen count = 1 + reorderBuffer.reserve(uopPtr); + EXPECT_CALL(*uop, isBranch()).Times(2); + EXPECT_CALL(predictor, + update(4096, uop->wasBranchTaken(), uop->getBranchAddress(), + uop->getBranchType(), uop->getInstructionId())); + reorderBuffer.commit(1); + EXPECT_NE(loopBoundaryAddr, insnAddr); + + // Third pass through ROB -- seen count = 2 + reorderBuffer.reserve(uopPtr); + EXPECT_CALL(*uop, isBranch()).Times(2); + EXPECT_CALL(predictor, + update(4096, uop->wasBranchTaken(), uop->getBranchAddress(), + uop->getBranchType(), uop->getInstructionId())); + reorderBuffer.commit(1); + EXPECT_NE(loopBoundaryAddr, insnAddr); + + // Fourth pass through ROB -- seen count = 3; exceeds detection threshold, + // loopBoundaryAddr updated + reorderBuffer.reserve(uopPtr); + EXPECT_CALL(*uop, isBranch()).Times(2); + EXPECT_CALL(predictor, + update(4096, uop->wasBranchTaken(), uop->getBranchAddress(), + uop->getBranchType(), uop->getInstructionId())); + reorderBuffer.commit(1); + EXPECT_EQ(loopBoundaryAddr, insnAddr); + + // Check that branch misprediction metrics have been correctly collected + EXPECT_EQ(reorderBuffer.getBranchMispredictedCount(), 8); +} + +// Tests that only those destination registers which have been renamed are +// rewound upon a ROB flush +TEST_F(ReorderBufferTest, registerRewind) { + uop->setInstructionId(0); + uop->setSequenceId(0); + uop2->setInstructionId(1); + uop2->setSequenceId(1); + + // Reserve entries in ROB + reorderBuffer.reserve(uopPtr); + reorderBuffer.reserve(uopPtr2); + + // Rename one of the destination registers + Register archReg = {0, 1, 0}; + Register renamedReg = rat.allocate({0, 1}); + EXPECT_EQ(renamedReg.tag, 32); + + // Set destination registers for the to-be-flushed uop2 with the second register + // not being renamed + std::vector destinations = {renamedReg, {0, 2, 0}}; + const span destinationSpan = { + const_cast(destinations.data()), 2}; + EXPECT_CALL(*uop2, getDestinationRegisters()) + .Times(1) + .WillRepeatedly(Return(destinationSpan)); + + // Check that mappings in RAT are correct + EXPECT_EQ(rat.getMapping(archReg).tag, 32); + EXPECT_EQ(rat.getMapping(destinations[1]).tag, 2); + + // Flush ROB + reorderBuffer.flush(0); + + // Check rewind occurred on only the first destination register + EXPECT_EQ(rat.getMapping(archReg).tag, 1); + EXPECT_EQ(rat.getMapping(destinations[1]).tag, 2); } } // namespace pipeline diff --git a/test/unit/riscv/ArchInfoTest.cc b/test/unit/riscv/ArchInfoTest.cc new file mode 100644 index 0000000000..a086394633 --- /dev/null +++ b/test/unit/riscv/ArchInfoTest.cc @@ -0,0 +1,62 @@ +#include "gtest/gtest.h" +#include "simeng/arch/riscv/ArchInfo.hh" +#include "simeng/config/SimInfo.hh" +#include "simeng/version.hh" + +namespace simeng { +namespace arch { +namespace riscv { + +class RiscVArchInfoTest : public ::testing::Test { + public: + RiscVArchInfoTest() { + simeng::config::SimInfo::setConfig(SIMENG_SOURCE_DIR + "/configs/DEMO_RISCV.yaml"); + } + + protected: + const
std::vector sysRegisterEnums = { + simeng::arch::riscv::riscv_sysreg::RISCV_SYSREG_FFLAGS, + simeng::arch::riscv::riscv_sysreg::RISCV_SYSREG_FRM, + simeng::arch::riscv::riscv_sysreg::RISCV_SYSREG_FCSR, + simeng::arch::riscv::riscv_sysreg::RISCV_SYSREG_CYCLE, + simeng::arch::riscv::riscv_sysreg::RISCV_SYSREG_TIME, + simeng::arch::riscv::riscv_sysreg::RISCV_SYSREG_INSTRET}; + + const std::vector archRegStruct = { + {8, 32}, {8, 32}, {8, static_cast(sysRegisterEnums.size())}}; + + const std::vector physRegStruct = { + {8, 154}, {8, 90}, {8, static_cast(sysRegisterEnums.size())}}; + + const std::vector physRegQuants = { + 154, 90, static_cast(sysRegisterEnums.size())}; +}; + +// Test for the getSysRegEnums() function +TEST_F(RiscVArchInfoTest, getSysRegEnums) { + ArchInfo info = ArchInfo(config::SimInfo::getConfig()); + EXPECT_EQ(info.getSysRegEnums(), sysRegisterEnums); +} + +// Test for the getArchRegStruct() function +TEST_F(RiscVArchInfoTest, getArchRegStruct) { + ArchInfo info = ArchInfo(config::SimInfo::getConfig()); + EXPECT_EQ(info.getArchRegStruct(), archRegStruct); +} + +// Test for the getPhysRegStruct() function +TEST_F(RiscVArchInfoTest, getPhysRegStruct) { + ArchInfo info = ArchInfo(config::SimInfo::getConfig()); + EXPECT_EQ(info.getPhysRegStruct(), physRegStruct); +} + +// Test for the getPhysRegQuantities() function +TEST_F(RiscVArchInfoTest, getPhysRegQuantities) { + ArchInfo info = ArchInfo(config::SimInfo::getConfig()); + EXPECT_EQ(info.getPhysRegQuantities(), physRegQuants); +} + +} // namespace riscv +} // namespace arch +} // namespace simeng \ No newline at end of file diff --git a/test/unit/riscv/ArchitectureTest.cc b/test/unit/riscv/ArchitectureTest.cc new file mode 100644 index 0000000000..49e64e42b0 --- /dev/null +++ b/test/unit/riscv/ArchitectureTest.cc @@ -0,0 +1,163 @@ +#include + +#include "../ConfigInit.hh" +#include "gtest/gtest.h" +#include "simeng/CoreInstance.hh" +#include "simeng/RegisterFileSet.hh" +#include "simeng/arch/aarch64/Architecture.hh" +#include "simeng/arch/riscv/Architecture.hh" +#include "simeng/span.hh" +#include "simeng/version.hh" + +namespace simeng { +namespace arch { +namespace riscv { + +// RISC-V Tests +class RiscVArchitectureTest : public testing::Test { + public: + RiscVArchitectureTest() + : kernel(config::SimInfo::getConfig()["CPU-Info"]["Special-File-Dir-Path"] + .as()) { + arch = std::make_unique(kernel); + kernel.createProcess(process); + } + + protected: + // Setting core model to complex OoO model to more verbosely test the + // Architecture class. 
+ ConfigInit configInit = ConfigInit(config::ISA::RV64, R"YAML({ + Core: { + Simulation-Mode: outoforder + }, + Ports: { + '0': {Portname: Port 0, Instruction-Group-Support: [INT_SIMPLE, INT_MUL, FLOAT]}, + '1': {Portname: Port 1, Instruction-Group-Support: [INT, FLOAT]}, + '2': {Portname: Port 2, Instruction-Group-Support: [INT_SIMPLE, INT_MUL, BRANCH]}, + '3': {Portname: Port 4, Instruction-Group-Support: [LOAD]}, + '4': {Portname: Port 5, Instruction-Group-Support: [LOAD]}, + '5': {Portname: Port 3, Instruction-Group-Support: [STORE]} + }, + Reservation-Stations: { + '0': {Size: 60, Dispatch-Rate: 4, Ports: [Port 0, Port 1, Port 2, Port 4, Port 5, Port 3]} + }, + Execution-Units: { + '0': {Pipelined: True}, + '1': {Pipelined: True}, + '2': {Pipelined: True}, + '3': {Pipelined: True}, + '4': {Pipelined: True}, + '5': {Pipelined: True} + }, + Latencies: { + '0': {Instruction-Groups: [INT_SIMPLE_ARTH, INT_SIMPLE_LOGICAL], Execution-Latency: 1, Execution-Throughput: 1}, + '1': {Instruction-Groups: [INT_MUL], Execution-Latency: 5, Execution-Throughput: 1}, + '2': {Instruction-Groups: [INT_DIV_OR_SQRT], Execution-Latency: 39, Execution-Throughput: 39}, + '3': {Instruction-Groups: [FLOAT_SIMPLE_CMP], Execution-Latency: 5, Execution-Throughput: 1}, + '4': {Instruction-Groups: [FLOAT_MUL], Execution-Latency: 6, Execution-Throughput: 1}, + '5': {Instruction-Groups: [FLOAT_SIMPLE_CVT], Execution-Latency: 7, Execution-Throughput: 1}, + '6': {Instruction-Groups: [FLOAT_DIV_OR_SQRT], Execution-Latency: 16, Execution-Throughput: 16} + } + })YAML"); + + // addi sp, ra, 2000 + const std::array validInstrBytes = {0x13, 0x81, 0x00, 0x7d}; + const std::array invalidInstrBytes = {0x7f, 0x00, 0x81, 0xbb}; + + std::unique_ptr arch; + kernel::Linux kernel; + kernel::LinuxProcess process = kernel::LinuxProcess( + span(validInstrBytes.data(), validInstrBytes.size())); +}; + +TEST_F(RiscVArchitectureTest, predecode) { + // Test that mis-aligned instruction address results in error + MacroOp output; + uint8_t result = arch->predecode(validInstrBytes.data(), + validInstrBytes.size(), 0x7, output); + EXPECT_EQ(result, 1); + EXPECT_EQ(output[0]->getInstructionAddress(), 0x7); + EXPECT_EQ(output[0]->exceptionEncountered(), true); + + // Test that an invalid instruction returns instruction with an exception + output = MacroOp(); + result = arch->predecode(invalidInstrBytes.data(), invalidInstrBytes.size(), + 0x8, output); + EXPECT_EQ(result, 4); + EXPECT_EQ(output[0]->getInstructionAddress(), 0x8); + EXPECT_EQ(output[0]->exceptionEncountered(), true); + + // Test that an instruction can be properly decoded + output = MacroOp(); + result = arch->predecode(validInstrBytes.data(), validInstrBytes.size(), 0x4, + output); + EXPECT_EQ(result, 4); + EXPECT_EQ(output[0]->getInstructionAddress(), 0x4); + EXPECT_EQ(output[0]->exceptionEncountered(), false); +} + +TEST_F(RiscVArchitectureTest, getSystemRegisterTag) { + // Test incorrect system register will fail + int32_t output = arch->getSystemRegisterTag(-1); + EXPECT_EQ(output, -1); + + // Test for correct behaviour + output = arch->getSystemRegisterTag(RISCV_SYSREG_FFLAGS); + EXPECT_EQ(output, 0); +} + +TEST_F(RiscVArchitectureTest, handleException) { + // Get Instruction + MacroOp insn; + uint8_t bytes = arch->predecode(invalidInstrBytes.data(), + invalidInstrBytes.size(), 0x4, insn); + EXPECT_EQ(bytes, 4); + EXPECT_EQ(insn[0]->getInstructionAddress(), 0x4); + EXPECT_EQ(insn[0]->exceptionEncountered(), true); + + // Get Core + std::string executablePath = SIMENG_SOURCE_DIR 
"/SimEngDefaultProgram"; + std::vector executableArgs = {}; + std::unique_ptr coreInstance = + std::make_unique(executablePath, executableArgs); + const Core& core = *coreInstance->getCore(); + memory::MemoryInterface& memInt = *coreInstance->getDataMemory(); + auto exceptionHandler = arch->handleException(insn[0], core, memInt); + + bool tickRes = exceptionHandler->tick(); + auto result = exceptionHandler->getResult(); + EXPECT_TRUE(tickRes); + EXPECT_TRUE(result.fatal); + // Instruction address for fatal exception is always 0. + EXPECT_EQ(result.instructionAddress, 0x0); +} + +TEST_F(RiscVArchitectureTest, getInitialState) { + std::vector regs = {{RegisterType::GENERAL, 2}}; + std::vector regVals = {{kernel.getInitialStackPointer(), 8}}; + + arch::ProcessStateChange changes = arch->getInitialState(); + EXPECT_EQ(changes.type, arch::ChangeType::REPLACEMENT); + EXPECT_EQ(changes.modifiedRegisters, regs); + EXPECT_EQ(changes.modifiedRegisterValues, regVals); +} + +TEST_F(RiscVArchitectureTest, getMaxInstructionSize) { + EXPECT_EQ(arch->getMaxInstructionSize(), 4); +} + +TEST_F(RiscVArchitectureTest, updateSystemTimerRegisters) { + RegisterFileSet regFile = config::SimInfo::getArchRegStruct(); + Register cycleSystemReg = { + RegisterType::SYSTEM, + static_cast(arch->getSystemRegisterTag(RISCV_SYSREG_CYCLE))}; + + uint64_t ticks = 30; + EXPECT_EQ(regFile.get(cycleSystemReg), RegisterValue(0, 8)); + arch->updateSystemTimerRegisters(®File, ticks); + EXPECT_EQ(regFile.get(cycleSystemReg), RegisterValue(ticks, 8)); +} + +} // namespace riscv +} // namespace arch +} // namespace simeng diff --git a/test/unit/riscv/ExceptionHandlerTest.cc b/test/unit/riscv/ExceptionHandlerTest.cc new file mode 100644 index 0000000000..3e9ac92be7 --- /dev/null +++ b/test/unit/riscv/ExceptionHandlerTest.cc @@ -0,0 +1,639 @@ +#include "../ConfigInit.hh" +#include "../MockCore.hh" +#include "../MockInstruction.hh" +#include "../MockMemoryInterface.hh" +#include "gmock/gmock.h" +#include "simeng/ArchitecturalRegisterFileSet.hh" +#include "simeng/arch/riscv/Architecture.hh" +#include "simeng/arch/riscv/ExceptionHandler.hh" +#include "simeng/arch/riscv/Instruction.hh" + +namespace simeng { +namespace arch { +namespace riscv { + +using ::testing::HasSubstr; +using ::testing::Return; +using ::testing::ReturnRef; + +class RiscVExceptionHandlerTest : public ::testing::Test { + public: + RiscVExceptionHandlerTest() + : kernel(config::SimInfo::getConfig()["CPU-Info"]["Special-File-Dir-Path"] + .as()), + arch(kernel), + physRegFileSet(config::SimInfo::getArchRegStruct()), + archRegFileSet(physRegFileSet), + core(memory, arch, config::SimInfo::getArchRegStruct()) {} + + protected: + ConfigInit configInit = ConfigInit(config::ISA::RV64, ""); + + MockMemoryInterface memory; + kernel::Linux kernel; + Architecture arch; + + RegisterFileSet physRegFileSet; + ArchitecturalRegisterFileSet archRegFileSet; + + MockCore core; + + // addi sp, ra, 2000 --- Just need a valid instruction to hijack + std::array validInstrBytes = {0x13, 0x81, 0x00, 0x7d}; + + /** Helper constants for RISC-V general-purpose registers. 
*/ + static constexpr Register R0 = {RegisterType::GENERAL, 10}; + static constexpr Register R1 = {RegisterType::GENERAL, 11}; + static constexpr Register R2 = {RegisterType::GENERAL, 12}; + static constexpr Register R3 = {RegisterType::GENERAL, 13}; + static constexpr Register R4 = {RegisterType::GENERAL, 14}; + static constexpr Register R5 = {RegisterType::GENERAL, 15}; + static constexpr Register R7 = {RegisterType::GENERAL, 17}; +}; + +// All system calls are tested in /test/regression/riscv/Syscall.cc + +// Test that a syscall is processed sucessfully +TEST_F(RiscVExceptionHandlerTest, testSyscall) { + // Create "syscall" instruction + uint64_t insnAddr = 0x4; + MacroOp uops; + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + InstructionException exception = InstructionException::SupervisorCall; + std::shared_ptr insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + insn->setInstructionAddress(insnAddr); + + // Setup register file for `uname` syscall (chosen as minimal functionality) + archRegFileSet.set(R0, RegisterValue(1234, 8)); + archRegFileSet.set(R7, RegisterValue(160, 8)); + + // Create ExceptionHandler + ExceptionHandler handler(insn, core, memory, kernel); + + // Tick exceptionHandler + ON_CALL(core, getArchitecturalRegisterFileSet()) + .WillByDefault(ReturnRef(archRegFileSet)); + EXPECT_CALL(core, getArchitecturalRegisterFileSet()).Times(1); + bool retVal = handler.tick(); + ExceptionResult result = handler.getResult(); + + EXPECT_TRUE(retVal); + EXPECT_FALSE(result.fatal); + EXPECT_EQ(result.instructionAddress, insnAddr + 4); + EXPECT_EQ(result.stateChange.type, ChangeType::REPLACEMENT); + std::vector modRegs = {R0}; + EXPECT_EQ(result.stateChange.modifiedRegisters, modRegs); + std::vector modRegVals = {{0ull, 8}}; + EXPECT_EQ(result.stateChange.modifiedRegisterValues, modRegVals); + std::vector modMemTargets = { + {1234, 6}, + {1234 + 65, 13}, + {1234 + (65 * 2), 42}, + {1234 + (65 * 3), 35}, + {1234 + (65 * 4), 8}, + {1234 + (65 * 5), 7}}; + EXPECT_EQ(result.stateChange.memoryAddresses, modMemTargets); + std::vector modMemVals = { + RegisterValue("Linux"), + RegisterValue("fedora-riscv"), + RegisterValue("5.5.0-0.rc5.git0.1.1.riscv64.fc32.riscv64"), + RegisterValue("#1 SMP Mon Jan 6 17:31:22 UTC 2020"), + RegisterValue("riscv64"), + RegisterValue("(none)")}; + EXPECT_EQ(result.stateChange.memoryAddressValues, modMemVals); +} + +// Test that `readStringThen()` operates as expected +TEST_F(RiscVExceptionHandlerTest, readStringThen) { + // Create new mock instruction and ExceptionHandler + std::shared_ptr uopPtr(new MockInstruction); + ExceptionHandler handler(uopPtr, core, memory, kernel); + + // Initialise variables + size_t retVal = 0; + char* buffer; + buffer = (char*)malloc(256); + for (int i = 0; i < 256; i++) { + buffer[i] = 'q'; + } + uint64_t addr = 1024; + int maxLen = kernel::Linux::LINUX_PATH_MAX; + + memory::MemoryAccessTarget target1 = {addr, 1}; + memory::MemoryReadResult res1 = {target1, RegisterValue(0xAB, 1), 1}; + span res1Span = + span(&res1, 1); + + memory::MemoryAccessTarget target2 = {addr + 1, 1}; + memory::MemoryReadResult res2 = {target2, + RegisterValue(static_cast('\0'), 1), 1}; + span res2Span = + span(&res2, 1); + + // On first call to readStringThen, expect return of false and retVal to still + // be 0, and buffer to be filled with `q` + memory::MemoryAccessTarget tar = {addr, 1}; + EXPECT_CALL(memory, requestRead(tar, 0)).Times(1); + bool outcome = + 
handler.readStringThen(buffer, addr, maxLen, [&retVal](auto length) { + retVal = length; + return true; + }); + EXPECT_FALSE(outcome); + EXPECT_EQ(retVal, 0); + for (int i = 0; i < 256; i++) { + EXPECT_EQ(buffer[i], 'q'); + } + + // ResumeHandling (called on tick()) should now be set to `readStringThen()` + // so call this for our second pass. + ON_CALL(memory, getCompletedReads()) + .WillByDefault(Return(span())); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + outcome = handler.tick(); + // No memory reads completed yet so again expect to return false and no change + // to `retval` or buffer + EXPECT_FALSE(outcome); + EXPECT_EQ(retVal, 0); + for (int i = 0; i < 256; i++) { + EXPECT_EQ(buffer[i], 'q'); + } + + // Call tick() again, but mimic a memory read completing + tar = {addr + 1, 1}; + ON_CALL(memory, getCompletedReads()).WillByDefault(Return(res1Span)); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + EXPECT_CALL(memory, requestRead(tar, 0)).Times(1); + outcome = handler.tick(); + // Completed read but still not complete, so outcome should be false, retVal + // unchanged, but some data in the buffer + EXPECT_FALSE(outcome); + EXPECT_EQ(retVal, 0); + for (int i = 0; i < 256; i++) { + if (i == 0) { + EXPECT_EQ(buffer[i], (char)0xAB); + } else { + EXPECT_EQ(buffer[i], 'q'); + } + } + + // Call tick() for a final time, getting the final read result + ON_CALL(memory, getCompletedReads()).WillByDefault(Return(res2Span)); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + outcome = handler.tick(); + // End of string '\0' found so expect `then()` to have been called, the + // outcome to be true, and the buffer again to have updated + EXPECT_TRUE(outcome); + EXPECT_EQ(retVal, 1); + for (int i = 0; i < 256; i++) { + if (i == 0) { + EXPECT_EQ(buffer[i], (char)0xAB); + } else if (i == 1) { + EXPECT_EQ(buffer[i], '\0'); + } else { + EXPECT_EQ(buffer[i], 'q'); + } + } +} + +// Test that in `readStringThen()` if max length is 0, then is called straight +// away +TEST_F(RiscVExceptionHandlerTest, readStringThen_maxLen0) { + // Create new mock instruction and ExceptionHandler + std::shared_ptr uopPtr(new MockInstruction); + ExceptionHandler handler(uopPtr, core, memory, kernel); + size_t retVal = 100; + char* buffer; + buffer = (char*)malloc(256); + for (int i = 0; i < 256; i++) { + buffer[i] = 'q'; + } + uint64_t addr = 1024; + int maxLen = 0; + + bool outcome = + handler.readStringThen(buffer, addr, maxLen, [&retVal](auto length) { + retVal = length; + return true; + }); + EXPECT_TRUE(outcome); + EXPECT_EQ(retVal, -1); + for (int i = 0; i < 256; i++) { + EXPECT_EQ(buffer[i], 'q'); + } +} + +// Test that in `readStringThen()` if max length has been met, then() is called +// and no more string is fetched +TEST_F(RiscVExceptionHandlerTest, readStringThen_maxLenReached) { + // Create new mock instruction and ExceptionHandler + std::shared_ptr uopPtr(new MockInstruction); + ExceptionHandler handler(uopPtr, core, memory, kernel); + + // Initialise variables + size_t retVal = 100; + char* buffer; + buffer = (char*)malloc(256); + for (int i = 0; i < 256; i++) { + buffer[i] = 'q'; + } + uint64_t addr = 1024; + int maxLen = 1; + + memory::MemoryAccessTarget target1 = {addr, 1}; + memory::MemoryReadResult res1 = {target1, RegisterValue(0xAB, 1), 1}; + span res1Span = + span(&res1, 1); + + // On first call to readStringThen, expect return of false and retVal to still + // be 0, and buffer to be filled with `q` + memory::MemoryAccessTarget tar = {addr, 1}; + EXPECT_CALL(memory, 
requestRead(tar, 0)).Times(1); + bool outcome = + handler.readStringThen(buffer, addr, maxLen, [&retVal](auto length) { + retVal = length; + return true; + }); + EXPECT_FALSE(outcome); + EXPECT_EQ(retVal, 100); + for (int i = 0; i < 256; i++) { + EXPECT_EQ(buffer[i], 'q'); + } + + // ResumeHandling (called on tick()) should now be set to `readStringThen()` + // so call this for our second pass. + ON_CALL(memory, getCompletedReads()) + .WillByDefault(Return(span())); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + outcome = handler.tick(); + // No memory reads completed yet so again expect to return false and no change + // to `retval` or buffer + EXPECT_FALSE(outcome); + EXPECT_EQ(retVal, 100); + for (int i = 0; i < 256; i++) { + EXPECT_EQ(buffer[i], 'q'); + } + + // Call tick() again, but mimic a memory read completing + ON_CALL(memory, getCompletedReads()).WillByDefault(Return(res1Span)); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + outcome = handler.tick(); + // Completed read and maxLength reached. Expect then() to have been called, + // the outcome to be true, and the buffer to have updated. RetVal should be + // maxLength + EXPECT_TRUE(outcome); + EXPECT_EQ(retVal, 1); + for (int i = 0; i < 256; i++) { + if (i == 0) { + EXPECT_EQ(buffer[i], (char)0xAB); + } else { + EXPECT_EQ(buffer[i], 'q'); + } + } +} + +// Test that `readBufferThen()` operates as expected +TEST_F(RiscVExceptionHandlerTest, readBufferThen) { + // Create new mock instruction and ExceptionHandler + std::shared_ptr uopPtr(new MockInstruction); + uopPtr->setSequenceId(5); + ExceptionHandler handler(uopPtr, core, memory, kernel); + + // Initialise needed values for function + uint64_t retVal = 0; + uint64_t ptr = 0; + uint64_t length = 192; + + // Initialise data to "read" from MockMemory + std::vector dataVec(length, 'q'); + std::vector dataVec2(length, 'q'); + // Initialise the two required targets (128-bytes per read request in + // readBufferThen()) + memory::MemoryAccessTarget tar1 = {ptr, 128}; + memory::MemoryAccessTarget tar2 = {ptr + 128, + static_cast(length - 128)}; + // Initialise "responses" from the MockMemory + memory::MemoryReadResult res1 = { + tar1, RegisterValue(dataVec.data() + ptr, 128), uopPtr->getSequenceId()}; + memory::MemoryReadResult res2 = { + tar2, RegisterValue(dataVec.data() + ptr + 128, length - 128), + uopPtr->getSequenceId()}; + + // Confirm that internal dataBuffer_ is empty + EXPECT_EQ(handler.dataBuffer_.size(), 0); + + // Initial call to readBufferThen - expect resumeHandling to be updated to + // readBufferThen and a memory read request to have occurred + EXPECT_CALL(memory, requestRead(tar1, uopPtr->getSequenceId())).Times(1); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + bool outcome = handler.readBufferThen(ptr, length, [&retVal]() { + retVal = 10; + return true; + }); + EXPECT_FALSE(outcome); + EXPECT_EQ(retVal, 0); + EXPECT_EQ(handler.dataBuffer_.size(), 0); + + // Can now call tick() - on call, emulate no reads completed + ON_CALL(memory, getCompletedReads()) + .WillByDefault(Return(span())); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + outcome = handler.tick(); + EXPECT_FALSE(outcome); + EXPECT_EQ(retVal, 0); + EXPECT_EQ(handler.dataBuffer_.size(), 0); + + // Call tick() again, simulating completed read + new read requested as still + // data to fetch + ON_CALL(memory, getCompletedReads()) + .WillByDefault(Return(span(&res1, 1))); + // Make sure clearCompletedReads() alters functionality of getCompletedReads() + ON_CALL(memory, 
clearCompletedReads()) + .WillByDefault(::testing::InvokeWithoutArgs([&]() { + ON_CALL(memory, getCompletedReads()) + .WillByDefault(Return(span())); + })); + EXPECT_CALL(memory, getCompletedReads()).Times(2); + EXPECT_CALL(memory, clearCompletedReads()).Times(1); + EXPECT_CALL(memory, requestRead(tar2, uopPtr->getSequenceId())).Times(1); + outcome = handler.tick(); + EXPECT_FALSE(outcome); + EXPECT_EQ(retVal, 0); + EXPECT_EQ(handler.dataBuffer_.size(), 128); + for (size_t i = 0; i < handler.dataBuffer_.size(); i++) { + EXPECT_EQ(handler.dataBuffer_[i], 'q'); + } + + // One final call to tick() to get last bits of data from memory and call + // then() + ON_CALL(memory, getCompletedReads()) + .WillByDefault(Return(span(&res2, 1))); + EXPECT_CALL(memory, getCompletedReads()).Times(1); + EXPECT_CALL(memory, clearCompletedReads()).Times(1); + outcome = handler.tick(); + EXPECT_TRUE(outcome); + EXPECT_EQ(retVal, 10); + EXPECT_EQ(handler.dataBuffer_.size(), length); + for (size_t i = 0; i < length; i++) { + EXPECT_EQ(handler.dataBuffer_[i], static_cast('q')); + } +} + +// Test that `readBufferThen()` calls then if length is 0 +TEST_F(RiscVExceptionHandlerTest, readBufferThen_length0) { + // Create new mock instruction and ExceptionHandler + std::shared_ptr uopPtr(new MockInstruction); + ExceptionHandler handler(uopPtr, core, memory, kernel); + + const size_t expectedVal = 10; + uint64_t retVal = 0; + uint64_t ptr = 0; + uint64_t length = 0; + + bool outcome = handler.readBufferThen(ptr, length, [&retVal]() { + retVal = 10; + return true; + }); + EXPECT_TRUE(outcome); + EXPECT_EQ(retVal, expectedVal); +} + +// Test that all RISC-V exception types print as expected +TEST_F(RiscVExceptionHandlerTest, printException) { + ON_CALL(core, getArchitecturalRegisterFileSet()) + .WillByDefault(ReturnRef(archRegFileSet)); + uint64_t insnAddr = 0x4; + MacroOp uops; + + // Create instruction for EncodingUnallocated + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + InstructionException exception = InstructionException::EncodingUnallocated; + std::shared_ptr insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_0(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + std::stringstream buffer; + std::streambuf* sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_0.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), + HasSubstr("[SimEng:ExceptionHandler] Encountered " + "unallocated instruction encoding exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for ExecutionNotYetImplemented + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::ExecutionNotYetImplemented; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_1(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_1.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), + HasSubstr("[SimEng:ExceptionHandler] Encountered execution " + "not-yet-implemented exception")); + buffer.str(std::string()); 
+ uops.clear(); + + // Create instruction for AliasNotYetImplemented + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::AliasNotYetImplemented; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_2(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_2.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), HasSubstr("[SimEng:ExceptionHandler] Encountered " + "alias not-yet-implemented exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for MisalignedPC + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::MisalignedPC; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_3(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_3.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), + HasSubstr("[SimEng:ExceptionHandler] Encountered misaligned " + "program counter exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for DataAbort + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::DataAbort; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_4(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_4.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT( + buffer.str(), + HasSubstr("[SimEng:ExceptionHandler] Encountered data abort exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for SupervisorCall + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::SupervisorCall; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_5(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_5.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT( + buffer.str(), + HasSubstr( + "[SimEng:ExceptionHandler] Encountered supervisor call exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for HypervisorCall + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::HypervisorCall; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_6(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = 
std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_6.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT( + buffer.str(), + HasSubstr( + "[SimEng:ExceptionHandler] Encountered hypervisor call exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for SecureMonitorCall + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::SecureMonitorCall; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_7(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_7.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), HasSubstr("[SimEng:ExceptionHandler] Encountered " + "secure monitor call exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for NoAvailablePort + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::NoAvailablePort; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_8(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_8.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), HasSubstr("[SimEng:ExceptionHandler] Encountered " + "unsupported execution port exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for IllegalInstruction + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::IllegalInstruction; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_9(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_9.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), HasSubstr("[SimEng:ExceptionHandler] Encountered " + "illegal instruction exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for PipelineFlush + arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::PipelineFlush; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_10(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_10.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), HasSubstr("[SimEng:ExceptionHandler] Encountered " + "unknown atomic operation exception")); + buffer.str(std::string()); + uops.clear(); + + // Create instruction for default case + 
arch.predecode(validInstrBytes.data(), validInstrBytes.size(), insnAddr, + uops); + exception = InstructionException::None; + insn = std::make_shared( + arch, static_cast(uops[0].get())->getMetadata(), exception); + // Create ExceptionHandler + ExceptionHandler handler_11(insn, core, memory, kernel); + // Capture std::cout and tick exceptionHandler + sbuf = std::cout.rdbuf(); // Save cout's buffer + std::cout.rdbuf(buffer.rdbuf()); // Redirect cout to buffer + handler_11.printException(*static_cast(insn.get())); + std::cout.rdbuf(sbuf); // Restore cout + EXPECT_THAT(buffer.str(), + HasSubstr("[SimEng:ExceptionHandler] Encountered unknown (id: " + "0) exception")); + buffer.str(std::string()); + uops.clear(); +} + +} // namespace riscv +} // namespace arch +} // namespace simeng \ No newline at end of file diff --git a/test/unit/riscv/InstructionTest.cc b/test/unit/riscv/InstructionTest.cc new file mode 100644 index 0000000000..0642b09b95 --- /dev/null +++ b/test/unit/riscv/InstructionTest.cc @@ -0,0 +1,573 @@ +#include "../ConfigInit.hh" +#include "arch/riscv/InstructionMetadata.hh" +#include "gmock/gmock.h" +#include "simeng/arch/riscv/Instruction.hh" +#include "simeng/version.hh" + +namespace simeng { +namespace arch { +namespace riscv { + +// RiscV Instruction Tests +class RiscVInstructionTest : public testing::Test { + public: + RiscVInstructionTest() + : os(config::SimInfo::getConfig()["CPU-Info"]["Special-File-Dir-Path"] + .as()), + arch(os) { + // Create InstructionMetadata objects + cs_open(CS_ARCH_RISCV, CS_MODE_RISCV64, &capstoneHandle); + cs_option(capstoneHandle, CS_OPT_DETAIL, CS_OPT_ON); + + // Create instructions which cover the 3 main types: Arithmetic, Memory, + // Branch. This allows for full testing of the Instruction class. + + // div + cs_insn rawInsn_div; + cs_detail rawDetail_div; + rawInsn_div.detail = &rawDetail_div; + size_t size_div = 4; + uint64_t address_div = 0; + const uint8_t* encoding_div = + reinterpret_cast(divInstrBytes.data()); + cs_disasm_iter(capstoneHandle, &encoding_div, &size_div, &address_div, + &rawInsn_div); + divMetadata = std::make_unique(rawInsn_div); + + // lbu + cs_insn rawInsn_lbu; + cs_detail rawDetail_ldp; + rawInsn_lbu.detail = &rawDetail_ldp; + size_t size_lbu = 4; + uint64_t address_lbu = 0; + const uint8_t* encoding_lbu = + reinterpret_cast(lbuInstrBytes.data()); + cs_disasm_iter(capstoneHandle, &encoding_lbu, &size_lbu, &address_lbu, + &rawInsn_lbu); + lbuMetadata = std::make_unique(rawInsn_lbu); + + // bgeu + cs_insn rawInsn_bgeu; + cs_detail rawDetail_bgeu; + rawInsn_bgeu.detail = &rawDetail_bgeu; + size_t size_bgeu = 4; + uint64_t address_bgeu = 0; + const uint8_t* encoding_bgeu = + reinterpret_cast(bgeuInstrBytes.data()); + cs_disasm_iter(capstoneHandle, &encoding_bgeu, &size_bgeu, &address_bgeu, + &rawInsn_bgeu); + bgeuMetadata = std::make_unique(rawInsn_bgeu); + + const uint8_t* badEncoding = + reinterpret_cast(invalidInstrBytes.data()); + invalidMetadata = std::make_unique(badEncoding); + } + + ~RiscVInstructionTest() { cs_close(&capstoneHandle); } + + protected: + ConfigInit configInit = ConfigInit(config::ISA::RV64, ""); + + // div a3, a3, a0 + std::array divInstrBytes = {0xB3, 0xC6, 0xA6, 0x02}; + // lbu a5, 0(s3) + std::array lbuInstrBytes = {0x83, 0xC7, 0x09, 0x00}; + // bgeu a5, a4, -86 + std::array bgeuInstrBytes = {0xE3, 0xF5, 0xE7, 0xFA}; + std::array invalidInstrBytes = {0x20, 0x00, 0x02, 0x8c}; + + // A Capstone decoding library handle, for decoding instructions. 
+ csh capstoneHandle; + + kernel::Linux os; + Architecture arch; + + std::unique_ptr divMetadata; + std::unique_ptr lbuMetadata; + std::unique_ptr bgeuMetadata; + std::unique_ptr invalidMetadata; + InstructionException exception; +}; + +// Test that a valid instruction is created correctly +TEST_F(RiscVInstructionTest, validInsn) { + // Insn is `div a3, a3, a0` + Instruction insn = Instruction(arch, *divMetadata.get()); + // Define instruction's registers + std::vector destRegs = {{RegisterType::GENERAL, 13}}; + std::vector srcRegs = {{RegisterType::GENERAL, 13}, + {RegisterType::GENERAL, 10}}; + const std::vector ports = {1, 2, 3}; + insn.setExecutionInfo({3, 4, ports}); + insn.setInstructionAddress(0x48); + insn.setInstructionId(11); + insn.setSequenceId(12); + + // Ensure that all instruction values are as expected after creation + BranchPrediction pred = {false, 0}; + bool matchingPred = (insn.getBranchPrediction() == pred) ? true : false; + EXPECT_EQ(&insn.getArchitecture(), &arch); + EXPECT_EQ(insn.getBranchAddress(), 0); + EXPECT_TRUE(matchingPred); + EXPECT_EQ(insn.getBranchType(), BranchType::Unknown); + EXPECT_EQ(insn.getData().size(), 0); + EXPECT_EQ(insn.getDestinationRegisters().size(), destRegs.size()); + for (size_t i = 0; i < destRegs.size(); i++) { + EXPECT_EQ(insn.getDestinationRegisters()[i], destRegs[i]); + } + EXPECT_EQ(insn.getException(), InstructionException::None); + EXPECT_EQ(insn.getGeneratedAddresses().size(), 0); + EXPECT_EQ(insn.getGroup(), InstructionGroups::INT_DIV_OR_SQRT); + EXPECT_EQ(insn.getInstructionAddress(), 0x48); + EXPECT_EQ(insn.getInstructionId(), 11); + EXPECT_EQ(insn.getKnownOffset(), 0); + EXPECT_EQ(insn.getLatency(), 3); + EXPECT_EQ(insn.getLSQLatency(), 1); + EXPECT_EQ(&insn.getMetadata(), divMetadata.get()); + EXPECT_EQ(insn.getMicroOpIndex(), 0); + EXPECT_EQ(insn.getResults().size(), 1); + EXPECT_EQ(insn.getSequenceId(), 12); + EXPECT_EQ(insn.getSourceOperands().size(), 2); + EXPECT_EQ(insn.getSourceRegisters().size(), srcRegs.size()); + for (size_t i = 0; i < srcRegs.size(); i++) { + EXPECT_EQ(insn.getSourceRegisters()[i], srcRegs[i]); + EXPECT_FALSE(insn.isOperandReady(i)); + } + EXPECT_EQ(insn.getStallCycles(), 4); + EXPECT_EQ(insn.getSupportedPorts(), ports); + + EXPECT_FALSE(insn.canExecute()); + EXPECT_FALSE(insn.isStoreAddress()); + EXPECT_FALSE(insn.isStoreData()); + EXPECT_FALSE(insn.isLoad()); + EXPECT_FALSE(insn.isBranch()); + EXPECT_FALSE(insn.exceptionEncountered()); + EXPECT_FALSE(insn.hasExecuted()); + EXPECT_FALSE(insn.canCommit()); + EXPECT_TRUE(insn.hasAllData()); + EXPECT_FALSE(insn.wasBranchTaken()); + EXPECT_FALSE(insn.isFlushed()); + EXPECT_FALSE(insn.isMicroOp()); + EXPECT_TRUE(insn.isLastMicroOp()); + EXPECT_FALSE(insn.isWaitingCommit()); +} + +// Test that an invalid instruction can be created - invalid due to byte stream +TEST_F(RiscVInstructionTest, invalidInsn_1) { + Instruction insn = Instruction(arch, *invalidMetadata.get()); + // Define instruction's registers + std::vector destRegs = {}; + std::vector srcRegs = {}; + const std::vector ports = {}; + insn.setExecutionInfo({1, 1, ports}); + insn.setInstructionAddress(0x44); + insn.setInstructionId(13); + insn.setSequenceId(14); + + // Ensure that all instruction values are as expected after creation + BranchPrediction pred = {false, 0}; + bool matchingPred = (insn.getBranchPrediction() == pred) ? 
true : false; + EXPECT_EQ(&insn.getArchitecture(), &arch); + EXPECT_EQ(insn.getBranchAddress(), 0); + EXPECT_TRUE(matchingPred); + EXPECT_EQ(insn.getBranchType(), BranchType::Unknown); + EXPECT_EQ(insn.getData().size(), 0); + EXPECT_EQ(insn.getDestinationRegisters().size(), destRegs.size()); + for (size_t i = 0; i < destRegs.size(); i++) { + EXPECT_EQ(insn.getDestinationRegisters()[i], destRegs[i]); + } + EXPECT_EQ(insn.getException(), InstructionException::EncodingUnallocated); + EXPECT_EQ(insn.getGeneratedAddresses().size(), 0); + // Default Group for instruction that is not decoded + EXPECT_EQ(insn.getGroup(), InstructionGroups::INT_SIMPLE_ARTH); + EXPECT_EQ(insn.getInstructionAddress(), 0x44); + EXPECT_EQ(insn.getInstructionId(), 13); + EXPECT_EQ(insn.getKnownOffset(), 0); + EXPECT_EQ(insn.getLatency(), 1); + EXPECT_EQ(insn.getLSQLatency(), 1); + EXPECT_EQ(&insn.getMetadata(), invalidMetadata.get()); + EXPECT_EQ(insn.getMicroOpIndex(), 0); + EXPECT_EQ(insn.getResults().size(), 0); + EXPECT_EQ(insn.getSequenceId(), 14); + EXPECT_EQ(insn.getSourceOperands().size(), 0); + EXPECT_EQ(insn.getSourceRegisters().size(), srcRegs.size()); + for (size_t i = 0; i < srcRegs.size(); i++) { + EXPECT_EQ(insn.getSourceRegisters()[i], srcRegs[i]); + EXPECT_FALSE(insn.isOperandReady(i)); + } + EXPECT_EQ(insn.getStallCycles(), 1); + EXPECT_EQ(insn.getSupportedPorts(), ports); + + EXPECT_TRUE(insn.canExecute()); + EXPECT_FALSE(insn.isStoreAddress()); + EXPECT_FALSE(insn.isStoreData()); + EXPECT_FALSE(insn.isLoad()); + EXPECT_FALSE(insn.isBranch()); + EXPECT_TRUE(insn.exceptionEncountered()); + EXPECT_FALSE(insn.hasExecuted()); + EXPECT_FALSE(insn.canCommit()); + EXPECT_TRUE(insn.hasAllData()); + EXPECT_FALSE(insn.wasBranchTaken()); + EXPECT_FALSE(insn.isFlushed()); + EXPECT_FALSE(insn.isMicroOp()); + EXPECT_TRUE(insn.isLastMicroOp()); + EXPECT_FALSE(insn.isWaitingCommit()); +} + +// Test that an invalid instruction can be created - invalid due to exception +// provided +TEST_F(RiscVInstructionTest, invalidInsn_2) { + Instruction insn = Instruction(arch, *invalidMetadata.get(), + InstructionException::HypervisorCall); + // Define instruction's registers + std::vector destRegs = {}; + std::vector srcRegs = {}; + const std::vector ports = {}; + insn.setExecutionInfo({1, 1, ports}); + insn.setInstructionAddress(0x43); + insn.setInstructionId(15); + insn.setSequenceId(16); + + // Ensure that all instruction values are as expected after creation + BranchPrediction pred = {false, 0}; + bool matchingPred = (insn.getBranchPrediction() == pred) ? 
true : false; + EXPECT_EQ(&insn.getArchitecture(), &arch); + EXPECT_EQ(insn.getBranchAddress(), 0); + EXPECT_TRUE(matchingPred); + EXPECT_EQ(insn.getBranchType(), BranchType::Unknown); + EXPECT_EQ(insn.getData().size(), 0); + EXPECT_EQ(insn.getDestinationRegisters().size(), destRegs.size()); + for (size_t i = 0; i < destRegs.size(); i++) { + EXPECT_EQ(insn.getDestinationRegisters()[i], destRegs[i]); + } + EXPECT_EQ(insn.getException(), InstructionException::HypervisorCall); + EXPECT_EQ(insn.getGeneratedAddresses().size(), 0); + // Default Group for instruction that is not decoded + EXPECT_EQ(insn.getGroup(), InstructionGroups::INT_SIMPLE_ARTH); + EXPECT_EQ(insn.getInstructionAddress(), 0x43); + EXPECT_EQ(insn.getInstructionId(), 15); + EXPECT_EQ(insn.getKnownOffset(), 0); + EXPECT_EQ(insn.getLatency(), 1); + EXPECT_EQ(insn.getLSQLatency(), 1); + EXPECT_EQ(&insn.getMetadata(), invalidMetadata.get()); + EXPECT_EQ(insn.getMicroOpIndex(), 0); + EXPECT_EQ(insn.getResults().size(), 0); + EXPECT_EQ(insn.getSequenceId(), 16); + EXPECT_EQ(insn.getSourceOperands().size(), 0); + EXPECT_EQ(insn.getSourceRegisters().size(), srcRegs.size()); + for (size_t i = 0; i < srcRegs.size(); i++) { + EXPECT_EQ(insn.getSourceRegisters()[i], srcRegs[i]); + EXPECT_FALSE(insn.isOperandReady(i)); + } + EXPECT_EQ(insn.getStallCycles(), 1); + EXPECT_EQ(insn.getSupportedPorts(), ports); + + EXPECT_TRUE(insn.canExecute()); + EXPECT_FALSE(insn.isStoreAddress()); + EXPECT_FALSE(insn.isStoreData()); + EXPECT_FALSE(insn.isLoad()); + EXPECT_FALSE(insn.isBranch()); + EXPECT_TRUE(insn.exceptionEncountered()); + EXPECT_FALSE(insn.hasExecuted()); + EXPECT_FALSE(insn.canCommit()); + EXPECT_TRUE(insn.hasAllData()); + EXPECT_FALSE(insn.wasBranchTaken()); + EXPECT_FALSE(insn.isFlushed()); + EXPECT_FALSE(insn.isMicroOp()); + EXPECT_TRUE(insn.isLastMicroOp()); + EXPECT_FALSE(insn.isWaitingCommit()); +} + +// Test to ensure that source and operand registers can be renamed correctly +TEST_F(RiscVInstructionTest, renameRegs) { + // Insn is `div a3, a3, a0` + Instruction insn = Instruction(arch, *divMetadata.get()); + // Define instruction's registers + std::vector destRegs = {{RegisterType::GENERAL, 13}}; + std::vector srcRegs = {{RegisterType::GENERAL, 13}, + {RegisterType::GENERAL, 10}}; + // Ensure registers decoded correctly + EXPECT_EQ(insn.getSourceRegisters().size(), srcRegs.size()); + for (size_t i = 0; i < srcRegs.size(); i++) { + EXPECT_EQ(insn.getSourceRegisters()[i], srcRegs[i]); + } + EXPECT_EQ(insn.getDestinationRegisters().size(), destRegs.size()); + for (size_t i = 0; i < destRegs.size(); i++) { + EXPECT_EQ(insn.getDestinationRegisters()[i], destRegs[i]); + } + + // Define renamed registers + std::vector destRegs_new = {{RegisterType::GENERAL, 24}}; + std::vector srcRegs_new = {{RegisterType::GENERAL, 13}, + {RegisterType::GENERAL, 97}}; + insn.renameDestination(0, destRegs_new[0]); + insn.renameSource(1, srcRegs_new[1]); + // Ensure renaming functionality works as expected + EXPECT_EQ(insn.getSourceRegisters().size(), srcRegs_new.size()); + for (size_t i = 0; i < srcRegs_new.size(); i++) { + EXPECT_EQ(insn.getSourceRegisters()[i], srcRegs_new[i]); + } + EXPECT_EQ(insn.getDestinationRegisters().size(), destRegs_new.size()); + for (size_t i = 0; i < destRegs_new.size(); i++) { + EXPECT_EQ(insn.getDestinationRegisters()[i], destRegs_new[i]); + } +} + +// Test that operand values can be properly supplied and change the state of +// `canExecute` +TEST_F(RiscVInstructionTest, supplyOperand) { + // Insn is `div a3, a3, a0` + 
Instruction insn = Instruction(arch, *divMetadata.get()); + // Define instruction's registers + std::vector destRegs = {{RegisterType::GENERAL, 13}}; + std::vector srcRegs = {{RegisterType::GENERAL, 13}, + {RegisterType::GENERAL, 10}}; + // Check initial state is as expected + EXPECT_FALSE(insn.canExecute()); + EXPECT_FALSE(insn.isOperandReady(0)); + EXPECT_FALSE(insn.isOperandReady(1)); + + // Define mock register values for source registers + RegisterValue val = {0xABBACAFE, 8}; + // Supply values for all source registers + insn.supplyOperand(0, val); + insn.supplyOperand(1, val); + // Ensure Instruction state has updated as expected + EXPECT_TRUE(insn.canExecute()); + EXPECT_TRUE(insn.isOperandReady(0)); + EXPECT_TRUE(insn.isOperandReady(1)); + auto sourceVals = insn.getSourceOperands(); + EXPECT_EQ(sourceVals.size(), 2); + EXPECT_EQ(sourceVals[0], val); + EXPECT_EQ(sourceVals[1], val); + + // Ensure instruction execute updates instruction state as expected, and + // produces the expected result. + EXPECT_FALSE(insn.hasExecuted()); + insn.execute(); + EXPECT_TRUE(insn.hasExecuted()); + auto results = insn.getResults(); + RegisterValue refRes = {0x00000001, 8}; + EXPECT_EQ(results.size(), 1); + EXPECT_EQ(results[0], refRes); +} + +// Test that data can be supplied successfully +TEST_F(RiscVInstructionTest, supplyData) { + // Insn is `lbu a5, 0(s3)` + Instruction insn = Instruction(arch, *lbuMetadata.get()); + // Define instruction's registers + std::vector destRegs = {{RegisterType::GENERAL, 15}}; + std::vector srcRegs = {{RegisterType::GENERAL, 19}}; + + // Check instruction created correctly + EXPECT_FALSE(insn.exceptionEncountered()); + EXPECT_EQ(&insn.getMetadata(), lbuMetadata.get()); + EXPECT_EQ(insn.getGroup(), InstructionGroups::LOAD_INT); + + // Check source and destination registers extracted correctly + EXPECT_EQ(insn.getSourceRegisters().size(), srcRegs.size()); + for (size_t i = 0; i < srcRegs.size(); i++) { + EXPECT_EQ(insn.getSourceRegisters()[i], srcRegs[i]); + } + EXPECT_EQ(insn.getDestinationRegisters().size(), destRegs.size()); + for (size_t i = 0; i < destRegs.size(); i++) { + EXPECT_EQ(insn.getDestinationRegisters()[i], destRegs[i]); + } + + // Supply needed operands + EXPECT_FALSE(insn.isOperandReady(0)); + RegisterValue addr = {0x480, 8}; + insn.supplyOperand(0, addr); + EXPECT_TRUE(insn.isOperandReady(0)); + + // Generate memory addresses + EXPECT_EQ(insn.getGeneratedAddresses().size(), 0); + insn.generateAddresses(); + auto generatedAddresses = insn.getGeneratedAddresses(); + EXPECT_EQ(generatedAddresses.size(), 1); + EXPECT_EQ(generatedAddresses[0].address, 0x480); + EXPECT_EQ(generatedAddresses[0].size, 1); + + // Supply required data + EXPECT_FALSE(insn.hasAllData()); + std::vector data = {{123, 1}}; + EXPECT_EQ(generatedAddresses.size(), data.size()); + insn.supplyData(generatedAddresses[0].address, data[0]); + // Ensure data was supplied correctly + auto retrievedData = insn.getData(); + for (size_t i = 0; i < retrievedData.size(); i++) { + EXPECT_EQ(retrievedData[i], data[i]); + } + EXPECT_TRUE(insn.hasAllData()); +} + +// Test DataAbort Exception is triggered correctly when supplying data +TEST_F(RiscVInstructionTest, supplyData_dataAbort) { + // Insn is `lbu a5, 0(s3)` + Instruction insn = Instruction(arch, *lbuMetadata.get()); + // Define instruction's registers + std::vector destRegs = {{RegisterType::GENERAL, 15}}; + std::vector srcRegs = {{RegisterType::GENERAL, 19}}; + + // Check instruction created correctly + EXPECT_EQ(&insn.getMetadata(), 
lbuMetadata.get()); + EXPECT_EQ(insn.getGroup(), InstructionGroups::LOAD_INT); + + // Supply needed operands + EXPECT_FALSE(insn.isOperandReady(0)); + RegisterValue addr = {0x480, 8}; + insn.supplyOperand(0, addr); + EXPECT_TRUE(insn.isOperandReady(0)); + + // Generate memory addresses + EXPECT_EQ(insn.getGeneratedAddresses().size(), 0); + insn.generateAddresses(); + auto generatedAddresses = insn.getGeneratedAddresses(); + EXPECT_EQ(generatedAddresses.size(), 1); + EXPECT_EQ(generatedAddresses[0].address, 0x480); + EXPECT_EQ(generatedAddresses[0].size, 1); + + // Trigger data abort + EXPECT_FALSE(insn.exceptionEncountered()); + insn.supplyData(generatedAddresses[0].address, RegisterValue()); + EXPECT_TRUE(insn.exceptionEncountered()); + EXPECT_EQ(insn.getException(), InstructionException::DataAbort); +} + +// Test that a correct prediction (branch taken) is handled correctly +TEST_F(RiscVInstructionTest, correctPred_taken) { + // insn is `bgeu a5, a4, -86` + Instruction insn = Instruction(arch, *bgeuMetadata.get()); + insn.setInstructionAddress(400); + + // Check initial state of an instruction's branch related options + BranchPrediction pred = {false, 0}; + bool matchingPred = (insn.getBranchPrediction() == pred); + EXPECT_TRUE(matchingPred); + EXPECT_FALSE(insn.wasBranchTaken()); + EXPECT_EQ(insn.getBranchAddress(), 0); + EXPECT_EQ(insn.getBranchType(), BranchType::Conditional); + EXPECT_TRUE(insn.isBranch()); + + // Test a correct prediction where branch is taken is handled correctly + pred = {true, 400 - 86}; + insn.setBranchPrediction(pred); + matchingPred = (insn.getBranchPrediction() == pred); + insn.supplyOperand(0, RegisterValue(3, 8)); + insn.supplyOperand(1, RegisterValue(0, 8)); + insn.execute(); + EXPECT_TRUE(matchingPred); + EXPECT_TRUE(insn.wasBranchTaken()); + EXPECT_FALSE(insn.wasBranchMispredicted()); + EXPECT_EQ(insn.getBranchAddress(), pred.target); +} + +// Test that a correct prediction (branch not taken) is handled correctly +TEST_F(RiscVInstructionTest, correctPred_notTaken) { + // insn is `bgeu a5, a4, -86` + Instruction insn = Instruction(arch, *bgeuMetadata.get()); + insn.setInstructionAddress(400); + + // Check initial state of an instruction's branch related options + BranchPrediction pred = {false, 0}; + bool matchingPred = (insn.getBranchPrediction() == pred); + EXPECT_TRUE(matchingPred); + EXPECT_FALSE(insn.wasBranchTaken()); + EXPECT_EQ(insn.getBranchAddress(), 0); + EXPECT_EQ(insn.getBranchType(), BranchType::Conditional); + EXPECT_TRUE(insn.isBranch()); + + // Test a correct prediction where a branch isn't taken is handled correctly + // imm operand 0x28 has 4 added implicitly by disassembler + pred = {false, 400 + 4}; + insn.setBranchPrediction(pred); + matchingPred = (insn.getBranchPrediction() == pred); + insn.supplyOperand(0, RegisterValue(0, 8)); + insn.supplyOperand(1, RegisterValue(3, 8)); + insn.execute(); + EXPECT_TRUE(matchingPred); + EXPECT_FALSE(insn.wasBranchTaken()); + EXPECT_FALSE(insn.wasBranchMispredicted()); + EXPECT_EQ(insn.getBranchAddress(), pred.target); +} + +// Test that an incorrect prediction (wrong target) is handled correctly +TEST_F(RiscVInstructionTest, incorrectPred_target) { + // insn is `bgeu a5, a4, -86` + Instruction insn = Instruction(arch, *bgeuMetadata.get()); + insn.setInstructionAddress(400); + + // Check initial state of an instruction's branch related options + BranchPrediction pred = {false, 0}; + bool matchingPred = (insn.getBranchPrediction() == pred); + EXPECT_TRUE(matchingPred); + 
EXPECT_FALSE(insn.wasBranchTaken()); + EXPECT_EQ(insn.getBranchAddress(), 0); + EXPECT_EQ(insn.getBranchType(), BranchType::Conditional); + EXPECT_TRUE(insn.isBranch()); + + // Test an incorrect prediction is handled correctly - target is wrong + // imm operand 0x28 has 4 added implicitly by disassembler + pred = {true, 80 + (0x28 + 0x4)}; + insn.setBranchPrediction(pred); + matchingPred = (insn.getBranchPrediction() == pred); + insn.supplyOperand(0, RegisterValue(3, 8)); + insn.supplyOperand(1, RegisterValue(0, 8)); + insn.execute(); + EXPECT_TRUE(matchingPred); + EXPECT_TRUE(insn.wasBranchTaken()); + EXPECT_TRUE(insn.wasBranchMispredicted()); + EXPECT_EQ(insn.getBranchAddress(), 400 - 86); +} + +// Test that an incorrect prediction (wrong taken) is handled correctly +TEST_F(RiscVInstructionTest, incorrectPred_taken) { + // insn is `bgeu a5, a4, -86` + Instruction insn = Instruction(arch, *bgeuMetadata.get()); + insn.setInstructionAddress(400); + + // Check initial state of an instruction's branch related options + BranchPrediction pred = {false, 0}; + bool matchingPred = (insn.getBranchPrediction() == pred); + EXPECT_TRUE(matchingPred); + EXPECT_FALSE(insn.wasBranchTaken()); + EXPECT_EQ(insn.getBranchAddress(), 0); + EXPECT_EQ(insn.getBranchType(), BranchType::Conditional); + EXPECT_TRUE(insn.isBranch()); + + // Test an incorrect prediction is handled correctly - taken is wrong + // imm operand 0x28 has 4 added implicitly by disassembler + pred = {true, 400 - 86}; + insn.setBranchPrediction(pred); + matchingPred = (insn.getBranchPrediction() == pred); + insn.supplyOperand(0, RegisterValue(0, 8)); + insn.supplyOperand(1, RegisterValue(3, 8)); + insn.execute(); + EXPECT_TRUE(matchingPred); + EXPECT_FALSE(insn.wasBranchTaken()); + EXPECT_TRUE(insn.wasBranchMispredicted()); + EXPECT_EQ(insn.getBranchAddress(), 400 + 4); +} + +// Test commit and flush setters such as `setFlushed`, `setCommitReady`, etc. +TEST_F(RiscVInstructionTest, setters) { + // Insn is `div a3, a3, a0` + Instruction insn = Instruction(arch, *divMetadata.get()); + + EXPECT_FALSE(insn.canCommit()); + insn.setCommitReady(); + EXPECT_TRUE(insn.canCommit()); + + EXPECT_FALSE(insn.isFlushed()); + insn.setFlushed(); + EXPECT_TRUE(insn.isFlushed()); + + EXPECT_FALSE(insn.isWaitingCommit()); + insn.setWaitingCommit(); + EXPECT_TRUE(insn.isWaitingCommit()); +} + +} // namespace riscv +} // namespace arch +} // namespace simeng \ No newline at end of file