diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..d5e0157
Binary files /dev/null and b/.DS_Store differ
diff --git a/.clang-format b/.clang-format
index 2aec894..276a9db 100644
--- a/.clang-format
+++ b/.clang-format
@@ -2,4 +2,5 @@
 BasedOnStyle: Google
 DerivePointerAlignment: false
 PointerAlignment: true
-Standard: C++11
\ No newline at end of file
+AlignAfterOpenBracket: BlockIndent
+Standard: C++20
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index da20b26..c2b88e6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,5 +56,15 @@
 gpu-blob
 CSV*
 Graphs*
-# VS Code
-.vscode
\ No newline at end of file
+# IDE
+.vscode
+
+# MAC metadata
+.DS_Store
+
+# CSV files and graphs
+*.csv
+*.png
+
+# Bash scripts to run on different systems
+*.sh
diff --git a/.idea/GPU-BLAS-Offload-Benchmark.iml b/.idea/GPU-BLAS-Offload-Benchmark.iml
new file mode 100644
index 0000000..190534e
diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml
new file mode 100644
index 0000000..a55e7a1
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..830d3c8
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..eff3984
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 0000000..461bf83
diff --git a/AOCL/gemm.hh b/AOCL/gemm.hh
index 3c6b5c0..f418bdc 100644
--- a/AOCL/gemm.hh
+++ b/AOCL/gemm.hh
@@ -23,6 +23,7 @@ class gemm_cpu :
public gemm { private: /** Make call to the GEMM kernel. */ void callGemm() override { + if constexpr (std::is_same_v) { bli_sgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_, rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_), diff --git a/AOCL/spmdnm.hh b/AOCL/spmdnm.hh new file mode 100644 index 0000000..f47007c --- /dev/null +++ b/AOCL/spmdnm.hh @@ -0,0 +1,336 @@ +#pragma once + +#ifdef CPU_AOCL +#include "aoclsparse.h" + +#include +#include + +#include "../include/kernels/CPU/spmdnm.hh" +#include "../include/utilities.hh" + +namespace cpu { +template +class spmdnm_cpu : public spmdnm { +public: + using spmdnm::spmdnm; + using spmdnm::callConsume; + using spmdnm::initInputMatrices; + using spmdnm::m_; + using spmdnm::n_; + using spmdnm::k_; + using spmdnm::B_; + using spmdnm::C_; + using spmdnm::sparsity_; + using spmdnm::type_; + using spmdnm::nnz_; + using spmdnm::iterations_; + + void initialise(int m, int n, int k, double sparsity, + matrixType type, bool binary = false) { + base_ = aoclsparse_index_base_zero; + order_ = aoclsparse_order_row; + + status_ = aoclsparse_create_mat_descr(&A_description_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_create_mat_descr is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + printAOCLError(status_); + } + + status_ = aoclsparse_set_mat_index_base(A_description_, base_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_set_mat_index_base is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + printAOCLError(status_); + } + + + m_aocl_ = m_ = m; + n_aocl_ = n_ = n; + k_aocl_ = k_ = k; + sparsity_ = sparsity; + type_ = type; + + nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + nnz_aocl_ = nnz_; + + B_ = (T*)calloc(k_ * n_, sizeof(T)); + C_ = (T*)calloc(m_ * n_, sizeof(T)); + + initInputMatrices(); + } + +protected: + void toSparseFormat() override { + + // Initialise datastructures for the CSR format + A_rows_ = new aoclsparse_int[m_ + 1]; + A_cols_ = new aoclsparse_int[nnz_aocl_]; + A_vals_ = new T[nnz_aocl_]; + + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_, A_cols_, A_rows_, m_, k_, nnz_); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_, A_cols_, A_rows_, m_, k_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_, A_cols_, A_rows_, m_, k_, nnz_); + } else { + std::cerr << "Matrix type not supported" << std::endl; + exit(1); + } + + // Move into the AOCL CSR matrix handle + if constexpr (std::is_same_v) { + status_ = aoclsparse_create_scsr(&A_aocl_, + base_, + m_aocl_, + k_aocl_, + nnz_aocl_, + A_rows_, + A_cols_, + A_vals_); + } else if constexpr (std::is_same_v) { + status_ = aoclsparse_create_dcsr(&A_aocl_, + base_, + m_aocl_, + k_aocl_, + nnz_aocl_, + A_rows_, + A_cols_, + A_vals_); + } + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_create_?csr is failing with problem size of " << m_ << "x" << k_ << " . 
" << k_ << "x" << n_ << std::endl; + printAOCLError(status_); + } + } + +private: + void preLoopRequirements() override {} + + void callSpmdnm() override { + operation_ = aoclsparse_operation_none; // Just saying no transposition happening first + if constexpr (std::is_same_v) { + status_ = aoclsparse_scsrmm(operation_, + alpha, + A_aocl_, + A_description_, + order_, + B_, + n_aocl_, + n_aocl_, + beta, + C_, + n_aocl_); + } else if constexpr(std::is_same_v) { + status_ = aoclsparse_dcsrmm(operation_, + alpha, + A_aocl_, + A_description_, + order_, + B_, + n_aocl_, + n_aocl_, + beta, + C_, + n_aocl_); + } + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_?csrmm is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + std::cerr << "\tm_aocl_=" << m_aocl_ << std::endl; + std::cerr << "\tn_aocl_=" << n_aocl_ << std::endl; + std::cerr << "\tk_aocl_=" << k_aocl_ << std::endl; + std::cerr << "\tnnz_aocl_=" << nnz_aocl_ << std::endl; + printAOCLError(status_); + } + } + + void postLoopRequirements() override { + } + + void postCallKernelCleanup() override { + status_ = aoclsparse_destroy_mat_descr(A_description_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_destroy_mat_descr is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + printAOCLError(status_); + } + status_ = aoclsparse_destroy(&A_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_destroy is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + printAOCLError(status_); + } + delete[] A_vals_; + delete[] A_cols_; + delete[] A_rows_; + delete[] B_; + delete[] C_; + } + + void printAOCLError(aoclsparse_status stat) { + switch (stat) { + case aoclsparse_status_success: + std::cerr << "SUCCESS - The operation completed successfully"; + break; + case aoclsparse_status_not_implemented: + std::cerr << "NOT_IMPLEMENTED - The requested functionality is not yet implemented in this version"; + break; + case aoclsparse_status_invalid_pointer: + std::cerr << "INVALID_POINTER - One or more pointer parameters are NULL or otherwise invalid"; + break; + case aoclsparse_status_invalid_size: + std::cerr << "INVALID_SIZE - One or more size parameters (m, n, nnz, etc.) 
contain an invalid value (e.g., negative or zero where positive required)"; + break; + case aoclsparse_status_internal_error: + std::cerr << "INTERNAL_ERROR - Internal library failure"; + break; + case aoclsparse_status_invalid_value: + std::cerr << "INVALID_VALUE - Input parameters contain an invalid value (e.g., invalid enum value, base index neither 0 nor 1)"; + break; + case aoclsparse_status_invalid_index_value: + std::cerr << "INVALID_INDEX_VALUE - At least one index value is invalid (e.g., negative or out of bounds)"; + break; + case aoclsparse_status_maxit: + std::cerr << "MAXIT - function stopped after reaching number of iteration limit"; + break; + case aoclsparse_status_user_stop: + std::cerr << "USER_STOP - user requested termination"; + break; + case aoclsparse_status_wrong_type: + std::cerr << "WRONG_TYPE - Data type mismatch (e.g., matrix datatypes don't match between operations)"; + break; + case aoclsparse_status_memory_error: + std::cerr << "MEMORY_ERROR - memory allocation failure"; + break; + case aoclsparse_status_numerical_error: + std::cerr << "NUMERICAL_ERROR - numerical error, e.g., matrix is not positive definite, devide-by-zero error"; + break; + case aoclsparse_status_invalid_operation: + std::cerr << "INVALID_OPERATION - cannot proceed with the request at this point"; + break; + case aoclsparse_status_unsorted_input: + std::cerr << "UNSORTED_INPUT - the input matrices are not sorted"; + break; + case aoclsparse_status::aoclsparse_status_invalid_kid: + std::cerr << "INVALID_KID - user requested kernel id was not available"; + break; + default: + std::cerr << "UNKNOWN_STATUS - Unrecognized status code (" + std::to_string(stat) + ")"; + break; + } + std::cerr << std::endl; + exit(1); + } + + void internalCheck(aoclsparse_int maj_dim, + aoclsparse_int min_dim, + aoclsparse_int nnz, + const aoclsparse_int *idx_ptr, + const aoclsparse_int *indices, + const void *val, + int shape, + int base) { + if (idx_ptr == nullptr) { + std::cerr << "INVALID ROWS ARRAY" << std::endl; + exit(1); + } + if (indices == nullptr){ + std::cerr << "INVALID COLS ARRAY" << std::endl; + exit(1); + } + if (val == nullptr){ + std::cerr << "INVALID VALS ARRAY" << std::endl; + exit(1); + } + + if ((min_dim < 0) || (maj_dim < 0) || (nnz < 0)) { + std::cerr << "Wrong min_dim/maj_dim/nnz" << std::endl; + exit(1); + } + + if ((idx_ptr[0] - base) != 0) { + std::cerr << "Wrong csr_row_ptr[0] or csc.col_ptr[0]" << std::endl; + exit(1); + } + + if ((idx_ptr[maj_dim] - base) != nnz) { + std::cerr << "Wrong csr_row_ptr[m]!=nnz or csc.col_ptr[n]!=nnz" << std::endl; + exit(1); + } + for (aoclsparse_int i = 1; i <= maj_dim; i++) { + if (idx_ptr[i - 1] > idx_ptr[i]) { + std::cerr << "Wrong csr_row_ptr/csc.col_ptr - not nondecreasing" << std::endl; + exit (1); + } + } + + // assume indices are fully sorted & fulldiag matrix unless proved otherwise + int sort = 1; + bool fulldiag = true; + + aoclsparse_int idxstart, idxend, j, jmin = 0, jmax = min_dim - 1; + for (aoclsparse_int i = 0; i < maj_dim; i++) { + idxend = idx_ptr[i + 1] - base; + idxstart = idx_ptr[i] - base; + if (shape == 1) { + jmin = 0; + jmax = i; + } else if (shape == 2) { + jmin = i; + jmax = min_dim - 1; + } + // check if visited D, U group within this row + bool diagonal = false, upper = false; + aoclsparse_int prev = -1; // holds previous col index, initially set to -1 + + for (aoclsparse_int idx = idxstart; idx < idxend; idx++) { + j = indices[idx] - base; + if (j < jmin || j > jmax) { + std::cerr << "Wrong index - out of bounds or 
triangle, @idx=" << idx << ": j=" << j << ", i=" << i << std::endl; + exit(1); + } + // check for sorting pattern for each element in a row + if (sort != 3) { + if (prev > j) sort = 2; // unsorted col idx (duplicate elements are allowed) + else prev = j; // update previous col index + + // check for group-order + if ((j <= i && upper) || (j < i && diagonal)) sort = 3; + } + if (j > i) upper = true; + else if(j == i) { + if (diagonal) { + std::cerr << "Wrong diag - duplicate diag for i=j=" << i << std::endl; + exit(1); + } + // diagonal element visited + diagonal = true; + } + } + if (!diagonal && i < min_dim) fulldiag = false; // missing diagonal + } + } + + aoclsparse_status status_; + aoclsparse_order order_; + + aoclsparse_operation operation_; + aoclsparse_index_base base_; + + aoclsparse_mat_descr A_description_; + aoclsparse_matrix A_aocl_; + aoclsparse_int* A_rows_; + aoclsparse_int* A_cols_; + T* A_vals_; + + aoclsparse_int m_aocl_; + aoclsparse_int n_aocl_; + aoclsparse_int k_aocl_; + aoclsparse_int nnz_aocl_; + + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + + +#endif diff --git a/AOCL/spmdnv.hh b/AOCL/spmdnv.hh new file mode 100644 index 0000000..d529713 --- /dev/null +++ b/AOCL/spmdnv.hh @@ -0,0 +1,273 @@ +#pragma once + +#ifdef CPU_AOCL + +#include "aoclsparse.h" +#include + +#include "../include/kernels/CPU/spmdnv.hh" +#include "../include/utilities.hh" + +namespace cpu { +template +class spmdnv_cpu : public spmdnv { +public: + using spmdnv::spmdnv; + using spmdnv::callConsume; + using spmdnv::initInputMatrixVector; + using spmdnv::m_; + using spmdnv::n_; + using spmdnv::x_; + using spmdnv::y_; + using spmdnv::sparsity_; + using spmdnv::type_; + using spmdnv::nnz_; + using spmdnv::iterations_; + + void initialise(int m, int n, double sparsity, matrixType type, + bool binary = false) { + if (print_) std::cout << "=========== Matrix = " << m << "x" << n << " ===========" << std::endl; + base_ = aoclsparse_index_base_zero; + operation_ = aoclsparse_operation_none; + + m_aocl_ = m_ = m; + n_aocl_ = n_ = n; + sparsity_ = sparsity; + type_ = type; + + nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_)); + nnz_aocl_ = nnz_; + + x_ = (T*)calloc(n_, sizeof(T)); + y_ = (T*)calloc(m_, sizeof(T)); + + if (print_) std::cout << "About to initialise matrices" << std::endl; + initInputMatrixVector(); + + status_ = aoclsparse_create_mat_descr(&A_description_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_create_mat_descr failing for A" << std::endl; + printAOCLError(status_); + } + } + +protected: + void toSparseFormat() override { + A_vals_ = (T*)calloc(nnz_aocl_, sizeof(T)); + A_cols_ = (aoclsparse_int*)calloc(nnz_aocl_, sizeof(aoclsparse_int)); + A_rows_ = (aoclsparse_int*)calloc(m_ + 1, sizeof(aoclsparse_int)); + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_); + } else { + std::cerr << "Matrix type not supported" << std::endl; + exit(1); + } + + + // Move into the AOCL CSR matrix handle + if constexpr (std::is_same_v) { + status_ = aoclsparse_create_scsr(&A_aocl_, + base_, + m_aocl_, + n_aocl_, + nnz_aocl_, + A_rows_, + A_cols_, + A_vals_); + } else if constexpr (std::is_same_v) { + status_ = aoclsparse_create_dcsr(&A_aocl_, + base_, + m_aocl_, + n_aocl_, + nnz_aocl_, + A_rows_, + A_cols_, + 
A_vals_); + } + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_create_?csr is failing with problem size of " << m_ << "x" << n_ << " . " << n_ << "x" << n_ << std::endl; + printAOCLError(status_); + } else if (print_) { + std::cout << "aoclsparse_create_?csr success" << std::endl; + } + } + +private: + void preLoopRequirements() override { + status_ = aoclsparse_set_mv_hint(A_aocl_, + operation_, + A_description_, + 5); // Currently hard coded iternation count + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_set_mv_hint failing" << std::endl; + printAOCLError(status_); + } else if (print_) { + std::cout << "aoclsparse_set_mv_hint success" << std::endl; + } + + status_ = aoclsparse_optimize(A_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_optimize failing" << std::endl; + printAOCLError(status_); + } else if (print_) { + std::cout << "aoclsparse_optimize success" << std::endl; + } + } + + void callSpMDnV() override { + if constexpr (std::is_same_v) { + status_ = aoclsparse_smv(operation_, + &alpha, + A_aocl_, + A_description_, + x_, + &beta, + y_); + } else if constexpr (std::is_same_v) { + status_ = aoclsparse_dmv(operation_, + &alpha, + A_aocl_, + A_description_, + x_, + &beta, + y_); + } + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_?mv failing" << std::endl; + printAOCLError(status_); + } else if (print_) { + std::cout << "aoclsparse_?mv success" << std::endl; + } + } + + void postLoopRequirements() override { + if (debug) { + std::cout << "========== CPU ==========" << std::endl; + std::cout << "___________________________________________" << std::endl; + std::cout << "x =" << std::endl; + std::cout << "["; + for (int64_t i = 0; i < n_; i++) { + std::cout << x_[i]; + if (i < (n_ - 1)) std::cout << ", "; + } + std::cout << "]" << std::endl; + + std::cout << "y =" << std::endl; + std::cout << "["; + for (int64_t i = 0; i < m_; i++) { + std::cout << y_[i]; + if (i < (m_ - 1)) std::cout << ", "; + } + std::cout << "]" << std::endl; + std::cout << "___________________________________________" << std::endl; + } + } + + void postCallKernelCleanup() override { + status_ = aoclsparse_destroy_mat_descr(A_description_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_destroy_mat_descr failing" << std::endl; + printAOCLError(status_); + } else if (print_) { + std::cout << "aoclsparse_destroy_mat_descr success" << std::endl; + } + + status_ = aoclsparse_destroy(&A_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_destroy failing" << std::endl; + printAOCLError(status_); + } else if (print_) { + std::cout << "aoclsparse_destroy success" << std::endl; + } + + delete[] A_vals_; + delete[] A_cols_; + delete[] A_rows_; + delete[] x_; + delete[] y_; + } + + void printAOCLError(aoclsparse_status stat) { + switch (stat) { + case aoclsparse_status_success: + std::cerr << "SUCCESS - The operation completed successfully"; + break; + case aoclsparse_status_not_implemented: + std::cerr << "NOT_IMPLEMENTED - The requested functionality is not yet implemented in this version"; + break; + case aoclsparse_status_invalid_pointer: + std::cerr << "INVALID_POINTER - One or more pointer parameters are NULL or otherwise invalid"; + break; + case aoclsparse_status_invalid_size: + std::cerr << "INVALID_SIZE - One or more size parameters (m, n, nnz, etc.) 
contain an invalid value (e.g., negative or zero where positive required)"; + break; + case aoclsparse_status_internal_error: + std::cerr << "INTERNAL_ERROR - Internal library failure"; + break; + case aoclsparse_status_invalid_value: + std::cerr << "INVALID_VALUE - Input parameters contain an invalid value (e.g., invalid enum value, base index neither 0 nor 1)"; + break; + case aoclsparse_status_invalid_index_value: + std::cerr << "INVALID_INDEX_VALUE - At least one index value is invalid (e.g., negative or out of bounds)"; + break; + case aoclsparse_status_maxit: + std::cerr << "MAXIT - function stopped after reaching number of iteration limit"; + break; + case aoclsparse_status_user_stop: + std::cerr << "USER_STOP - user requested termination"; + break; + case aoclsparse_status_wrong_type: + std::cerr << "WRONG_TYPE - Data type mismatch (e.g., matrix datatypes don't match between operations)"; + break; + case aoclsparse_status_memory_error: + std::cerr << "MEMORY_ERROR - memory allocation failure"; + break; + case aoclsparse_status_numerical_error: + std::cerr << "NUMERICAL_ERROR - numerical error, e.g., matrix is not positive definite, devide-by-zero error"; + break; + case aoclsparse_status_invalid_operation: + std::cerr << "INVALID_OPERATION - cannot proceed with the request at this point"; + break; + case aoclsparse_status_unsorted_input: + std::cerr << "UNSORTED_INPUT - the input matrices are not sorted"; + break; + case aoclsparse_status::aoclsparse_status_invalid_kid: + std::cerr << "INVALID_KID - user requested kernel id was not available"; + break; + default: + std::cerr << "UNKNOWN_STATUS - Unrecognized status code (" + std::to_string(stat) + ")"; + break; + } + std::cerr << std::endl; + exit(1); + } + + bool print_ = false; + bool debug = false; + + aoclsparse_status status_; + + aoclsparse_operation operation_; + aoclsparse_index_base base_; + + aoclsparse_matrix A_aocl_; + aoclsparse_int* A_rows_; + aoclsparse_int* A_cols_; + T* A_vals_; + aoclsparse_int m_aocl_; + aoclsparse_int n_aocl_; + aoclsparse_int nnz_aocl_; + + aoclsparse_mat_descr A_description_; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + + +#endif diff --git a/AOCL/spmspm.hh b/AOCL/spmspm.hh new file mode 100644 index 0000000..38c5c3f --- /dev/null +++ b/AOCL/spmspm.hh @@ -0,0 +1,375 @@ +#pragma once + +#ifdef CPU_AOCL +#include "aoclsparse.h" + +#include +#include +#include + +#include "../include/kernels/CPU/spmspm.hh" +#include "../include/utilities.hh" + +namespace cpu { +template +class spmspm_cpu : public spmspm { +public: + using spmspm::spmspm; + using spmspm::callConsume; + using spmspm::initInputMatrices; + using spmspm::m_; + using spmspm::n_; + using spmspm::k_; + using spmspm::sparsity_; + using spmspm::type_; + using spmspm::A_nnz_; + using spmspm::B_nnz_; + using spmspm::iterations_; + using spmspm::C_rows_; + using spmspm::C_cols_; + using spmspm::C_vals_; + using spmspm::C_nnz_; + + void initialise(int m, int n, int k, double sparsity, matrixType type, + bool binary = false) { + sparsity_ = sparsity; + type_ = type; + + m_aocl_ = m_ = m; + n_aocl_ = n_ = n; + k_aocl_ = k_ = k; + + + uint64_t total_elements_A = (uint64_t)m_ * (uint64_t)k_; + uint64_t total_elements_B = (uint64_t)k_ * (uint64_t)n_; + nnzA_aocl_ = A_nnz_ = 1 + (uint64_t)((double)total_elements_A * (1.0 - sparsity)); + nnzB_aocl_ = B_nnz_ = 1 + (uint64_t)((double)total_elements_B * (1.0 - sparsity)); + C_allocated = false; + + base_ = aoclsparse_index_base_zero; + operationA_ = aoclsparse_operation_none; + operationB_ 
= aoclsparse_operation_none; + + status_ = aoclsparse_create_mat_descr(&A_description_); + if (status_ != aoclsparse_status_success) { + printAOCLError(status_); + } + status_ = aoclsparse_create_mat_descr(&B_description_); + if (status_ != aoclsparse_status_success) { + printAOCLError(status_); + } + initInputMatrices(); + } + +protected: + void toSparseFormat() override { + A_rows_ = (aoclsparse_int*)calloc(m_ + 1, sizeof(aoclsparse_int)); + A_cols_ = (aoclsparse_int*)calloc(nnzA_aocl_, sizeof(aoclsparse_int)); + A_vals_ = (T*)calloc(nnzA_aocl_, sizeof(T)); + if (A_rows_ == nullptr || A_cols_ == nullptr || A_vals_ == nullptr) { + std::cerr << "Failed to allocate memory for A CSR arrays with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + exit(1); + } + + // Initialise datastructures for the CSR format + B_rows_ = (aoclsparse_int*)calloc(k_ + 1, sizeof(aoclsparse_int)); + B_cols_ = (aoclsparse_int*)calloc(nnzB_aocl_, sizeof(aoclsparse_int)); + B_vals_ = (T*)calloc(nnzB_aocl_, sizeof(T)); + if (B_rows_ == nullptr || B_cols_ == nullptr || B_vals_ == nullptr) { + std::cerr << "Failed to allocate memory for B CSR arrays with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + exit(1); + } + + int seedOffset = 0; + do { + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_, A_cols_, A_rows_, m_, k_, A_nnz_, SEED + seedOffset++); + rMatCSR(B_vals_, B_cols_, B_rows_, k_, n_, B_nnz_, SEED + seedOffset++); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_, A_cols_, A_rows_, m_, k_, A_nnz_, SEED + seedOffset++); + randomCSR(B_vals_, B_cols_, B_rows_, k_, n_, B_nnz_, SEED + seedOffset++); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_, A_cols_, A_rows_, m_, k_, A_nnz_, SEED + seedOffset++); + finiteElementCSR(B_vals_, B_cols_, B_rows_, k_, n_, B_nnz_, SEED + seedOffset++); + } else { + std::cerr << "Matrix type not supported" << std::endl; + exit(1); + } + } while (calcCNNZ(m_, A_nnz_, A_rows_, A_cols_, k_, B_nnz_, B_rows_, B_cols_) == 0); + + // Move into the AOCL CSR matrix handle + if constexpr (std::is_same_v) { + status_ = aoclsparse_create_scsr(&A_aocl_, + base_, + m_aocl_, + k_aocl_, + nnzA_aocl_, + A_rows_, + A_cols_, + A_vals_); + } else if constexpr (std::is_same_v) { + status_ = aoclsparse_create_dcsr(&A_aocl_, + base_, + m_aocl_, + k_aocl_, + nnzA_aocl_, + A_rows_, + A_cols_, + A_vals_); + } + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_create_?csr for A is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + printAOCLError(status_); + } + + // Now sort the matrix -- needed for this AOCL function + status_ = aoclsparse_order_mat(A_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_order_mat for A is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + printAOCLError(status_); + } + + // Move into the AOCL CSR matrix handle + if constexpr (std::is_same_v) { + status_ = aoclsparse_create_scsr(&B_aocl_, + base_, + k_aocl_, + n_aocl_, + nnzB_aocl_, + B_rows_, + B_cols_, + B_vals_); + } else if constexpr (std::is_same_v) { + status_ = aoclsparse_create_dcsr(&B_aocl_, + base_, + k_aocl_, + n_aocl_, + nnzB_aocl_, + B_rows_, + B_cols_, + B_vals_); + } + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_create_?csr for B is failing with problem size of " << m_ << "x" << k_ << " . 
" << k_ << "x" << n_ << std::endl; + printAOCLError(status_); + } + + // Now sort the matrix -- needed for this AOCL function + status_ = aoclsparse_order_mat(B_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_order_mat for B is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + printAOCLError(status_); + } + } + +private: + void preLoopRequirements() override { + } + + void callSpmspm() override { + if (C_allocated) { + if (C_vals_ != nullptr) { + free(C_vals_); + C_vals_ = nullptr; + } + if (C_cols_aocl_ != nullptr) { + free(C_cols_aocl_); + C_cols_aocl_ = nullptr; + } + if (C_rows_aocl_ != nullptr) { + free(C_rows_aocl_); + C_rows_aocl_ = nullptr; + } + C_allocated = false; + } + + request_ = aoclsparse_stage_nnz_count; + status_ = aoclsparse_sp2m(operationA_, + A_description_, + A_aocl_, + operationB_, + B_description_, + B_aocl_, + request_, + &C_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_sp2m failing with request = aoclsparse_stage_nnz_count" << std::endl; + printAOCLError(status_); + } + + request_ = aoclsparse_stage_finalize; + status_ = aoclsparse_sp2m(operationA_, + A_description_, + A_aocl_, + operationB_, + B_description_, + B_aocl_, + request_, + &C_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_sp2m failing with request = aoclsparse_stage_finalize" << std::endl; + printAOCLError(status_); + } + + if constexpr (std::is_same_v) { + status_ = aoclsparse_export_scsr(C_aocl_, + &base_, + &C_M, + &C_N, + &nnzC_aocl_, + &C_rows_aocl_, + &C_cols_aocl_, + &C_vals_); + } else if constexpr (std::is_same_v) { + status_ = aoclsparse_export_dcsr(C_aocl_, + &base_, + &C_M, + &C_N, + &nnzC_aocl_, + &C_rows_aocl_, + &C_cols_aocl_, + &C_vals_); + } + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_export_zcsr failing" << std::endl; + printAOCLError(status_); + } + C_allocated = true; + } + + void postLoopRequirements() override { + C_nnz_ = nnzC_aocl_; // Needed for checksum + } + + void postCallKernelCleanup() override { + status_ = aoclsparse_destroy_mat_descr(A_description_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_destroy_mat_descr failing for A_description_" << std::endl; + printAOCLError(status_); + } + status_ = aoclsparse_destroy_mat_descr(B_description_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_destroy_mat_descr failing for B_description_" << std::endl; + printAOCLError(status_); + } + + status_ = aoclsparse_destroy(&A_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_destroy failing for A" << std::endl; + printAOCLError(status_); + } + status_ = aoclsparse_destroy(&B_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_destroy failing for B" << std::endl; + printAOCLError(status_); + } + + status_ = aoclsparse_destroy(&C_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_destroy failing for C" << std::endl; + printAOCLError(status_); + } + free(A_rows_); + free(A_cols_); + free(A_vals_); + free(B_rows_); + free(B_cols_); + free(B_vals_); + } + + void printAOCLError(aoclsparse_status stat) { + switch (stat) { + case aoclsparse_status_success: + std::cerr << "SUCCESS - The operation completed successfully"; + break; + case aoclsparse_status_not_implemented: + std::cerr << "NOT_IMPLEMENTED - The requested functionality is not yet implemented in this version"; + break; 
+ case aoclsparse_status_invalid_pointer: + std::cerr << "INVALID_POINTER - One or more pointer parameters are NULL or otherwise invalid"; + break; + case aoclsparse_status_invalid_size: + std::cerr << "INVALID_SIZE - One or more size parameters (m, n, nnz, etc.) contain an invalid value (e.g., negative or zero where positive required)"; + break; + case aoclsparse_status_internal_error: + std::cerr << "INTERNAL_ERROR - Internal library failure"; + break; + case aoclsparse_status_invalid_value: + std::cerr << "INVALID_VALUE - Input parameters contain an invalid value (e.g., invalid enum value, base index neither 0 nor 1)"; + break; + case aoclsparse_status_invalid_index_value: + std::cerr << "INVALID_INDEX_VALUE - At least one index value is invalid (e.g., negative or out of bounds)"; + break; + case aoclsparse_status_maxit: + std::cerr << "MAXIT - function stopped after reaching number of iteration limit"; + break; + case aoclsparse_status_user_stop: + std::cerr << "USER_STOP - user requested termination"; + break; + case aoclsparse_status_wrong_type: + std::cerr << "WRONG_TYPE - Data type mismatch (e.g., matrix datatypes don't match between operations)"; + break; + case aoclsparse_status_memory_error: + std::cerr << "MEMORY_ERROR - memory allocation failure"; + break; + case aoclsparse_status_numerical_error: + std::cerr << "NUMERICAL_ERROR - numerical error, e.g., matrix is not positive definite, devide-by-zero error"; + break; + case aoclsparse_status_invalid_operation: + std::cerr << "INVALID_OPERATION - cannot proceed with the request at this point"; + break; + case aoclsparse_status_unsorted_input: + std::cerr << "UNSORTED_INPUT - the input matrices are not sorted"; + break; + case aoclsparse_status::aoclsparse_status_invalid_kid: + std::cerr << "INVALID_KID - user requested kernel id was not available"; + break; + default: + std::cerr << "UNKNOWN_STATUS - Unrecognized status code (" + std::to_string(stat) + ")"; + break; + } + std::cerr << std::endl; + exit(1); + } + + aoclsparse_status status_; + + aoclsparse_operation operationA_; + aoclsparse_operation operationB_; + aoclsparse_index_base base_; + aoclsparse_request request_; + + aoclsparse_matrix A_aocl_; + aoclsparse_int* A_rows_ = nullptr; + aoclsparse_int* A_cols_ = nullptr; + T* A_vals_ = nullptr; + + aoclsparse_matrix B_aocl_; + aoclsparse_int* B_rows_ = nullptr; + aoclsparse_int* B_cols_ = nullptr; + T* B_vals_ = nullptr; + + aoclsparse_matrix C_aocl_; + aoclsparse_int* C_rows_aocl_ = nullptr; + aoclsparse_int* C_cols_aocl_ = nullptr; + bool C_allocated = false; + + aoclsparse_int m_aocl_; + aoclsparse_int n_aocl_; + aoclsparse_int k_aocl_; + aoclsparse_int nnzA_aocl_; + aoclsparse_int nnzB_aocl_; + aoclsparse_int nnzC_aocl_; + + aoclsparse_int C_M, C_N; + + aoclsparse_mat_descr A_description_; + aoclsparse_mat_descr B_description_; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + + +#endif diff --git a/ArmPL/gemm.hh b/ArmPL/gemm.hh index af7f428..10903d8 100644 --- a/ArmPL/gemm.hh +++ b/ArmPL/gemm.hh @@ -1,7 +1,7 @@ #pragma once #ifdef CPU_ARMPL -#include +#include "armpl.h" #include #include @@ -36,8 +36,7 @@ class gemm_cpu : public gemm { std::max(1, m_)); } else { // Un-specialised class will not do any work - print error and exit. - std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported." - << std::endl; + std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported." 
<< std::endl; exit(1); } // Ensure compiler doesn't optimise away the work being done diff --git a/ArmPL/gemv.hh b/ArmPL/gemv.hh index cc0e9bf..c568c99 100644 --- a/ArmPL/gemv.hh +++ b/ArmPL/gemv.hh @@ -1,7 +1,7 @@ #pragma once #ifdef CPU_ARMPL -#include +#include "armpl.h" #include #include @@ -34,8 +34,7 @@ class gemv_cpu : public gemv { std::max(1, m_), x_, vecIncrement_, beta, y_, vecIncrement_); } else { // Un-specialised class will not do any work - print error and exit. - std::cout << "ERROR - Datatype for ArmPL CPU GEMV kernel not supported." - << std::endl; + std::cout << "ERROR - Datatype for ArmPL CPU GEMV kernel not supported." << std::endl; exit(1); } // Ensure compiler doesn't optimise away the work being done diff --git a/ArmPL/spmdnm.hh b/ArmPL/spmdnm.hh new file mode 100644 index 0000000..4f53c10 --- /dev/null +++ b/ArmPL/spmdnm.hh @@ -0,0 +1,43 @@ +#pragma once + +#ifdef CPU_ARMPL + +#include "../include/kernels/CPU/spmdnm.hh" +#include "../include/utilities.hh" + +namespace cpu { +template +class spmdnm_cpu : public spmdnm { +public: + using spmdnm::spmdnm; + using spmdnm::callConsume; + using spmdnm::initInputMatrices; + using spmdnm::m_; + using spmdnm::n_; + using spmdnm::k_; + using spmdnm::B_; + using spmdnm::C_; + using spmdnm::sparsity_; + using spmdnm::type_; + using spmdnm::nnz_; + using spmdnm::iterations_; + + void initialise(int m, int n, int k, double sparsity, + matrixType type, bool binary = false) {} + +protected: + void toSparseFormat() override {} + +private: + void preLoopRequirements() override {} + + void callSpmdnm() override {} + + void postLoopRequirements() override {} + + void postCallKernelCleanup() override {} +}; +} + + +#endif diff --git a/ArmPL/spmdnv.hh b/ArmPL/spmdnv.hh new file mode 100644 index 0000000..7b0cf93 --- /dev/null +++ b/ArmPL/spmdnv.hh @@ -0,0 +1,205 @@ +#pragma once + +#ifdef CPU_ARMPL +#include +#include +#include "armpl.h" +#include + +#include + +#include "../include/kernels/CPU/spmdnv.hh" +#include "../include/utilities.hh" + +namespace cpu { +/** A class for GEMM CPU BLAS kernels. */ +template +class spmdnv_cpu : public spmdnv { +public: + using spmdnv::spmdnv; + using spmdnv::callConsume; + using spmdnv::initInputMatrixVector; + using spmdnv::m_; + using spmdnv::n_; + using spmdnv::x_; + using spmdnv::y_; + using spmdnv::sparsity_; + using spmdnv::type_; + using spmdnv::nnz_; + using spmdnv::iterations_; + + /** Initialise the required data structures. 
*/ + void initialise(int m, int n, double sparsity, matrixType type, + bool binary = false) { + m_armpl_ = m_ = m; + n_armpl_ = n_ = n; + sparsity_ = sparsity; + type_ = type; + + nnz_armpl_ = nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_)); + + x_ = (T*)calloc(n_, sizeof(T)); + y_ = (T*)calloc(m_, sizeof(T)); + + // Initialise the matrix and vectors + initInputMatrixVector(); + } + +protected: + void toSparseFormat() override { + // Make arrays for A + A_vals_ = (T*)calloc(nnz_armpl_, sizeof(T)); + A_cols_ = (armpl_int_t*)calloc(nnz_armpl_, sizeof(armpl_int_t)); + A_rows_ = (armpl_int_t*)calloc(m_ + 1, sizeof(armpl_int_t)); + + // Fill the CSR arrays + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_, A_cols_, A_rows_, m_armpl_, n_armpl_, nnz_); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_, A_cols_, A_rows_, m_armpl_, n_armpl_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_, A_cols_, A_rows_, m_armpl_, n_armpl_, nnz_); + } else { + std::cerr << "Matrix type not supported" << std::endl; + exit(1); + } + + // Create the armpl object for this sparse matrix + if constexpr (std::is_same_v) { + status_ = armpl_spmat_create_csr_s(&A_armpl_, + m_armpl_, + n_armpl_, + A_rows_, + A_cols_, + A_vals_, + 0); + } else if constexpr (std::is_same_v) { + status_ = armpl_spmat_create_csr_d(&A_armpl_, + m_armpl_, + n_armpl_, + A_rows_, + A_cols_, + A_vals_, + 0); + } else { + // Un-specialised class will not do any work - print error and exit. + std::cerr << "ERROR - Datatype for ArmPL CPU SpMDnV kernel not supported." << std::endl; + exit(1); + } + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + } + +private:/** Perform any required steps before calling the SpMDnV kernel that should + * be timed. */ + void preLoopRequirements() override { + // Give the library some hints so it can optimise the performance of the kernel + status_ = armpl_spmat_hint(A_armpl_, + ARMPL_SPARSE_HINT_MEMORY, + ARMPL_SPARSE_MEMORY_NOALLOCS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = armpl_spmat_hint(A_armpl_, + ARMPL_SPARSE_HINT_STRUCTURE, + ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = armpl_spmat_hint(A_armpl_, + ARMPL_SPARSE_HINT_SPMV_INVOCATIONS, + ARMPL_SPARSE_INVOCATIONS_FEW); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = armpl_spmat_hint(A_armpl_, + ARMPL_SPARSE_HINT_SPMV_OPERATION, + ARMPL_SPARSE_OPERATION_NOTRANS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + // Now optimise the matrix for SpMV based on the hints given + status_ = armpl_spmv_optimize(A_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + } + + /** Make call to the SpMDnV kernel. */ + void callSpMDnV() override { + if constexpr (std::is_same_v) { + status_ = armpl_spmv_exec_s(ARMPL_SPARSE_OPERATION_NOTRANS, + alpha, + A_armpl_, + x_, + beta, + y_); + } else if constexpr (std::is_same_v) { + status_ = armpl_spmv_exec_d(ARMPL_SPARSE_OPERATION_NOTRANS, + alpha, + A_armpl_, + x_, + beta, + y_); + } else { + // Un-specialised class will not do any work - print error and exit. 
+ std::cerr << "ERROR - Datatype for ArmPL CPU GEMV kernel not supported." << std::endl; + exit(1); + } + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR: " << status_ << std::endl; + exit(1); + } + + // Ensure compiler doesn't optimise away the work being done + callConsume(); + } + + + + /** Perform any required steps after calling the SpMDnV kernel that should + * be timed. */ + void postLoopRequirements() override {} + + void postCallKernelCleanup() override { + status_ = armpl_spmat_destroy(A_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + free(A_rows_); + free(A_cols_); + free(A_vals_); + free(x_); + free(y_); + } + + armpl_status_t status_; + + armpl_int_t n_armpl_; + armpl_int_t m_armpl_; + armpl_int_t nnz_armpl_; + + T* A_vals_; + armpl_int_t* A_rows_; + armpl_int_t* A_cols_; + + armpl_spmat_t A_armpl_; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} // namespace cpu +#endif \ No newline at end of file diff --git a/ArmPL/spmspm.hh b/ArmPL/spmspm.hh new file mode 100644 index 0000000..bb17392 --- /dev/null +++ b/ArmPL/spmspm.hh @@ -0,0 +1,364 @@ +#pragma once + +#ifdef CPU_ARMPL +#include +#include "armpl.h" +#include + +#include +#include +#include +#include + +#include "../include/kernels/CPU/spmspm.hh" +#include "../include/utilities.hh" + +namespace cpu { +/** A class for sparse matrix-sparse matrix CPU BLAS kernels. */ +template +class spmspm_cpu : public spmspm { +public: + using spmspm::spmspm; + using spmspm::callConsume; + using spmspm::initInputMatrices; + using spmspm::m_; + using spmspm::n_; + using spmspm::k_; + using spmspm::sparsity_; + using spmspm::type_; + using spmspm::A_nnz_; + using spmspm::B_nnz_; + using spmspm::iterations_; + using spmspm::C_vals_; + using spmspm::C_nnz_; + + void initialise(int m, int n, int k, double sparsity, matrixType type, + bool binary = false) { + sparsity_ = sparsity; + type_ = type; + + m_armpl_ = m_ = m; + n_armpl_ = n_ = n; + k_armpl_ = k_ = k; + + + uint64_t total_elements_A = (uint64_t)m_ * (uint64_t)k_; + uint64_t total_elements_B = (uint64_t)k_ * (uint64_t)n_; + nnzA_armpl_ = A_nnz_ = 1 + (uint64_t)((double)total_elements_A * (1.0 - sparsity)); + nnzB_armpl_ = B_nnz_ = 1 + (uint64_t)((double)total_elements_B * (1.0 - sparsity)); + + initInputMatrices(); + } + +protected: + void toSparseFormat() override { + A_vals_ = (T*)calloc(nnzA_armpl_, sizeof(T)); + A_cols_ = (armpl_int_t*)calloc(nnzA_armpl_, sizeof(armpl_int_t)); + A_rows_ = (armpl_int_t*)calloc(m_armpl_ + 1, sizeof(armpl_int_t)); + + B_vals_ = (T*)calloc(nnzB_armpl_, sizeof(T)); + B_cols_ = (armpl_int_t*)calloc(nnzB_armpl_, sizeof(armpl_int_t)); + B_rows_ = (armpl_int_t*)calloc(k_armpl_ + 1, sizeof(armpl_int_t)); + + int seedOffset = 0; + do { + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_, A_cols_, A_rows_, m_armpl_, k_armpl_, nnzA_armpl_, SEED + seedOffset++); + rMatCSR(B_vals_, B_cols_, B_rows_, k_armpl_, n_armpl_, nnzB_armpl_, SEED + seedOffset++); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_, A_cols_, A_rows_, m_armpl_, k_armpl_, nnzA_armpl_, SEED + seedOffset++); + randomCSR(B_vals_, B_cols_, B_rows_, k_armpl_, n_armpl_, nnzB_armpl_, SEED + seedOffset++); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_, A_cols_, A_rows_, m_armpl_, k_armpl_, nnzA_armpl_, SEED + seedOffset++); + finiteElementCSR(B_vals_, B_cols_, B_rows_, k_armpl_, n_armpl_, nnzB_armpl_, SEED + seedOffset++); + } else { + std::cerr << "Matrix type not 
supported" << std::endl; + exit(1); + } + } while (calcCNNZ(m_, A_nnz_, A_rows_, A_cols_, k_, B_nnz_, B_rows_, B_cols_) == 0); + + // Now make the sparse matrix objects + if constexpr (std::is_same_v) { + status_ = armpl_spmat_create_csr_s(&A_armpl_, + m_armpl_, + k_armpl_, + A_rows_, + A_cols_, + A_vals_, + 0); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_create_csr_s(&B_armpl_, + k_armpl_, + n_armpl_, + B_rows_, + B_cols_, + B_vals_, + 0); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } else if constexpr (std::is_same_v) { + status_ = armpl_spmat_create_csr_d(&A_armpl_, + m_armpl_, + k_armpl_, + A_rows_, + A_cols_, + A_vals_, + 0); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_create_csr_d(&B_armpl_, + k_armpl_, + n_armpl_, + B_rows_, + B_cols_, + B_vals_, + 0); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } + C_armpl_ = armpl_spmat_create_null(m_armpl_, n_armpl_); + } + +private: + void preLoopRequirements() override { + // Populate A and B with hints + status_ = armpl_spmat_hint(A_armpl_, + ARMPL_SPARSE_HINT_MEMORY, + ARMPL_SPARSE_MEMORY_NOALLOCS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(A_armpl_, + ARMPL_SPARSE_HINT_STRUCTURE, + ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(A_armpl_, + ARMPL_SPARSE_HINT_SPMM_INVOCATIONS, + ARMPL_SPARSE_INVOCATIONS_FEW); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(A_armpl_, + ARMPL_SPARSE_HINT_SPMM_OPERATION, + ARMPL_SPARSE_OPERATION_NOTRANS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(A_armpl_, + ARMPL_SPARSE_HINT_SPMM_STRATEGY, + ARMPL_SPARSE_SPMM_STRAT_OPT_FULL_STRUCT); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = armpl_spmat_hint(B_armpl_, + ARMPL_SPARSE_HINT_MEMORY, + ARMPL_SPARSE_MEMORY_NOALLOCS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, + ARMPL_SPARSE_HINT_STRUCTURE, + ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, + ARMPL_SPARSE_HINT_SPMM_INVOCATIONS, + ARMPL_SPARSE_INVOCATIONS_FEW); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, + ARMPL_SPARSE_HINT_SPMM_OPERATION, + ARMPL_SPARSE_OPERATION_NOTRANS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, + ARMPL_SPARSE_HINT_SPMM_STRATEGY, + ARMPL_SPARSE_SPMM_STRAT_OPT_FULL_STRUCT); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, + ARMPL_SPARSE_HINT_SPMM_STRATEGY, + ARMPL_SPARSE_SPMM_STRAT_OPT_FULL_STRUCT); + if (status_ != 
ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + // Call the optimise function to apply hints + status_ = armpl_spmm_optimize(ARMPL_SPARSE_OPERATION_NOTRANS, + ARMPL_SPARSE_OPERATION_NOTRANS, + ARMPL_SPARSE_SCALAR_ONE, + A_armpl_, + B_armpl_, + ARMPL_SPARSE_SCALAR_ZERO, + C_armpl_); + } + + void callSpmspm() override{ + if constexpr (std::is_same_v) { + status_ = armpl_spmm_exec_s(ARMPL_SPARSE_OPERATION_NOTRANS, + ARMPL_SPARSE_OPERATION_NOTRANS, + alpha, + A_armpl_, + B_armpl_, + beta, + C_armpl_); + } else if constexpr (std::is_same_v) { + status_ = armpl_spmm_exec_d(ARMPL_SPARSE_OPERATION_NOTRANS, + ARMPL_SPARSE_OPERATION_NOTRANS, + alpha, + A_armpl_, + B_armpl_, + beta, + C_armpl_); + } else { + // Un-specialised class will not do any work - print error and exit. + std::cout << "ERROR - Datatype for ArmPL CPU SpMSpM kernel not supported." << std::endl; + exit(1); + } + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR: " << status_ << std::endl; + exit(1); + } + } + + void postLoopRequirements() override { + // Export the C arrays from the structure + if constexpr (std::is_same_v) { + status_ = armpl_spmat_export_csr_s(C_armpl_, + 0, + &m_armpl_, + &n_armpl_, + &C_rows_, + &C_cols_, + &C_vals_); + } else if constexpr (std::is_same_v) { + status_ = armpl_spmat_export_csr_d(C_armpl_, + 0, + &m_armpl_, + &n_armpl_, + &C_rows_, + &C_cols_, + &C_vals_); + } else { + // Un-specialised class will not do any work - print error and exit. + std::cout << "ERROR - Datatype for ArmPL CPU SpMSpM kernel not supported." << std::endl; + exit(1); + } + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR: " << status_ << std::endl; + exit(1); + } + C_nnz_ = nnzC_armpl_ = C_rows_[m_armpl_]; + + // ARMPL does not seem to enforce ordered column indices in its + // output matrices. Therefore, to allow the checksum to take place + // We have to order the output matrix here. 
+ for (int i = 0; i < m_; i++) { + int start = C_rows_[i]; + int end = C_rows_[i + 1]; + int len = end - start; + if (len > 1) { + std::vector> row_entries(len); + for (int j = 0; j < len; j++) { + row_entries[j] = {C_cols_[start + j], C_vals_[start + j]}; + } + + std::sort(row_entries.begin(), row_entries.end(), + [](const auto &a, const auto &b) { return a.first < b.first; }); + + for (int j = 0; j < len; j++) { + C_cols_[start + j] = row_entries[j].first; + C_vals_[start + j] = row_entries[j].second; + } + } + } + } + + void postCallKernelCleanup() override { + status_ = armpl_spmat_destroy(A_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_destroy(B_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_destroy(C_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + free(A_rows_); + free(A_cols_); + free(A_vals_); + free(B_rows_); + free(B_cols_); + free(B_vals_); + free(C_rows_); + free(C_cols_); + free(C_vals_); + } + + const T alpha = ALPHA; + const T beta = BETA; + + + armpl_status_t status_; + + armpl_int_t n_armpl_; + armpl_int_t m_armpl_; + armpl_int_t k_armpl_; + armpl_int_t nnzA_armpl_; + armpl_int_t nnzB_armpl_; + armpl_int_t nnzC_armpl_; + + armpl_int_t* A_cols_; + armpl_int_t* A_rows_; + T* A_vals_; + + armpl_int_t* B_rows_; + armpl_int_t* B_cols_; + T* B_vals_; + + armpl_int_t* C_rows_; + armpl_int_t* C_cols_; + // No C_vals_ needed as inheriting from + // parent in order to allow result check to carry out + + armpl_spmat_t A_armpl_; + armpl_spmat_t B_armpl_; + armpl_spmat_t C_armpl_; + +}; +} // namespace cpu +#endif \ No newline at end of file diff --git a/Makefile b/Makefile index 5dd2fc5..d4dbcbe 100644 --- a/Makefile +++ b/Makefile @@ -51,10 +51,10 @@ CXX = $(CXX_$(COMPILER)) CXXFLAGS_ARM = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native CXXFLAGS_CLANG = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native -CXXFLAGS_GNU = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native -CXXFLAGS_INTEL = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native -Wno-tautological-constant-compare +CXXFLAGS_GNU = -std=c++17 -Wall -Wno-deprecated-declarations -Ofast -$(ARCHFLAG)=native +CXXFLAGS_INTEL = -std=c++17 -Wall -O3 -ffast-math -$(ARCHFLAG)=native -Wno-tautological-constant-compare CXXFLAGS_NVIDIA = -std=c++17 -Wall -O3 -fast -$(ARCHFLAG)=native -CXXFLAGS_HIP = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native +CXXFLAGS_HIP = -std=c++17 -Wall -O3 -ffast-math -$(ARCHFLAG)=native ifndef CXXFLAGS CXXFLAGS = $(CXXFLAGS_$(COMPILER)) @@ -98,16 +98,16 @@ $(error Must add `MKLROOT=/path/to/mkl/` to make command to use OneMKL CPU Libra endif # Add INTEL compiler options ifeq ($(COMPILER), INTEL) -override CXXFLAGS += -L$(MKLROOT)/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl -qmkl=parallel -DMKL_INT=int +override CXXFLAGS += -L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl -qmkl=parallel -DMKL_INT=int # Add GNU compiler options else ifeq ($(COMPILER), GNU) -override CXXFLAGS += -m64 -L$(MKLROOT)/lib -Wl,--no-as-needed -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl -I"${MKLROOT}/include" -DMKL_INT=int +override CXXFLAGS += -m64 -L$(MKLROOT)/lib/intel64 -Wl,--no-as-needed -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl -I"${MKLROOT}/include" -DMKL_INT=int $(warning 
Users may be required to do the following to use $(COMPILER) with $(CPU_LIB):) $(info $(TAB)$(TAB)Add `/lib` to `$$LD_LIBRARY_PATH`) $(info ) # Add CLANG compiler options else ifeq ($(COMPILER), CLANG) -override CXXFLAGS += -L$(MKLROOT)/lib -Wl,--no-as-needed -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl -m64 -I"${MKLROOT}/include" -DMKL_INT=int +override CXXFLAGS += -L$(MKLROOT)/lib/intel64 -Wl,--no-as-needed -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl -m64 -I"${MKLROOT}/include" -DMKL_INT=int $(warning Users may be required to do the following to use $(COMPILER) with $(CPU_LIB):) $(info $(TAB)$(TAB)Add `/lib` to `$$LD_LIBRARY_PATH`) $(info ) @@ -118,10 +118,11 @@ endif HEADER_FILES+= $(wildcard oneMKL/CPU/*.hh) else ifeq ($(CPU_LIB), AOCL) +override CXXFLAGS += -laoclutils -lblis -lflame -laoclsparse ifeq ($(COMPILER), INTEL) -override CXXFLAGS += -lblis-mt -qopenmp +override CXXFLAGS += -qopenmp else -override CXXFLAGS += -lblis-mt -fopenmp +override CXXFLAGS += -fopenmp endif $(warning Users may be required to do the following to use $(COMPILER) with $(CPU_LIB):) $(info $(TAB)$(TAB)Add `CXXFLAGS="-L/lib -I/include/blis -Wl,-rpath,/lib"` to make command) @@ -170,14 +171,14 @@ $(warning GPU_LIB not set (use CUBLAS, ONEMKL, ROCBLAS). No GPU kernels will be else ifeq ($(GPU_LIB), CUBLAS) # Do cuBLAS stuff ifeq ($(COMPILER), NVIDIA) -override CXXFLAGS += -cudalib=cublas +override CXXFLAGS += -cudalib=cublas -lcusparse_static else $(warning Users may be required to do the following to use $(COMPILER) with $(GPU_LIB):) $(info $(TAB)$(TAB)Add `CXXFLAGS=-L/.../math_libs/lib64 -L/.../cuda/lib64` to make command) $(info $(TAB)$(TAB)Add `CXXFLAGS=-I/.../math_libs/include -I/.../cuda/include` to make command) $(info $(TAB)$(TAB)Add `CXXFLAGS=-Wl,-rpath,/.../math_libs/lib64 -Wl,-rpath,/.../cuda/lib64` to make command) $(info ) -override CXXFLAGS += -lcublas -lcudart +override CXXFLAGS += -lcublas -lcudart -lcusparse endif HEADER_FILES += $(wildcard cuBLAS/*.hh) @@ -188,7 +189,7 @@ ifndef MKLROOT $(error Must add `MKLROOT=/path/to/mkl/` to make command to use OneMKL CPU Library) endif # Add compiler and link options -override CXXFLAGS += -fsycl -L$(MKLROOT)/lib -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -lmkl_core -lsycl -lpthread -lm -ldl -fsycl -DMKL_ILP64 -I"$(MKLROOT)/include" +override CXXFLAGS += -fsycl -L$(MKLROOT)/lib/intel64 -lmkl_sycl_blas -lmkl_sycl_sparse -lmkl_intel_lp64 -lmkl_tbb_thread -ltbb -lmkl_core -lsycl -lpthread -lm -ldl -fsycl -DMKL_LP64 -I"$(MKLROOT)/include" # `lmkl_tbb_thread` can replace `lmkl_sequential` $(warning Users may be required to do the following to use $(COMPILER) with $(GPU_LIB):) $(info $(TAB)$(TAB)Add `/lib` to `$$LD_LIBRARY_PATH`) @@ -199,17 +200,17 @@ $(error Selected compiler $(COMPILER) is not currently compatible with oneMKL GP endif else ifeq ($(GPU_LIB), ROCBLAS) -ifeq ($(COMPILER), HIP) +# ifeq ($(COMPILER), HIP) # Do rocBLAS stuff -override CXXFLAGS += -lrocblas -lm -lpthread -D__HIP_PLATFORM_AMD__ +override CXXFLAGS += -lrocblas -lrocsparse -lm -lpthread -D__HIP_PLATFORM_AMD__ $(warning Users may be required to do the following to use $(COMPILER) with $(GPU_LIB):) $(info $(TAB)$(TAB)Add `CXXFLAGS=-L/lib -L/lib` to make command) $(info $(TAB)$(TAB)Add `CXXFLAGS=-I/include -I/include` to make command) $(info $(TAB)$(TAB)Add `CXXFLAGS=-Wl,-rpath,/lib -Wl,-rpath,/lib` to make command) HEADER_FILES += $(wildcard rocBLAS/*.hh) -else -$(error Selected compiler $(COMPILER) is not currently 
compatible with rocBLAS GPU Library) -endif +# else +# $(error Selected compiler $(COMPILER) is not currently compatible with rocBLAS GPU Library) +# endif else @@ -225,7 +226,7 @@ ifdef GPU_LIB override CXXFLAGS += -DGPU_$(GPU_LIB) endif -LDFLAGS = -lm +LDFLAGS = -lm # ------- @@ -233,11 +234,28 @@ EXE = gpu-blob .PHONY: all $(EXE) clean -all: $(EXE) +all: print $(EXE) + +print: + @echo "COMPILER = $(COMPILER)" + @echo "CXX = $(CXX)" + @echo "CPU_LIB = $(CPU_LIB)" + @echo "GPU_LIB = $(GPU_LIB)" + @echo "CXXFLAGS = $(CXXFLAGS)" + @echo "LDFLAGS = $(LDFLAGS)" + @echo "Full command would be:" + @echo "$(CXX) $(SRC_FILES) $(CXXFLAGS) -Lsrc/Consume -Wl,-rpath,src/Consume -lconsume $(LDFLAGS) -o gpu-blob" + @echo "░░ ░░░ ░░░ ░░░░ ░░░░░░░░ ░░░ ░░░░░░░░░ ░░░ ░░" + @echo "▒ ▒▒▒▒▒▒▒▒ ▒▒▒▒ ▒▒ ▒▒▒▒ ▒▒▒▒▒▒▒▒ ▒▒▒▒ ▒▒ ▒▒▒▒▒▒▒▒ ▒▒▒▒ ▒▒ ▒▒▒▒ ▒" + @echo "▓ ▓▓▓ ▓▓ ▓▓▓ ▓▓▓▓ ▓▓ ▓▓ ▓▓▓ ▓▓▓▓▓▓▓▓ ▓▓▓▓ ▓▓ ▓▓" + @echo "█ ████ ██ ████████ ████ ████████ ████ ██ ████████ ████ ██ ████ █" + @echo "██ ███ █████████ █████████ ███ ███ ███ ██" + $(EXE): src/Consume/consume.c $(SRC_FILES) $(HEADER_FILES) gcc src/Consume/consume.c -fpic -O0 -shared -o src/Consume/libconsume.so - $(CXX) $(SRC_FILES) $(CXXFLAGS) -Lsrc/Consume -Wl,-rpath,src/Consume -lconsume $(LDFLAGS) -o $@ + @echo "Building main executable with $(CXX)" + $(CXX) $(SRC_FILES) -o $@ $(CXXFLAGS) -Lsrc/Consume -Wl,-rpath,src/Consume -lconsume $(LDFLAGS) clean: - rm -f $(EXE) src/Consume/libconsume.so \ No newline at end of file + rm -f $(EXE) src/Consume/libconsume.so diff --git a/NVPL/spmdnm.hh b/NVPL/spmdnm.hh new file mode 100644 index 0000000..1dc2bc0 --- /dev/null +++ b/NVPL/spmdnm.hh @@ -0,0 +1,43 @@ +#pragma once + +#ifdef CPU_NVPL + +#include "../include/kernels/CPU/spmdnm.hh" +#include "../include/utilities.hh" + +namespace cpu { +template +class spmdnm_cpu : public spmdnm { +public: + using spmdnm::spmdnm; + using spmdnm::callConsume; + using spmdnm::initInputMatrices; + using spmdnm::m_; + using spmdnm::n_; + using spmdnm::k_; + using spmdnm::B_; + using spmdnm::C_; + using spmdnm::sparsity_; + using spmdnm::type_; + using spmdnm::nnz_; + using spmdnm::iterations_; + + void initialise(int m, int n, int k, double sparsity, + matrixType type, bool binary = false) {} + +protected: + void toSparseFormat() override {} + +private: + void preLoopRequirements() override {} + + void callSpmdnm() override {} + + void postLoopRequirements() override {} + + void postCallKernelCleanup() override {} +}; +} + + +#endif diff --git a/NVPL/spmdnv.hh b/NVPL/spmdnv.hh new file mode 100644 index 0000000..e3f4353 --- /dev/null +++ b/NVPL/spmdnv.hh @@ -0,0 +1,213 @@ +#pragma once + +#ifdef CPU_NVPL +#include + +#include "../include/kernels/CPU/spmdnv.hh" +#include "../include/utilities.hh" + +namespace cpu { +/** A class for SpMDnV CPU BLAS kernels. 
*/ +template <typename T> +class spmdnv_cpu : public spmdnv<T> { + public: + using spmdnv<T>::spmdnv; + using spmdnv<T>::callConsume; + using spmdnv<T>::initInputMatrixVector; + using spmdnv<T>::m_; + using spmdnv<T>::n_; + using spmdnv<T>::x_; + using spmdnv<T>::y_; + using spmdnv<T>::sparsity_; + using spmdnv<T>::type_; + using spmdnv<T>::nnz_; + using spmdnv<T>::iterations_; + + void initialise(int m, int n, double sparsity, matrixType type, + bool binary = false) { + m_ = m; + n_ = n; + sparsity_ = sparsity; + type_ = type; + + nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_)); + + if constexpr (std::is_same_v<T, float>) { + dataType_ = NVPL_SPARSE_R_32F; + } else if constexpr (std::is_same_v<T, double>) { + dataType_ = NVPL_SPARSE_R_64F; + } else { + throw std::runtime_error("Only float and double are supported for NVPL."); + } + + x_ = (T*)calloc(n_, sizeof(T)); + y_ = (T*)calloc(m_, sizeof(T)); + z_ = (T*)calloc(m_, sizeof(T)); + + initInputMatrixVector(); + } + +protected: + void toSparseFormat() override { + A_vals_ = (T*)calloc(nnz_, sizeof(T)); + A_cols_ = (int64_t*)calloc(nnz_, sizeof(int64_t)); + A_rows_ = (int64_t*)calloc(m_ + 1, sizeof(int64_t)); + + // Fill the CSR arrays + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_); + } else { + std::cerr << "Matrix type not supported" << std::endl; + exit(1); + } + + // Make the NVPL descriptors + status_ = nvpl_sparse_create_const_csr(&A_descr_, + m_, + n_, + nnz_, + A_rows_, + A_cols_, + A_vals_, + indexType_, + indexType_, + base_, + dataType_); + if (status_ != NVPL_SPARSE_STATUS_SUCCESS) { + std::cerr << "nvpl_sparse_create_const_csr failed with error: " << status_ << std::endl; + exit(1); + } + + status_ = nvpl_sparse_create_const_dn_vec(&X_descr_, + n_, + x_, + dataType_); + if (status_ != NVPL_SPARSE_STATUS_SUCCESS) { + std::cerr << "nvpl_sparse_create_const_dn_vec failed with error: " << status_ << std::endl; + exit(1); + } + + status_ = nvpl_sparse_create_dn_vec(&Y_descr_, + m_, + y_, + dataType_); + if (status_ != NVPL_SPARSE_STATUS_SUCCESS) { + std::cerr << "nvpl_sparse_create_dn_vec failed with error: " << status_ << std::endl; + exit(1); + } + status_ = nvpl_sparse_create_dn_vec(&Z_descr_, + m_, + z_, + dataType_); + if (status_ != NVPL_SPARSE_STATUS_SUCCESS) { + std::cerr << "nvpl_sparse_create_dn_vec failed with error: " << status_ << std::endl; + exit(1); + } + } + +private: + void preLoopRequirements() override {} + + void callSpMDnV() override { + size_t bufferSize; + status_ = nvpl_sparse_spmv_buffer_size(handle_, + operation_, + &alpha, + A_descr_, + X_descr_, + &beta, + Z_descr_, + Y_descr_, + dataType_, + algorithm_, + description_, + &bufferSize); + if (status_ != NVPL_SPARSE_STATUS_SUCCESS) { + std::cerr << "nvpl_sparse_spmv_buffer_size failed with error: " << status_ << std::endl; + exit(1); + } + + void* externalBuffer = malloc(bufferSize); + status_ = nvpl_sparse_spmv_analysis(handle_, + operation_, + &alpha, + A_descr_, + X_descr_, + &beta, + Z_descr_, + Y_descr_, + dataType_, + algorithm_, + description_, + externalBuffer); + if (status_ != NVPL_SPARSE_STATUS_SUCCESS) { + std::cerr << "nvpl_sparse_spmv_analysis failed with error: " << status_ << std::endl; + exit(1); + } + + status_ = nvpl_sparse_spmv(handle_, + operation_, + &alpha, + A_descr_, + X_descr_, + &beta, + Z_descr_, + Y_descr_, + dataType_, + algorithm_, +
description_); + if (status_ != NVPL_SPARSE_STATUS_SUCCESS) { + std::cerr << "nvpl_sparse_spmv failed with error: " << status_ << std::endl; + exit(1); + } + + free(externalBuffer); + } + + void postLoopRequirements() override {} + + void postCallKernelCleanup() override { + free(x_); + free(y_); + free(z_); + free(A_rows_); + free(A_cols_); + free(A_vals_); + } + + nvpl_sparse_status_t status_; + nvpl_sparse_handle_t handle_; + nvpl_sparse_spmv_descr_t description_; + + nvpl_sparse_spmv_alg_t algorithm_ = NVPL_SPARSE_SPMV_CSR_ALG1; + nvpl_sparse_operation_t operation_ = NVPL_SPARSE_OPERATION_NON_TRANSPOSE; + nvpl_sparse_data_type_t dataType_; + nvpl_sparse_index_type_t indexType_ = NVPL_SPARSE_INDEX_64I; + nvpl_sparse_index_base_t base_ = NVPL_SPARSE_INDEX_BASE_ZERO; + + // Note on naming: for consistency with the other libraries, which do not + // have a separate addition vector, Y is kept as the output vector here, + // even though the NVPL documentation uses Y for the addition vector and + // Z for the output vector. + nvpl_sparse_const_sp_mat_descr_t A_descr_; + nvpl_sparse_const_dn_vec_descr_t X_descr_; + nvpl_sparse_dn_vec_descr_t Z_descr_; + nvpl_sparse_dn_vec_descr_t Y_descr_; + + // CSR arrays for matrix A and the unused addition vector Z + int64_t* A_rows_; + int64_t* A_cols_; + T* A_vals_; + T* z_; + + + const T alpha = ALPHA; + const T beta = BETA; +}; +} // namespace cpu +#endif \ No newline at end of file diff --git a/NVPL/spmspm.hh b/NVPL/spmspm.hh new file mode 100644 index 0000000..ac63086 --- /dev/null +++ b/NVPL/spmspm.hh @@ -0,0 +1,30 @@ +#pragma once + +#ifdef CPU_NVPL + +#include "../include/kernels/CPU/spmspm.hh" +#include "../include/utilities.hh" + +namespace cpu { +template <typename T> +class spmspm_cpu : public spmspm<T> { +public: + + void initialise(int m, int n, int k, double sparsity, + matrixType type, bool binary = false) {} + +protected: + void toSparseFormat() override {} + +private: + void preLoopRequirements() override {} + + void callSpmspm() override {} + + void postLoopRequirements() override {} + + void postCallKernelCleanup() override {} +}; +} + +#endif diff --git a/README.md b/README.md index d6f4161..1e6cd5e 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,10 @@ Only when an error occurs will any checksum be displayed to the user. GFLOP/s are calculated using the following Total FLOPs formulas. The compute time excludes any initialisation, but does include any data movement / prefetching to/from the GPU device: - **GEMM** : `FLOPs = (2 * M * N * K) + (b * M * N)` where `b` is `1` if BETA=0 and `3` if BETA=/=0 + - **SPMDNM** : `FLOPs = (2 * N * NNZ)` where NNZ is the number of non-zero values in matrix A + - **SPMSPM** : `FLOPs = (NNZA * NNZB) / K` where NNZA is the number of non-zero values in matrix A and NNZB is the number of non-zero values in matrix B. This is the expected number of FLOPs, assuming the non-zero values are uniformly distributed across the columns of matrix A and the rows of matrix B: each of the NNZA non-zeros in A is then expected to pair with NNZB / K non-zeros in the matching row of B - **GEMV** : `FLOPs = (2 * M * N) + (b * M)` where `b` is `1` if BETA=0 and `3` if BETA=/=0 + - **SPMDNV** : `FLOPs = (2 * NNZ)` where NNZ is the number of non-zero values in matrix A # Build Options Select the compiler you wish to use. Regardless of choice, `gcc` is required in order to build the `Consume.so` external library. 
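For reference, the Total FLOPs formulas added to the README above are easy to sanity-check in a few lines of Python. The sketch below is illustrative only and not part of the repository: the helper name `total_flops` is hypothetical, the non-zero count mirrors the `nnz_ = 1 + ...` expression used by the sparse kernels, and the final line assumes GFLOP/s is reported as total FLOPs across all iterations divided by total seconds.

    # Hypothetical helper (not in the repo): sanity-check the Total FLOPs formulas.
    def total_flops(kernel, M, N, K=1, nnz=0, nnz_a=0, nnz_b=0, beta=0.0):
        b = 1 if beta == 0 else 3
        if kernel == "gemm":      # dense matrix-matrix
            return (2 * M * N * K) + (b * M * N)
        if kernel == "spmdnm":    # sparse x dense matrix, NNZ non-zeros in A
            return 2 * N * nnz
        if kernel == "spmspm":    # sparse x sparse matrix, expected FLOPs
            return (nnz_a * nnz_b) / K
        if kernel == "gemv":      # dense matrix-vector
            return (2 * M * N) + (b * M)
        if kernel == "spmdnv":    # sparse matrix x dense vector
            return 2 * nnz
        raise ValueError("unknown kernel: {}".format(kernel))

    # Example: 4096x4096 SpMDnV at 99.9% sparsity, 100 iterations in 0.05 s.
    nnz = 1 + int(4096 * 4096 * (1.0 - 0.999))          # same expression as the C++ kernels
    gflops = total_flops("spmdnv", M=4096, N=4096, nnz=nnz) * 100 / 0.05 / 1e9
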
@@ -126,18 +129,22 @@ The kernels listed below are computed by the benchmark for a wide range of probl - FP32, FP64 - Square, short-&-wide, tall-&-thin input sizes - + - Square, short-&-wide, tall-&-thin input sizes + + - SpMSpM + - FP32, FP64 + - Square, short-&-wide, tall-&-thin input sizes ### Level 2 BLAS - GEMV - FP32, FP64 - Square, short-&-wide, tall-&-thin input sizes - + - Square, short-&-wide, tall-&-thin input sizes # Auxiliary Files Additional to the main benchmark, there are two auxiliary python scripts which perform the following: @@ -146,7 +153,6 @@ Additional to the main benchmark, there are two auxiliary python scripts which p # Future Work - - [ ] Add support for Sparce Kernels - [ ] Add FP16/BF16 support for kernels - [ ] Add batched GEMM functions - [ ] Add support for Apple Accelerate diff --git a/calculateOffloadThreshold.py b/calculateOffloadThreshold.py index 38c2646..43028c0 100644 --- a/calculateOffloadThreshold.py +++ b/calculateOffloadThreshold.py @@ -165,7 +165,7 @@ def printResults(once:offloadThreshold, always:offloadThreshold, unified:offload gpuAlways.M = 0 gpuAlways.N = 0 gpuAlways.K = 0 - if(gpuUnified.M != 0 and float(cpu[8]) >= float(gpuU[8])): + if("gemm" in kernel and gpuUnified.M != 0 and float(cpu[8]) >= float(gpuU[8])): # Do check to see if this is a momentary drop that we should ignore if (prevGpuUgflops <= float(cpu[8])) and (float(gpuLines[2].split(',')[8]) <= float(cpu[8])): gpuUnified.cpuGflops = 0.0 diff --git a/createFlopsPerSizeGraphs.py b/createFlopsPerSizeGraphs.py new file mode 100644 index 0000000..1e50301 --- /dev/null +++ b/createFlopsPerSizeGraphs.py @@ -0,0 +1,988 @@ +import os +import sys +import matplotlib.pyplot as plt + + + +directory = "CSV_Results" +# Get given CSV file directory +if(len(sys.argv) > 1): + directory = sys.argv[1] + +outputDir = "Graphs_" + directory.replace('/', '_') + +# Check if CSV directory exists +path = os.path.join(os.getcwd(), directory) +if(not os.path.isdir(path)): + print("ERROR - {} directory does not exist. 
Cannot generate any graphs.".format(directory)) + exit(1) + +# Get all filenames +path = os.path.join(os.getcwd(), directory) +filenames = os.listdir(path) + +# Make Graphs directory +graphDir = os.path.join(os.getcwd(), outputDir) +if(not os.path.isdir(graphDir)): + os.mkdir(graphDir) + +# ------------------------------ GEMV Graphs -------------------------------------------- +print("Creating GEMV graphs...") +# Create GEMV graphs +gemvFilenames = [] +for i in range(0, len(filenames)): + if "gemv_" in filenames[i] and "spgemv_" not in filenames[i]: + gemvFilenames.append(filenames[i]) + +### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s +for i in range(0, len(gemvFilenames)): + mn = [] + iters = 0 + kernel = "" + cpu_Gflops = [] + gpuO_Gflops = [] + gpuA_Gflops = [] + gpuU_Gflops = [] + prob_size = [] + + # Open file and get all lines + fName = os.path.join(os.getcwd(), directory, gemvFilenames[i]) + openFile = open(fName, 'r') + lines = openFile.readlines() + lines.pop(0) # Remove headers + if len(lines) == 0 : + continue + + # Get number of iterations performed and kernel name + line1 = lines[0].split(',') + iters = int(line1[7]) + kernel = line1[1] + + # Get gflops (y-axis) and MN values (x-axis) for CPU and all GPU types + for line in lines: + line = line.split(',') + # Get MN + if (len(mn) == 0) or ([line[2], line[3]] not in mn): + mn.append([line[2], line[3]]) + # Get Gflops + gflops = float(line[-1].rstrip()) + size = float(line[5].rstrip()) + if line[0] == "cpu": + cpu_Gflops.append(gflops) + prob_size.append(size) + elif line[0] == "gpu_offloadOnce": + gpuO_Gflops.append(gflops) + elif line[0] == "gpu_offloadAlways": + gpuA_Gflops.append(gflops) + elif line[0] == "gpu_unified": + gpuU_Gflops.append(gflops) + + + # Create x-axis label and tick values + inputTypeStr = "" + x_name = "" + xVals = [] + if "_square_vector_M=N" in gemvFilenames[i]: + x_name = "Value of M, N" + inputTypeStr = "Square x Vector (M=N)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_tall-thin_vector_M=16N" in gemvFilenames[i]: + x_name = "Value of N where M=16N" + inputTypeStr = "Tall-Thin x Vector (M=16N)" + for j in range(0, len(mn)): + xVals.append(mn[j][1]) + elif "_tall-thin_vector_M_N=32" in gemvFilenames[i]: + x_name = "Value of M, where N=32" + inputTypeStr = "Tall-Thin x Vector (M, N=32)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_short-wide_vector_N=16M" in gemvFilenames[i]: + x_name = "Value of M, where N=16M" + inputTypeStr = "Short-Wide x Vector (N=16M)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_short-wide_vector_M=32_N" in gemvFilenames[i]: + x_name = "Value of N, where M=32" + inputTypeStr = "Short-Wide x Vector (M=32, N)" + for j in range(0, len(mn)): + xVals.append(mn[j][1]) + else: + # File not supported so go to next file + continue + + # Create y-axis label & graph title + y_name = "" + title = "" + fp = "" + if kernel == "sgemv" : + fp = "FP32" + elif kernel == "dgemv": + fp = "FP64" + y_name = "{} GFLOP/s".format(fp) + title = "{}GEMV Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters) + + # Make Graph + fig1 = plt.figure(figsize=(28,16)) + ax1 = fig1.add_subplot() + + gpuEnabled = False + if len(cpu_Gflops) > 0: + ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU") + # Plot line at max GFLOP/s + yCoord = round(max(cpu_Gflops),1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, 
y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + if len(gpuO_Gflops) > 0: + ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)") + gpuEnabled = True + if len(gpuA_Gflops) > 0: + ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)") + gpuEnabled = True + if len(gpuU_Gflops) > 0: + ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)") + gpuEnabled = True + if len(prob_size) > 0: + ax2 = ax1.twinx() + ax2.plot(xVals, prob_size, color="red", linestyle="--", marker="s", label="Problem Size (KiB)") + ax2.set_ylabel("Problem Size (KiB)", color="red", fontsize=14) + ax2.tick_params(axis='y', labelcolor="red") + ax2.set_ylim(min(prob_size) * 0.9, max(prob_size) * 1.1) + lines_1, labels_1 = ax1.get_legend_handles_labels() + lines_2, labels_2 = ax2.get_legend_handles_labels() + ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper left") + + if(gpuEnabled): + yCoord = round(max([max(gpuO_Gflops), max(gpuA_Gflops), max(gpuU_Gflops)]) ,1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + + # Set X ticks + NUM_TICK = 8 + numXVals = len(xVals) + if numXVals < NUM_TICK: + # Print all labels + plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20) + else: + # Calculate labels + locInterval = int((numXVals) / (NUM_TICK-1)) + tickLocs = [0] + for q in range(1, (NUM_TICK-1)): + tickLocs.append(1 + (locInterval * q)) + tickLocs.append(numXVals - 1) + + labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1)) + tickLabs = [xVals[0]] + for q in range(1, (NUM_TICK-1)): + tickLabs.append(int(xVals[0]) + (labelInterval * q)) + tickLabs.append(int(xVals[-1])) + + plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20) + + # Force setting of y-axis labels. If this isn't done then the range is weird... 
+ yLoc, yLab = plt.yticks() + yLoc = yLoc.tolist() + # Remove negative first element of the list + if yLoc[0] != 0: + yLoc = yLoc[1:] + plt.ylim(0, yLoc[-1]) + plt.yticks(ticks=yLoc, fontsize=20) + + plt.margins(x=0.01, y=0.01) + leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) + for obj in leg.legend_handles: + obj.set_linewidth(3.0) + obj.set_markersize(15.0) + obj.set_markeredgewidth(3.0) + plt.xlabel(x_name, fontsize=20) + plt.ylabel(y_name, fontsize=20) + plt.title(title, fontsize=20) + plt.savefig(fname="{}/{}.pdf".format(graphDir, gemvFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") + plt.close('all') + print("\tPDF made") + + +print("Finished!") +# --------------------------------------------------------------------------------------- + +# ------------------------------ SpMDnV Graphs -------------------------------------------- +print("Creating SpMDnV graphs...") +# Create GEMV graphs +spmdnvFilenames = [] +for i in range(0, len(filenames)): + if "spmdnv_" in filenames[i]: + spmdnvFilenames.append(filenames[i]) + +### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s +for i in range(0, len(spmdnvFilenames)): + mn = [] + iters = 0 + kernel = "" + cpu_Gflops = [] + gpuO_Gflops = [] + gpuA_Gflops = [] + gpuU_Gflops = [] + prob_size = [] + + # Open file and get all lines + fName = os.path.join(os.getcwd(), directory, spmdnvFilenames[i]) + openFile = open(fName, 'r') + lines = openFile.readlines() + lines.pop(0) # Remove headers + if len(lines) == 0 : + continue + + # Get number of iterations performed and kernel name + line1 = lines[0].split(',') + iters = int(line1[7]) + kernel = line1[1] + + # Get gflops (y-axis) and MN values (x-axis) for CPU and all GPU types + for line in lines: + line = line.split(',') + # Get MN + if (len(mn) == 0) or ([line[2], line[3]] not in mn): + mn.append([line[2], line[3]]) # line[2] = M, line[3] = N + # Get Gflops + gflops = float(line[-1].rstrip()) + size = float(line[5].rstrip()) + if line[0] == "cpu": + cpu_Gflops.append(gflops) + prob_size.append(size) + elif line[0] == "gpu_offloadOnce": + gpuO_Gflops.append(gflops) + elif line[0] == "gpu_offloadAlways": + gpuA_Gflops.append(gflops) + elif line[0] == "gpu_unified": + gpuU_Gflops.append(gflops) + + + # Create x-axis label and tick values + inputTypeStr = "" + x_name = "" + xVals = [] + if "_square_vector_M=N" in spmdnvFilenames[i]: + x_name = "Value of M, N" + inputTypeStr = "Square x Vector (M=N)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_tall-thin_vector_M=16N" in spmdnvFilenames[i]: + x_name = "Value of N where M=16N" + inputTypeStr = "Tall-Thin x Vector (M=16N)" + for j in range(0, len(mn)): + xVals.append(mn[j][1]) + elif "_tall-thin_vector_M_N=32" in spmdnvFilenames[i]: + x_name = "Value of M, where N=32" + inputTypeStr = "Tall-Thin x Vector (M, N=32)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_short-wide_vector_N=16M" in spmdnvFilenames[i]: + x_name = "Value of M, where N=16M" + inputTypeStr = "Short-Wide x Vector (N=16M)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_short-wide_vector_M=32_N" in spmdnvFilenames[i]: + x_name = "Value of N, where M=32" + inputTypeStr = "Short-Wide x Vector (M=32, N)" + for j in range(0, len(mn)): + xVals.append(mn[j][1]) + else: + # File not supported so go to next file + continue + + # Create y-axis label & graph title + y_name = "" + title = "" + fp = "" + if kernel == "sspmdnv" : + fp = "FP32" + elif kernel 
== "dspmdnv": + fp = "FP64" + y_name = "{} GFLOP/s".format(fp) + title = "{}SpMDnV Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters) + + # Make Graph + fig1 = plt.figure(figsize=(28,16)) + ax1 = fig1.add_subplot() + + gpuEnabled = False + if len(cpu_Gflops) > 0: + ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU") + # Plot line at max GFLOP/s + yCoord = round(max(cpu_Gflops),1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + if len(gpuO_Gflops) > 0: + ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)") + gpuEnabled = True + if len(gpuA_Gflops) > 0: + ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)") + gpuEnabled = True + if len(gpuU_Gflops) > 0: + ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)") + gpuEnabled = True + if len(prob_size) > 0: + ax2 = ax1.twinx() + ax2.plot(xVals, prob_size, color="red", linestyle="--", marker="s", label="Problem Size (KiB)") + ax2.set_ylabel("Problem Size (KiB)", color="red", fontsize=14) + ax2.tick_params(axis='y', labelcolor="red") + ax2.set_ylim(min(prob_size) * 0.9, max(prob_size) * 1.1) + lines_1, labels_1 = ax1.get_legend_handles_labels() + lines_2, labels_2 = ax2.get_legend_handles_labels() + ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper left") + + if(gpuEnabled): + yCoord = round(max([max(gpuO_Gflops), max(gpuA_Gflops), max(gpuU_Gflops)]) ,1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + + # Set X ticks + NUM_TICK = 8 + numXVals = len(xVals) + if numXVals < NUM_TICK: + # Print all labels + plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20) + else: + # Calculate labels + locInterval = int((numXVals) / (NUM_TICK-1)) + tickLocs = [0] + for q in range(1, (NUM_TICK-1)): + tickLocs.append(1 + (locInterval * q)) + tickLocs.append(numXVals - 1) + + labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1)) + tickLabs = [xVals[0]] + for q in range(1, (NUM_TICK-1)): + tickLabs.append(int(xVals[0]) + (labelInterval * q)) + tickLabs.append(int(xVals[-1])) + + plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20) + + # Force setting of y-axis labels. If this isn't done then the range is weird... 
+ yLoc, yLab = plt.yticks() + yLoc = yLoc.tolist() + # Remove negative first element of the list + if yLoc[0] != 0: + yLoc = yLoc[1:] + plt.ylim(0, yLoc[-1]) + plt.yticks(ticks=yLoc, fontsize=20) + + plt.margins(x=0.01, y=0.01) + leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) + for obj in leg.legend_handles: + obj.set_linewidth(3.0) + obj.set_markersize(15.0) + obj.set_markeredgewidth(3.0) + plt.xlabel(x_name, fontsize=20) + plt.ylabel(y_name, fontsize=20) + plt.title(title, fontsize=20) + plt.savefig(fname="{}/{}.pdf".format(graphDir, spmdnvFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") + plt.close('all') + print("\tPDF made") + + +print("Finished!") +# --------------------------------------------------------------------------------------- + +# ------------------------------ GEMM Graphs -------------------------------------------- +print("Creating GEMM graphs...") +# Create GEMM graphs +gemmFilenames = [] +for i in range(0, len(filenames)): + if "gemm_" in filenames[i] and "spgemm_" not in filenames[i]: + gemmFilenames.append(filenames[i]) + +### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s +for i in range(0, len(gemmFilenames)): + mnk = [] + iters = 0 + kernel = "" + cpu_Gflops = [] + gpuO_Gflops = [] + gpuA_Gflops = [] + gpuU_Gflops = [] + prob_size = [] + + # Open file and get all lines + fName = os.path.join(os.getcwd(), directory, gemmFilenames[i]) + openFile = open(fName, 'r') + lines = openFile.readlines() + lines.pop(0) # Remove headers + if len(lines) == 0 : + continue + + # Get number of iterations performed and kernel name + line1 = lines[0].split(',') + sparsity = float(line1[6]) + iters = int(line1[7]) + kernel = line1[1] + + # Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types + for line in lines: + line = line.split(',') + # Get MNK + if (len(mnk) == 0) or ([line[2], line[3], line[4]] not in mnk): + mnk.append([line[2], line[3], line[4]]) + # Get Gflops + gflops = float(line[-1].rstrip()) + size = float(line[5].rstrip()) + if line[0] == "cpu": + cpu_Gflops.append(gflops) + prob_size.append(size) + elif line[0] == "gpu_offloadOnce": + gpuO_Gflops.append(gflops) + elif line[0] == "gpu_offloadAlways": + gpuA_Gflops.append(gflops) + elif line[0] == "gpu_unified": + gpuU_Gflops.append(gflops) + + # Create x-axis label and tick values + inputTypeStr = "" + x_name = "" + xVals = [] + if "_square_square_M=N=K" in gemmFilenames[i]: + x_name = "Value of M, N, K" + inputTypeStr = "Square x Square (M=N=K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_tall-thin_short-wide_M=N_M=16K" in gemmFilenames[i]: + x_name = "Value of K where M=16K and N=16K" + inputTypeStr = "Tall-Thin x Short-Wide (M=N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_short-wide_M=N_K=32" in gemmFilenames[i]: + x_name = "Value of M and N, where K=32" + inputTypeStr = "Tall-Thin x Short-Wide (M=N, K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N_K=16M" in gemmFilenames[i]: + x_name = "Value of M and N, where K=16M" + inputTypeStr = "Short-Wide x Tall-Thin (M=N, K=16M)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N=32_K" in gemmFilenames[i]: + x_name = "Value of K, where M=32 and N=32" + inputTypeStr = "Short-Wide x Tall-Thin (M=N=32, K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N_M=16K" in gemmFilenames[i]: + 
x_name = "Value of N and K, where M=16K" + inputTypeStr = "Tall-Thin x Square (N=K, M=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N=32_M" in gemmFilenames[i]: + x_name = "Value of M, where N=32 and K=32" + inputTypeStr = "Tall-Thin x Square (M, N=K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_square_short-wide_M=K_N=16K" in gemmFilenames[i]: + x_name = "Value of M and K, where N=16K" + inputTypeStr = "Square x Short-Wide (M=K, N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_square_short-wide_M=K=32_N" in gemmFilenames[i]: + x_name = "Value of N, where M=32 and K=32" + inputTypeStr = "Square x Short-Wide (M=K=32, N)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][1]) + else: + # File not supported so go to next file + continue + + # Create y-axis label & graph title + y_name = "" + title = "" + fp = "" + if kernel == "sgemm" : + fp = "FP32" + elif kernel == "dgemm": + fp = "FP64" + y_name = "{} GFLOP/s".format(fp) + title = ("{}GEMM Performance for {} Problems (sparsity = {})- {} " + "iterations per problemize").format(kernel[0].upper(), + inputTypeStr, sparsity, iters) + + # Make Graph + fig1 = plt.figure(figsize=(28,16)) + ax1 = fig1.add_subplot() + + gpuEnabled = False + if len(cpu_Gflops) > 0: + ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU") + # Plot line at max GFLOP/s + yCoord = round(max(cpu_Gflops),1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + if len(gpuO_Gflops) > 0: + ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)") + gpuEnabled = True + if len(gpuA_Gflops) > 0: + ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)") + gpuEnabled = True + if len(gpuU_Gflops) > 0: + ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)") + gpuEnabled = True + if len(prob_size) > 0: + ax2 = ax1.twinx() + ax2.plot(xVals, prob_size, color="red", linestyle="--", marker="s", label="Problem Size (KiB)") + ax2.set_ylabel("Problem Size (KiB)", color="red", fontsize=14) + ax2.tick_params(axis='y', labelcolor="red") + ax2.set_ylim(min(prob_size) * 0.9, max(prob_size) * 1.1) + lines_1, labels_1 = ax1.get_legend_handles_labels() + lines_2, labels_2 = ax2.get_legend_handles_labels() + ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper left") + + if(gpuEnabled): + yCoord = round(max([max(gpuO_Gflops), max(gpuA_Gflops), max(gpuU_Gflops)]) ,1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + + # Set X ticks + NUM_TICK = 8 + numXVals = len(xVals) + if numXVals < NUM_TICK: + # Print all labels + plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20) + else: + # Calculate labels + locInterval = int((numXVals) / (NUM_TICK-1)) + tickLocs = [0] + for q in range(1, (NUM_TICK-1)): + tickLocs.append(1 + (locInterval * q)) + tickLocs.append(numXVals - 1) + + labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1)) + tickLabs = [xVals[0]] + for q in range(1, (NUM_TICK-1)): + tickLabs.append(int(xVals[0]) + (labelInterval * q)) + tickLabs.append(int(xVals[-1])) + + plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20) + + # Force setting of y-axis labels. If this isn't done then the range is weird... 
+ yLoc, yLab = plt.yticks() + yLoc = yLoc.tolist() + # Remove negative first element of the list + if yLoc[0] != 0: + yLoc = yLoc[1:] + plt.ylim(0, yLoc[-1]) + plt.yticks(ticks=yLoc, fontsize=20) + + plt.margins(x=0.01, y=0.01) + leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) + for obj in leg.legend_handles: + obj.set_linewidth(3.0) + obj.set_markersize(15.0) + obj.set_markeredgewidth(3.0) + plt.xlabel(x_name, fontsize=20) + plt.ylabel(y_name, fontsize=20) + plt.title(title, fontsize=20) + plt.savefig(fname="{}/{}.pdf".format(graphDir, gemmFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") + plt.close('all') + print("\tPDF made") + + +print("Finished!") +# --------------------------------------------------------------------------------------- + +# ------------------------------ SpGEMM Graphs -------------------------------------------- +print("Creating SpMDnM graphs...") +# Create SpMDnM graphs +spmdnmFilenames = [] +for i in range(0, len(filenames)): + if "spmdnm_" in filenames[i]: + spmdnmFilenames.append(filenames[i]) + +### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s +for i in range(0, len(spmdnmFilenames)): + mnk = [] + iters = 0 + kernel = "" + cpu_Gflops = [] + gpuO_Gflops = [] + gpuA_Gflops = [] + gpuU_Gflops = [] + prob_size = [] + + # Open file and get all lines + fName = os.path.join(os.getcwd(), directory, spmdnmFilenames[i]) + openFile = open(fName, 'r') + lines = openFile.readlines() + lines.pop(0) # Remove headers + if len(lines) == 0 : + continue + + # Get number of iterations performed and kernel name + line1 = lines[0].split(',') + sparsity = float(line1[6]) + iters = int(line1[7]) + kernel = line1[1] + + # Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types + for line in lines: + line = line.split(',') + # Get MNK + if (len(mnk) == 0) or ([line[2], line[3], line[4]] not in mnk): + mnk.append([line[2], line[3], line[4]]) + # Get Gflops + gflops = float(line[-1].rstrip()) + size = float(line[5].rstrip()) + if line[0] == "cpu": + cpu_Gflops.append(gflops) + prob_size.append(size) + elif line[0] == "gpu_offloadOnce": + gpuO_Gflops.append(gflops) + elif line[0] == "gpu_offloadAlways": + gpuA_Gflops.append(gflops) + elif line[0] == "gpu_unified": + gpuU_Gflops.append(gflops) + + # Create x-axis label and tick values + inputTypeStr = "" + x_name = "" + xVals = [] + if "_square_square_M=N=K" in spmdnmFilenames[i]: + x_name = "Value of M, N, K" + inputTypeStr = "Square x Square (M=N=K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_tall-thin_short-wide_M=N_M=16K" in spmdnmFilenames[i]: + x_name = "Value of K where M=16K and N=16K" + inputTypeStr = "Tall-Thin x Short-Wide (M=N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_short-wide_M=N_K=32" in spmdnmFilenames[i]: + x_name = "Value of M and N, where K=32" + inputTypeStr = "Tall-Thin x Short-Wide (M=N, K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N_K=16M" in spmdnmFilenames[i]: + x_name = "Value of M and N, where K=16M" + inputTypeStr = "Short-Wide x Tall-Thin (M=N, K=16M)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N=32_K" in spmdnmFilenames[i]: + x_name = "Value of K, where M=32 and N=32" + inputTypeStr = "Short-Wide x Tall-Thin (M=N=32, K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N_M=16K" in spmdnmFilenames[i]: + x_name 
= "Value of N and K, where M=16K" + inputTypeStr = "Tall-Thin x Square (N=K, M=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N=32_M" in spmdnmFilenames[i]: + x_name = "Value of M, where N=32 and K=32" + inputTypeStr = "Tall-Thin x Square (M, N=K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_square_short-wide_M=K_N=16K" in spmdnmFilenames[i]: + x_name = "Value of M and K, where N=16K" + inputTypeStr = "Square x Short-Wide (M=K, N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_square_short-wide_M=K=32_N" in spmdnmFilenames[i]: + x_name = "Value of N, where M=32 and K=32" + inputTypeStr = "Square x Short-Wide (M=K=32, N)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][1]) + else: + # File not supported so go to next file + continue + + # Create y-axis label & graph title + y_name = "" + title = "" + fp = "" + if kernel == "sspmdnm" : + fp = "FP32" + elif kernel == "dspmdnm": + fp = "FP64" + y_name = "{} GFLOP/s".format(fp) + title = ("{}SpMDnM Performance for {} Problems (sparsity = {})- {} " + "iterations per problemize").format(kernel[0].upper(), + inputTypeStr, sparsity, iters) + + # Make Graph + fig1 = plt.figure(figsize=(28,16)) + ax1 = fig1.add_subplot() + + gpuEnabled = False + if len(cpu_Gflops) > 0: + ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU") + # Plot line at max GFLOP/s + yCoord = round(max(cpu_Gflops),1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + if len(gpuO_Gflops) > 0: + ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)") + gpuEnabled = True + if len(gpuA_Gflops) > 0: + ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)") + gpuEnabled = True + if len(gpuU_Gflops) > 0: + ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)") + gpuEnabled = True + if len(prob_size) > 0: + ax2 = ax1.twinx() + ax2.plot(xVals, prob_size, color="red", linestyle="--", marker="s", label="Problem Size (KiB)") + ax2.set_ylabel("Problem Size (KiB)", color="red", fontsize=14) + ax2.tick_params(axis='y', labelcolor="red") + ax2.set_ylim(min(prob_size) * 0.9, max(prob_size) * 1.1) + lines_1, labels_1 = ax1.get_legend_handles_labels() + lines_2, labels_2 = ax2.get_legend_handles_labels() + ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper left") + + if(gpuEnabled): + yCoord = round(max([max(gpuO_Gflops), max(gpuA_Gflops), max(gpuU_Gflops)]) ,1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + + # Set X ticks + NUM_TICK = 8 + numXVals = len(xVals) + if numXVals < NUM_TICK: + # Print all labels + plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20) + else: + # Calculate labels + locInterval = int((numXVals) / (NUM_TICK-1)) + tickLocs = [0] + for q in range(1, (NUM_TICK-1)): + tickLocs.append(1 + (locInterval * q)) + tickLocs.append(numXVals - 1) + + labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1)) + tickLabs = [xVals[0]] + for q in range(1, (NUM_TICK-1)): + tickLabs.append(int(xVals[0]) + (labelInterval * q)) + tickLabs.append(int(xVals[-1])) + + plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20) + + # Force setting of y-axis labels. If this isn't done then the range is weird... 
+ yLoc, yLab = plt.yticks() + yLoc = yLoc.tolist() + # Remove negative first element of the list + if yLoc[0] != 0: + yLoc = yLoc[1:] + plt.ylim(0, yLoc[-1]) + plt.yticks(ticks=yLoc, fontsize=20) + + plt.margins(x=0.01, y=0.01) + leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) + for obj in leg.legend_handles: + obj.set_linewidth(3.0) + obj.set_markersize(15.0) + obj.set_markeredgewidth(3.0) + plt.xlabel(x_name, fontsize=20) + plt.ylabel(y_name, fontsize=20) + plt.title(title, fontsize=20) + plt.savefig(fname="{}/{}.pdf".format(graphDir, spmdnmFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") + plt.close('all') + print("\tPDF made") + + +print("Finished!") +# --------------------------------------------------------------------------------------- + +# ------------------------------ SpMSpM Graphs -------------------------------------------- +print("Creating SpMSpM graphs...") +# Create SpMSpM graphs +spmspmFilenames = [] +for i in range(0, len(filenames)): + if "spmspm_" in filenames[i]: + spmspmFilenames.append(filenames[i]) + +### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s +for i in range(0, len(spmspmFilenames)): + mnk = [] + iters = 0 + kernel = "" + cpu_Gflops = [] + gpuO_Gflops = [] + gpuA_Gflops = [] + gpuU_Gflops = [] + prob_size = [] + + # Open file and get all lines + fName = os.path.join(os.getcwd(), directory, spmspmFilenames[i]) + openFile = open(fName, 'r') + lines = openFile.readlines() + lines.pop(0) # Remove headers + if len(lines) == 0 : + continue + + # Get number of iterations performed and kernel name + line1 = lines[0].split(',') + sparsity = float(line1[6]) + iters = int(line1[7]) + kernel = line1[1] + + # Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types + for line in lines: + line = line.split(',') + # Get MNK + if (len(mnk) == 0) or ([line[2], line[3], line[4]] not in mnk): + mnk.append([line[2], line[3], line[4]]) + # Get Gflops + gflops = float(line[-1].rstrip()) + size = float(line[5].rstrip()) + if line[0] == "cpu": + cpu_Gflops.append(gflops) + prob_size.append(size) + elif line[0] == "gpu_offloadOnce": + gpuO_Gflops.append(gflops) + elif line[0] == "gpu_offloadAlways": + gpuA_Gflops.append(gflops) + elif line[0] == "gpu_unified": + gpuU_Gflops.append(gflops) + + # Create x-axis label and tick values + inputTypeStr = "" + x_name = "" + xVals = [] + if "_square_square_M=N=K" in spmspmFilenames[i]: + x_name = "Value of M, N, K" + inputTypeStr = "Square x Square (M=N=K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_tall-thin_short-wide_M=N_M=16K" in spmspmFilenames[i]: + x_name = "Value of K where M=16K and N=16K" + inputTypeStr = "Tall-Thin x Short-Wide (M=N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_short-wide_M=N_K=32" in spmspmFilenames[i]: + x_name = "Value of M and N, where K=32" + inputTypeStr = "Tall-Thin x Short-Wide (M=N, K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N_K=16M" in spmspmFilenames[i]: + x_name = "Value of M and N, where K=16M" + inputTypeStr = "Short-Wide x Tall-Thin (M=N, K=16M)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N=32_K" in spmspmFilenames[i]: + x_name = "Value of K, where M=32 and N=32" + inputTypeStr = "Short-Wide x Tall-Thin (M=N=32, K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N_M=16K" in spmspmFilenames[i]: + 
x_name = "Value of N and K, where M=16K" + inputTypeStr = "Tall-Thin x Square (N=K, M=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N=32_M" in spmspmFilenames[i]: + x_name = "Value of M, where N=32 and K=32" + inputTypeStr = "Tall-Thin x Square (M, N=K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_square_short-wide_M=K_N=16K" in spmspmFilenames[i]: + x_name = "Value of M and K, where N=16K" + inputTypeStr = "Square x Short-Wide (M=K, N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_square_short-wide_M=K=32_N" in spmspmFilenames[i]: + x_name = "Value of N, where M=32 and K=32" + inputTypeStr = "Square x Short-Wide (M=K=32, N)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][1]) + else: + # File not supported so go to next file + continue + + # Create y-axis label & graph title + y_name = "" + title = "" + fp = "" + if kernel == "sspmspm" : + fp = "FP32" + elif kernel == "dspmspm": + fp = "FP64" + y_name = "{} GFLOP/s".format(fp) + title = ("{}SpMSpM Performance for {} Problems (sparsity = {})- {} " + "iterations per problemize").format(kernel[0].upper(), + inputTypeStr, sparsity, iters) + + # Make Graph + fig1 = plt.figure(figsize=(28,16)) + ax1 = fig1.add_subplot() + + gpuEnabled = False + if len(cpu_Gflops) > 0: + ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU") + # Plot line at max GFLOP/s + yCoord = round(max(cpu_Gflops),1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + if len(gpuO_Gflops) > 0: + ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)") + gpuEnabled = True + if len(gpuA_Gflops) > 0: + ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)") + gpuEnabled = True + if len(gpuU_Gflops) > 0: + ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)") + gpuEnabled = True + if len(prob_size) > 0: + ax2 = ax1.twinx() + ax2.plot(xVals, prob_size, color="red", linestyle="--", marker="s", label="Problem Size (KiB)") + ax2.set_ylabel("Problem Size (KiB)", color="red", fontsize=14) + ax2.tick_params(axis='y', labelcolor="red") + ax2.set_ylim(min(prob_size) * 0.9, max(prob_size) * 1.1) + lines_1, labels_1 = ax1.get_legend_handles_labels() + lines_2, labels_2 = ax2.get_legend_handles_labels() + ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper left") + + if(gpuEnabled): + yCoord = round(max([max(gpuO_Gflops), max(gpuA_Gflops), max(gpuU_Gflops)]) ,1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + + # Set X ticks + NUM_TICK = 8 + numXVals = len(xVals) + if numXVals < NUM_TICK: + # Print all labels + plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20) + else: + # Calculate labels + locInterval = int((numXVals) / (NUM_TICK-1)) + tickLocs = [0] + for q in range(1, (NUM_TICK-1)): + tickLocs.append(1 + (locInterval * q)) + tickLocs.append(numXVals - 1) + + labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1)) + tickLabs = [xVals[0]] + for q in range(1, (NUM_TICK-1)): + tickLabs.append(int(xVals[0]) + (labelInterval * q)) + tickLabs.append(int(xVals[-1])) + + plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20) + + # Force setting of y-axis labels. If this isn't done then the range is weird... 
+ yLoc, yLab = plt.yticks() + yLoc = yLoc.tolist() + # Remove negative first element of the list + if yLoc[0] != 0: + yLoc = yLoc[1:] + plt.ylim(0, yLoc[-1]) + plt.yticks(ticks=yLoc, fontsize=20) + + plt.margins(x=0.01, y=0.01) + leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) + for obj in leg.legend_handles: + obj.set_linewidth(3.0) + obj.set_markersize(15.0) + obj.set_markeredgewidth(3.0) + plt.xlabel(x_name, fontsize=20) + plt.ylabel(y_name, fontsize=20) + plt.title(title, fontsize=20) + plt.savefig(fname="{}/{}.pdf".format(graphDir, spmspmFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") + plt.close('all') + print("\tPDF made") + + +print("Finished!") +# --------------------------------------------------------------------------------------- diff --git a/createGflopsGraphs.py b/createGflopsGraphs.py index 0ed7772..8108812 100644 --- a/createGflopsGraphs.py +++ b/createGflopsGraphs.py @@ -26,12 +26,346 @@ if(not os.path.isdir(graphDir)): os.mkdir(graphDir) +# ------------------------------ GEMV Graphs -------------------------------------------- +print("Creating GEMV graphs...") +# Create GEMV graphs +gemvFilenames = [] +for i in range(0, len(filenames)): + if "gemv_" in filenames[i] and "spgemv_" not in filenames[i]: + gemvFilenames.append(filenames[i]) + +### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s +for i in range(0, len(gemvFilenames)): + mn = [] + iters = 0 + kernel = "" + cpu_Gflops = [] + gpuO_Gflops = [] + gpuA_Gflops = [] + gpuU_Gflops = [] + + # Open file and get all lines + fName = os.path.join(os.getcwd(), directory, gemvFilenames[i]) + openFile = open(fName, 'r') + lines = openFile.readlines() + lines.pop(0) # Remove headers + if len(lines) == 0 : + continue + + # Get number of iterations performed and kernel name + line1 = lines[0].split(',') + iters = int(line1[7]) + kernel = line1[1] + + # Get gflops (y-axis) and MN values (x-axis) for CPU and all GPU types + for line in lines: + line = line.split(',') + # Get MN + if (len(mn) == 0) or ([line[2], line[3]] not in mn): + mn.append([line[2], line[3]]) + # Get Gflops + gflops = float(line[-1].rstrip()) + if line[0] == "cpu": + cpu_Gflops.append(gflops) + elif line[0] == "gpu_offloadOnce": + gpuO_Gflops.append(gflops) + elif line[0] == "gpu_offloadAlways": + gpuA_Gflops.append(gflops) + elif line[0] == "gpu_unified": + gpuU_Gflops.append(gflops) + + + # Create x-axis label and tick values + inputTypeStr = "" + x_name = "" + xVals = [] + if "_square_vector_M=N" in gemvFilenames[i]: + x_name = "Value of M, N" + inputTypeStr = "Square x Vector (M=N)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_tall-thin_vector_M=16N" in gemvFilenames[i]: + x_name = "Value of N where M=16N" + inputTypeStr = "Tall-Thin x Vector (M=16N)" + for j in range(0, len(mn)): + xVals.append(mn[j][1]) + elif "_tall-thin_vector_M_N=32" in gemvFilenames[i]: + x_name = "Value of M, where N=32" + inputTypeStr = "Tall-Thin x Vector (M, N=32)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_short-wide_vector_N=16M" in gemvFilenames[i]: + x_name = "Value of M, where N=16M" + inputTypeStr = "Short-Wide x Vector (N=16M)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_short-wide_vector_M=32_N" in gemvFilenames[i]: + x_name = "Value of N, where M=32" + inputTypeStr = "Short-Wide x Vector (M=32, N)" + for j in range(0, len(mn)): + xVals.append(mn[j][1]) + else: + # File not supported so go to next file + continue + + # 
Create y-axis label & graph title + y_name = "" + title = "" + fp = "" + if kernel == "sgemv" : + fp = "FP32" + elif kernel == "dgemv": + fp = "FP64" + y_name = "{} GFLOP/s".format(fp) + title = "{}GEMV Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters) + + # Make Graph + fig1 = plt.figure(figsize=(28,16)) + ax1 = fig1.add_subplot() + + gpuEnabled = False + if len(cpu_Gflops) > 0: + ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU") + # Plot line at max GFLOP/s + yCoord = round(max(cpu_Gflops),1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + if len(gpuO_Gflops) > 0: + ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)") + gpuEnabled = True + if len(gpuA_Gflops) > 0: + ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)") + gpuEnabled = True + if len(gpuU_Gflops) > 0: + ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)") + gpuEnabled = True + + if(gpuEnabled): + yCoord = round(max([max(gpuO_Gflops), max(gpuA_Gflops), max(gpuU_Gflops)]) ,1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + + # Set X ticks + NUM_TICK = 8 + numXVals = len(xVals) + if numXVals < NUM_TICK: + # Print all labels + plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20) + else: + # Calculate labels + locInterval = int((numXVals) / (NUM_TICK-1)) + tickLocs = [0] + for q in range(1, (NUM_TICK-1)): + tickLocs.append(1 + (locInterval * q)) + tickLocs.append(numXVals - 1) + + labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1)) + tickLabs = [xVals[0]] + for q in range(1, (NUM_TICK-1)): + tickLabs.append(int(xVals[0]) + (labelInterval * q)) + tickLabs.append(int(xVals[-1])) + + plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20) + + # Force setting of y-axis labels. If this isn't done then the range is weird... 
+ yLoc, yLab = plt.yticks() + yLoc = yLoc.tolist() + # Remove negative first element of the list + if yLoc[0] != 0: + yLoc = yLoc[1:] + plt.ylim(0, yLoc[-1]) + plt.yticks(ticks=yLoc, fontsize=20) + + plt.margins(x=0.01, y=0.01) + leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) + for obj in leg.legend_handles: + obj.set_linewidth(3.0) + obj.set_markersize(15.0) + obj.set_markeredgewidth(3.0) + plt.xlabel(x_name, fontsize=20) + plt.ylabel(y_name, fontsize=20) + plt.title(title, fontsize=20) + plt.savefig(fname="{}/{}.pdf".format(graphDir, gemvFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") + plt.close('all') + print("\tPDF made") + + +print("Finished!") +# --------------------------------------------------------------------------------------- + +# ------------------------------ SpMDnV Graphs -------------------------------------------- +print("Creating SpMDnV graphs...") +# Create GEMV graphs +spmdnvFilenames = [] +for i in range(0, len(filenames)): + if "spmdnv_" in filenames[i]: + spmdnvFilenames.append(filenames[i]) + +### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s +for i in range(0, len(spmdnvFilenames)): + mn = [] + iters = 0 + kernel = "" + cpu_Gflops = [] + gpuO_Gflops = [] + gpuA_Gflops = [] + gpuU_Gflops = [] + + # Open file and get all lines + fName = os.path.join(os.getcwd(), directory, spmdnvFilenames[i]) + openFile = open(fName, 'r') + lines = openFile.readlines() + lines.pop(0) # Remove headers + if len(lines) == 0 : + continue + + # Get number of iterations performed and kernel name + line1 = lines[0].split(',') + iters = int(line1[7]) + kernel = line1[1] + + # Get gflops (y-axis) and MN values (x-axis) for CPU and all GPU types + for line in lines: + line = line.split(',') + # Get MN + if (len(mn) == 0) or ([line[2], line[3]] not in mn): + mn.append([line[2], line[3]]) # line[2] = M, line[3] = N + # Get Gflops + gflops = float(line[-1].rstrip()) + if line[0] == "cpu": + cpu_Gflops.append(gflops) + elif line[0] == "gpu_offloadOnce": + gpuO_Gflops.append(gflops) + elif line[0] == "gpu_offloadAlways": + gpuA_Gflops.append(gflops) + elif line[0] == "gpu_unified": + gpuU_Gflops.append(gflops) + + + # Create x-axis label and tick values + inputTypeStr = "" + x_name = "" + xVals = [] + if "_square_vector_M=N" in spmdnvFilenames[i]: + x_name = "Value of M, N" + inputTypeStr = "Square x Vector (M=N)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_tall-thin_vector_M=16N" in spmdnvFilenames[i]: + x_name = "Value of N where M=16N" + inputTypeStr = "Tall-Thin x Vector (M=16N)" + for j in range(0, len(mn)): + xVals.append(mn[j][1]) + elif "_tall-thin_vector_M_N=32" in spmdnvFilenames[i]: + x_name = "Value of M, where N=32" + inputTypeStr = "Tall-Thin x Vector (M, N=32)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_short-wide_vector_N=16M" in spmdnvFilenames[i]: + x_name = "Value of M, where N=16M" + inputTypeStr = "Short-Wide x Vector (N=16M)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_short-wide_vector_M=32_N" in spmdnvFilenames[i]: + x_name = "Value of N, where M=32" + inputTypeStr = "Short-Wide x Vector (M=32, N)" + for j in range(0, len(mn)): + xVals.append(mn[j][1]) + else: + # File not supported so go to next file + continue + + # Create y-axis label & graph title + y_name = "" + title = "" + fp = "" + if kernel == "sspmdnv" : + fp = "FP32" + elif kernel == "dspmdnv": + fp = "FP64" + y_name = "{} GFLOP/s".format(fp) + title = 
"{}SpMDnV Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters) + + # Make Graph + fig1 = plt.figure(figsize=(28,16)) + ax1 = fig1.add_subplot() + + gpuEnabled = False + if len(cpu_Gflops) > 0: + ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU") + # Plot line at max GFLOP/s + yCoord = round(max(cpu_Gflops),1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + if len(gpuO_Gflops) > 0: + ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)") + gpuEnabled = True + if len(gpuA_Gflops) > 0: + ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)") + gpuEnabled = True + if len(gpuU_Gflops) > 0: + ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)") + gpuEnabled = True + + if(gpuEnabled): + yCoord = round(max([max(gpuO_Gflops), max(gpuA_Gflops), max(gpuU_Gflops)]) ,1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + + # Set X ticks + NUM_TICK = 8 + numXVals = len(xVals) + if numXVals < NUM_TICK: + # Print all labels + plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20) + else: + # Calculate labels + locInterval = int((numXVals) / (NUM_TICK-1)) + tickLocs = [0] + for q in range(1, (NUM_TICK-1)): + tickLocs.append(1 + (locInterval * q)) + tickLocs.append(numXVals - 1) + + labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1)) + tickLabs = [xVals[0]] + for q in range(1, (NUM_TICK-1)): + tickLabs.append(int(xVals[0]) + (labelInterval * q)) + tickLabs.append(int(xVals[-1])) + + plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20) + + # Force setting of y-axis labels. If this isn't done then the range is weird... 
+ yLoc, yLab = plt.yticks() + yLoc = yLoc.tolist() + # Remove negative first element of the list + if yLoc[0] != 0: + yLoc = yLoc[1:] + plt.ylim(0, yLoc[-1]) + plt.yticks(ticks=yLoc, fontsize=20) + + plt.margins(x=0.01, y=0.01) + leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) + for obj in leg.legend_handles: + obj.set_linewidth(3.0) + obj.set_markersize(15.0) + obj.set_markeredgewidth(3.0) + plt.xlabel(x_name, fontsize=20) + plt.ylabel(y_name, fontsize=20) + plt.title(title, fontsize=20) + plt.savefig(fname="{}/{}.pdf".format(graphDir, spmdnvFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") + plt.close('all') + print("\tPDF made") + + +print("Finished!") +# --------------------------------------------------------------------------------------- + # ------------------------------ GEMM Graphs -------------------------------------------- print("Creating GEMM graphs...") # Create GEMM graphs gemmFilenames = [] for i in range(0, len(filenames)): - if "gemm_" in filenames[i]: + if "gemm_" in filenames[i] and "spgemm_" not in filenames[i]: gemmFilenames.append(filenames[i]) ### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s @@ -54,7 +388,8 @@ # Get number of iterations performed and kernel name line1 = lines[0].split(',') - iters = int(line1[6]) + sparsity = float(line1[6]) + iters = int(line1[7]) kernel = line1[1] # Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types @@ -127,8 +462,6 @@ # File not supported so go to next file continue - - # Create y-axis label & graph title y_name = "" title = "" @@ -138,7 +471,9 @@ elif kernel == "dgemm": fp = "FP64" y_name = "{} GFLOP/s".format(fp) - title = "{}GEMM Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters) + title = ("{}GEMM Performance for {} Problems (sparsity = {})- {} " + "iterations per problemize").format(kernel[0].upper(), + inputTypeStr, sparsity, iters) # Make Graph fig1 = plt.figure(figsize=(28,16)) @@ -199,31 +534,32 @@ plt.margins(x=0.01, y=0.01) leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) - for obj in leg.legendHandles: + for obj in leg.legend_handles: obj.set_linewidth(3.0) obj.set_markersize(15.0) obj.set_markeredgewidth(3.0) plt.xlabel(x_name, fontsize=20) plt.ylabel(y_name, fontsize=20) plt.title(title, fontsize=20) - plt.savefig(fname="{}/{}.png".format(graphDir, gemmFilenames[i][:-4]), format="png", dpi=100, bbox_inches="tight") + plt.savefig(fname="{}/{}.pdf".format(graphDir, gemmFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") plt.close('all') + print("\tPDF made") print("Finished!") # --------------------------------------------------------------------------------------- -# ------------------------------ GEMV Graphs -------------------------------------------- -print("Creating GEMV graphs...") -# Create GEMV graphs -gemvFilenames = [] +# ------------------------------ SpGEMM Graphs -------------------------------------------- +print("Creating SpMDnM graphs...") +# Create SpMDnM graphs +spmdnmFilenames = [] for i in range(0, len(filenames)): - if "gemv_" in filenames[i]: - gemvFilenames.append(filenames[i]) + if "spmdnm_" in filenames[i]: + spmdnmFilenames.append(filenames[i]) ### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s -for i in range(0, len(gemvFilenames)): - mn = [] +for i in range(0, len(spmdnmFilenames)): + mnk = [] iters = 0 kernel = "" cpu_Gflops = [] 
@@ -232,7 +568,7 @@ gpuU_Gflops = [] # Open file and get all lines - fName = os.path.join(os.getcwd(), directory, gemvFilenames[i]) + fName = os.path.join(os.getcwd(), directory, spmdnmFilenames[i]) openFile = open(fName, 'r') lines = openFile.readlines() lines.pop(0) # Remove headers @@ -241,15 +577,16 @@ # Get number of iterations performed and kernel name line1 = lines[0].split(',') - iters = int(line1[6]) + sparsity = float(line1[6]) + iters = int(line1[7]) kernel = line1[1] - # Get gflops (y-axis) and MN values (x-axis) for CPU and all GPU types + # Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types for line in lines: line = line.split(',') - # Get MN - if (len(mn) == 0) or ([line[2], line[3]] not in mn): - mn.append([line[2], line[3]]) + # Get MNK + if (len(mnk) == 0) or ([line[2], line[3], line[4]] not in mnk): + mnk.append([line[2], line[3], line[4]]) # Get Gflops gflops = float(line[-1].rstrip()) if line[0] == "cpu": @@ -261,52 +598,260 @@ elif line[0] == "gpu_unified": gpuU_Gflops.append(gflops) - # Create x-axis label and tick values inputTypeStr = "" x_name = "" xVals = [] - if "_square_vector_M=N" in gemvFilenames[i]: - x_name = "Value of M, N" - inputTypeStr = "Square x Vector (M=N)" - for j in range(0, len(mn)): - xVals.append(mn[j][0]) - elif "_tall-thin_vector_M=16N" in gemvFilenames[i]: - x_name = "Value of N where M=16N" - inputTypeStr = "Tall-Thin x Vector (M=16N)" - for j in range(0, len(mn)): - xVals.append(mn[j][1]) - elif "_tall-thin_vector_M_N=32" in gemvFilenames[i]: - x_name = "Value of M, where N=32" - inputTypeStr = "Tall-Thin x Vector (M, N=32)" - for j in range(0, len(mn)): - xVals.append(mn[j][0]) - elif "_short-wide_vector_N=16M" in gemvFilenames[i]: - x_name = "Value of M, where N=16M" - inputTypeStr = "Short-Wide x Vector (N=16M)" - for j in range(0, len(mn)): - xVals.append(mn[j][0]) - elif "_short-wide_vector_M=32_N" in gemvFilenames[i]: - x_name = "Value of N, where M=32" - inputTypeStr = "Short-Wide x Vector (M=32, N)" - for j in range(0, len(mn)): - xVals.append(mn[j][1]) + if "_square_square_M=N=K" in spmdnmFilenames[i]: + x_name = "Value of M, N, K" + inputTypeStr = "Square x Square (M=N=K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_tall-thin_short-wide_M=N_M=16K" in spmdnmFilenames[i]: + x_name = "Value of K where M=16K and N=16K" + inputTypeStr = "Tall-Thin x Short-Wide (M=N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_short-wide_M=N_K=32" in spmdnmFilenames[i]: + x_name = "Value of M and N, where K=32" + inputTypeStr = "Tall-Thin x Short-Wide (M=N, K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N_K=16M" in spmdnmFilenames[i]: + x_name = "Value of M and N, where K=16M" + inputTypeStr = "Short-Wide x Tall-Thin (M=N, K=16M)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N=32_K" in spmdnmFilenames[i]: + x_name = "Value of K, where M=32 and N=32" + inputTypeStr = "Short-Wide x Tall-Thin (M=N=32, K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N_M=16K" in spmdnmFilenames[i]: + x_name = "Value of N and K, where M=16K" + inputTypeStr = "Tall-Thin x Square (N=K, M=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N=32_M" in spmdnmFilenames[i]: + x_name = "Value of M, where N=32 and K=32" + inputTypeStr = "Tall-Thin x Square (M, N=K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif 
"_square_short-wide_M=K_N=16K" in spmdnmFilenames[i]: + x_name = "Value of M and K, where N=16K" + inputTypeStr = "Square x Short-Wide (M=K, N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_square_short-wide_M=K=32_N" in spmdnmFilenames[i]: + x_name = "Value of N, where M=32 and K=32" + inputTypeStr = "Square x Short-Wide (M=K=32, N)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][1]) else: # File not supported so go to next file continue + # Create y-axis label & graph title + y_name = "" + title = "" + fp = "" + if kernel == "sspmdnm" : + fp = "FP32" + elif kernel == "dspmdnm": + fp = "FP64" + y_name = "{} GFLOP/s".format(fp) + title = ("{}SpMDnM Performance for {} Problems (sparsity = {})- {} " + "iterations per problemize").format(kernel[0].upper(), + inputTypeStr, sparsity, iters) + + # Make Graph + fig1 = plt.figure(figsize=(28,16)) + ax1 = fig1.add_subplot() + + gpuEnabled = False + if len(cpu_Gflops) > 0: + ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU") + # Plot line at max GFLOP/s + yCoord = round(max(cpu_Gflops),1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + if len(gpuO_Gflops) > 0: + ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)") + gpuEnabled = True + if len(gpuA_Gflops) > 0: + ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)") + gpuEnabled = True + if len(gpuU_Gflops) > 0: + ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)") + gpuEnabled = True + + if(gpuEnabled): + yCoord = round(max([max(gpuO_Gflops), max(gpuA_Gflops), max(gpuU_Gflops)]) ,1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + + # Set X ticks + NUM_TICK = 8 + numXVals = len(xVals) + if numXVals < NUM_TICK: + # Print all labels + plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20) + else: + # Calculate labels + locInterval = int((numXVals) / (NUM_TICK-1)) + tickLocs = [0] + for q in range(1, (NUM_TICK-1)): + tickLocs.append(1 + (locInterval * q)) + tickLocs.append(numXVals - 1) + + labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1)) + tickLabs = [xVals[0]] + for q in range(1, (NUM_TICK-1)): + tickLabs.append(int(xVals[0]) + (labelInterval * q)) + tickLabs.append(int(xVals[-1])) + + plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20) + + # Force setting of y-axis labels. If this isn't done then the range is weird... 
+ yLoc, yLab = plt.yticks() + yLoc = yLoc.tolist() + # Remove negative first element of the list + if yLoc[0] != 0: + yLoc = yLoc[1:] + plt.ylim(0, yLoc[-1]) + plt.yticks(ticks=yLoc, fontsize=20) + + plt.margins(x=0.01, y=0.01) + leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) + for obj in leg.legend_handles: + obj.set_linewidth(3.0) + obj.set_markersize(15.0) + obj.set_markeredgewidth(3.0) + plt.xlabel(x_name, fontsize=20) + plt.ylabel(y_name, fontsize=20) + plt.title(title, fontsize=20) + plt.savefig(fname="{}/{}.pdf".format(graphDir, spmdnmFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") + plt.close('all') + print("\tPDF made") + + +print("Finished!") +# --------------------------------------------------------------------------------------- + +# ------------------------------ SpMSpM Graphs -------------------------------------------- +print("Creating SpMSpM graphs...") +# Create SpMSpM graphs +spmspmFilenames = [] +for i in range(0, len(filenames)): + if "spmspm_" in filenames[i]: + spmspmFilenames.append(filenames[i]) + +### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s +for i in range(0, len(spmspmFilenames)): + mnk = [] + iters = 0 + kernel = "" + cpu_Gflops = [] + gpuO_Gflops = [] + gpuA_Gflops = [] + gpuU_Gflops = [] + + # Open file and get all lines + fName = os.path.join(os.getcwd(), directory, spmspmFilenames[i]) + openFile = open(fName, 'r') + lines = openFile.readlines() + lines.pop(0) # Remove headers + if len(lines) == 0 : + continue + + # Get number of iterations performed and kernel name + line1 = lines[0].split(',') + sparsity = float(line1[6]) + iters = int(line1[7]) + kernel = line1[1] + + # Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types + for line in lines: + line = line.split(',') + # Get MNK + if (len(mnk) == 0) or ([line[2], line[3], line[4]] not in mnk): + mnk.append([line[2], line[3], line[4]]) + # Get Gflops + gflops = float(line[-1].rstrip()) + if line[0] == "cpu": + cpu_Gflops.append(gflops) + elif line[0] == "gpu_offloadOnce": + gpuO_Gflops.append(gflops) + elif line[0] == "gpu_offloadAlways": + gpuA_Gflops.append(gflops) + elif line[0] == "gpu_unified": + gpuU_Gflops.append(gflops) + # Create x-axis label and tick values + inputTypeStr = "" + x_name = "" + xVals = [] + if "_square_square_M=N=K" in spmspmFilenames[i]: + x_name = "Value of M, N, K" + inputTypeStr = "Square x Square (M=N=K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_tall-thin_short-wide_M=N_M=16K" in spmspmFilenames[i]: + x_name = "Value of K where M=16K and N=16K" + inputTypeStr = "Tall-Thin x Short-Wide (M=N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_short-wide_M=N_K=32" in spmspmFilenames[i]: + x_name = "Value of M and N, where K=32" + inputTypeStr = "Tall-Thin x Short-Wide (M=N, K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N_K=16M" in spmspmFilenames[i]: + x_name = "Value of M and N, where K=16M" + inputTypeStr = "Short-Wide x Tall-Thin (M=N, K=16M)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N=32_K" in spmspmFilenames[i]: + x_name = "Value of K, where M=32 and N=32" + inputTypeStr = "Short-Wide x Tall-Thin (M=N=32, K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N_M=16K" in spmspmFilenames[i]: + x_name = "Value of N and K, where M=16K" + inputTypeStr = "Tall-Thin x Square 
(N=K, M=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N=32_M" in spmspmFilenames[i]: + x_name = "Value of M, where N=32 and K=32" + inputTypeStr = "Tall-Thin x Square (M, N=K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_square_short-wide_M=K_N=16K" in spmspmFilenames[i]: + x_name = "Value of M and K, where N=16K" + inputTypeStr = "Square x Short-Wide (M=K, N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_square_short-wide_M=K=32_N" in spmspmFilenames[i]: + x_name = "Value of N, where M=32 and K=32" + inputTypeStr = "Square x Short-Wide (M=K=32, N)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][1]) + else: + # File not supported so go to next file + continue # Create y-axis label & graph title y_name = "" title = "" fp = "" - if kernel == "sgemv" : + if kernel == "sspmspm" : fp = "FP32" - elif kernel == "dgemv": + elif kernel == "dspmspm": fp = "FP64" y_name = "{} GFLOP/s".format(fp) - title = "{}GEMV Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters) + title = ("{}SpMSpM Performance for {} Problems (sparsity = {})- {} " + "iterations per problemize").format(kernel[0].upper(), + inputTypeStr, sparsity, iters) # Make Graph fig1 = plt.figure(figsize=(28,16)) @@ -367,16 +912,17 @@ plt.margins(x=0.01, y=0.01) leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) - for obj in leg.legendHandles: + for obj in leg.legend_handles: obj.set_linewidth(3.0) obj.set_markersize(15.0) obj.set_markeredgewidth(3.0) plt.xlabel(x_name, fontsize=20) plt.ylabel(y_name, fontsize=20) plt.title(title, fontsize=20) - plt.savefig(fname="{}/{}.png".format(graphDir, gemvFilenames[i][:-4]), format="png", dpi=100, bbox_inches="tight") + plt.savefig(fname="{}/{}.pdf".format(graphDir, spmspmFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") plt.close('all') + print("\tPDF made") print("Finished!") -# --------------------------------------------------------------------------------------- \ No newline at end of file +# --------------------------------------------------------------------------------------- diff --git a/cuBLAS/common.hh b/cuBLAS/common.hh index 78d0270..af222fb 100644 --- a/cuBLAS/common.hh +++ b/cuBLAS/common.hh @@ -2,24 +2,45 @@ #if defined GPU_CUBLAS +#include +#include +#include + +/** Macro function to check if error occurred when calling cuBLAS. */ /** Macro function to check if error occurred when calling CUDA. */ -#define cudaCheckError(f) \ - do { \ - if (cudaError_t e = (f); e != cudaSuccess) { \ - std::cout << "CUDA error: " << __FILE__ << ":" << __LINE__ << ": " \ - << cudaGetErrorString(e) << std::endl; \ - exit(1); \ - } \ +#define cudaCheckError(f) \ + do { \ + if (cudaError_t e = (f); e != cudaSuccess) { \ + std::cout << "CUDA error: " << __FILE__ << ":" << __LINE__ << ": "; \ + std::cout << cudaGetErrorString(e) << std::endl; \ + exit(1); \ + } \ } while (false) /** Macro function to check if error occurred when calling cuBLAS. 
*/ -#define cublasCheckError(f) \ - do { \ - if (cublasStatus_t e = (f); e != CUBLAS_STATUS_SUCCESS) { \ - std::cout << "CUBLAS error: " << __FILE__ << ":" << __LINE__ << ": " \ - << cublasGetStatusString(e) << std::endl; \ - exit(1); \ - } \ +#define cublasCheckError(f) \ + do { \ + cublasStatus_t status = (f); \ + if (status != CUBLAS_STATUS_SUCCESS) { \ + std::cout << "CUBLAS error: " << __FILE__ << ":" << __LINE__ << ": "; \ + std::cout << cublasGetStatusName(status) << " - "; \ + std::cout << cublasGetStatusString(status) << std::endl; \ + exit(1); \ + } \ } while (false) -#endif \ No newline at end of file +/** Macro function to check if error occurred when calling cuSPARSE. */ +#define cusparseCheckError(f) \ + do { \ + cusparseStatus_t status = (f); \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + std::cout << "CUSPARSE error: " << __FILE__ << ":" << __LINE__ << ": "; \ + std::cout << cusparseGetErrorName(status) << " - "; \ + std::cout << cusparseGetErrorString(status) << std::endl; \ + exit(1); \ + } \ + } while (false) \ + +#endif + + diff --git a/cuBLAS/spmdnm.hh b/cuBLAS/spmdnm.hh new file mode 100644 index 0000000..bb08d90 --- /dev/null +++ b/cuBLAS/spmdnm.hh @@ -0,0 +1,552 @@ +#pragma once + +#ifdef GPU_CUBLAS +#include +#include +#include +#include +#include + +#include "../include/kernels/GPU/spmdnm.hh" +#include "../include/utilities.hh" +#include "common.hh" + +namespace gpu { + /** + * A class for sparse matrix-dense matrix BLAS + */ +template +class spmdnm_gpu : public spmdnm { +public: + using spmdnm::spmdnm; + using spmdnm::initInputMatrices; + using spmdnm::m_; + using spmdnm::n_; + using spmdnm::k_; + using spmdnm::B_; + using spmdnm::C_; + using spmdnm::offload_; + using spmdnm::nnz_; + using spmdnm::sparsity_; + using spmdnm::type_; + + ~spmdnm_gpu() { + if (alreadyInitialised_) { + cusparseCheckError(cusparseDestroy(handle_)); + + cudaCheckError(cudaStreamDestroy(stream1_)); + cudaCheckError(cudaStreamDestroy(stream2_)); + cudaCheckError(cudaStreamDestroy(stream3_)); + cudaCheckError(cudaStreamDestroy(stream4_)); + cudaCheckError(cudaStreamDestroy(stream5_)); + + alreadyInitialised_ = false; + } + } + + void initialise(gpuOffloadType offload, int m, int n, int k, + double sparsity, matrixType type, bool binary = false) override { + if (!alreadyInitialised_) { + alreadyInitialised_ = true; + cusparseCheckError(cusparseCreate(&handle_)); + + cudaCheckError(cudaStreamCreate(&stream1_)); + cudaCheckError(cudaStreamCreate(&stream2_)); + cudaCheckError(cudaStreamCreate(&stream3_)); + cudaCheckError(cudaStreamCreate(&stream4_)); + cudaCheckError(cudaStreamCreate(&stream5_)); + + cusparseCheckError(cusparseSetStream(handle_, stream1_)); + + // Get device identifier + cudaCheckError(cudaGetDevice(&gpuDevice_)); + + } + offload_ = offload; + sparsity_ = sparsity; + type_ = type; + + m_ = m; + n_ = n; + k_ = k; + + B_ = C_ = B_dev_ = C_dev_ = A_vals_ = A_vals_dev_ = nullptr; + A_rows_ = A_cols_ = A_rows_dev_ = A_cols_dev_ = nullptr; + /** Determine the number of nnz elements in A and B */ + nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + + // Set up cuSPARSE metadata + opA_ = CUSPARSE_OPERATION_NON_TRANSPOSE; + opB_ = CUSPARSE_OPERATION_NON_TRANSPOSE; + alg_ = CUSPARSE_SPMM_ALG_DEFAULT; + index_ = CUSPARSE_INDEX_64I; + base_ = CUSPARSE_INDEX_BASE_ZERO; + B_order_ = CUSPARSE_ORDER_ROW; + C_order_ = CUSPARSE_ORDER_ROW; + if (std::is_same_v) { + dataType_ = CUDA_R_32F; + } else if (std::is_same_v) { + dataType_ = CUDA_R_64F; + } else { + std::cerr << 
"INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; + exit(1); + } + + if (offload_ == gpuOffloadType::unified) { + cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * k_ * n_)); + cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * m_ * n_)); + } else { + B_ = (T*)malloc(sizeof(T) * k_ * n_); + C_ = (T*)malloc(sizeof(T) * m_ * n_); + + cudaCheckError(cudaMalloc((void**)&B_dev_, sizeof(T) * k_ * n_)); + cudaCheckError(cudaMalloc((void**)&C_dev_, sizeof(T) * m_ * n_)); + } + cudaCheckError(cudaDeviceSynchronize()); + + initInputMatrices(); + } + +protected: + void toSparseFormat() override { + if (offload_ == gpuOffloadType::always) { + A_vals_store_ = (T*)malloc(sizeof(T) * nnz_); + A_cols_store_ = (int64_t*)malloc(sizeof(int64_t) * nnz_); + A_rows_store_ = (int64_t*)malloc(sizeof(int64_t) * (m_ + 1)); + + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, nnz_); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, nnz_); + } else { + exit(1); + } + } + + // Allocate CSR arrays + if (offload_ == gpuOffloadType::unified) { + cudaCheckError(cudaMallocManaged(&A_vals_, nnz_ * sizeof(T))); + cudaCheckError(cudaMallocManaged(&A_cols_, nnz_ * sizeof(int64_t))); + cudaCheckError(cudaMallocManaged(&A_rows_, (m_ + 1) * sizeof(int64_t))); + } else { + A_vals_ = (T*)malloc(nnz_ * sizeof(T)); + A_cols_ = (int64_t*)malloc(nnz_ * sizeof(int64_t)); + A_rows_ = (int64_t*)malloc((m_ + 1) * sizeof(int64_t)); + cudaCheckError(cudaMalloc((void**)&A_vals_dev_, nnz_ * sizeof(T))); + cudaCheckError(cudaMalloc((void**)&A_cols_dev_, nnz_ * sizeof(int64_t))); + cudaCheckError(cudaMalloc((void**)&A_rows_dev_, (m_ + 1) * sizeof(int64_t))); + } + cudaCheckError(cudaDeviceSynchronize()); + + memcpy(A_vals_, A_vals_store_, sizeof(T) * nnz_); + memcpy(A_cols_, A_cols_store_, sizeof(int64_t) * nnz_); + memcpy(A_rows_, A_rows_store_, sizeof(int64_t) * (m_ + 1)); + cudaCheckError(cudaDeviceSynchronize()); + } + +private: + void preLoopRequirements() override { + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + cudaCheckError(cudaMemcpyAsync(A_vals_dev_, A_vals_, nnz_ * sizeof(T), cudaMemcpyHostToDevice, stream1_)); + cudaCheckError(cudaMemcpyAsync(A_cols_dev_, A_cols_, nnz_ * sizeof(int64_t), cudaMemcpyHostToDevice, stream2_)); + cudaCheckError(cudaMemcpyAsync(A_rows_dev_, A_rows_, (m_ + 1) * sizeof(int64_t), cudaMemcpyHostToDevice, stream3_)); + cudaCheckError(cudaMemcpyAsync(B_dev_, B_, (k_ * n_) * sizeof(T), cudaMemcpyHostToDevice, stream4_)); + cudaCheckError(cudaMemcpyAsync(C_dev_, C_, (m_ * n_) * sizeof(T), cudaMemcpyHostToDevice, stream5_)); + break; + } + case gpuOffloadType::unified: { + cudaCheckError(cudaMemPrefetchAsync(A_vals_, nnz_ * sizeof(T), gpuDevice_, stream1_)); + cudaCheckError(cudaMemPrefetchAsync(A_cols_, nnz_ * sizeof(int64_t), gpuDevice_, stream2_)); + cudaCheckError(cudaMemPrefetchAsync(A_rows_, (m_ + 1) * sizeof(int64_t), gpuDevice_, stream3_)); + cudaCheckError(cudaMemPrefetchAsync(B_, (n_ * k_) * sizeof(T), gpuDevice_, stream4_)); + cudaCheckError(cudaMemPrefetchAsync(C_, (m_ * n_) * sizeof(T), gpuDevice_, stream5_)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } + } + } + + void callSpmdnm() override { + switch(offload_) { + case gpuOffloadType::always: { + // Move over data + 
cudaCheckError(cudaMemcpyAsync(A_vals_dev_, A_vals_, nnz_ * sizeof(T), cudaMemcpyHostToDevice, stream1_)); + cudaCheckError(cudaMemcpyAsync(A_cols_dev_, A_cols_, nnz_ * sizeof(int64_t), cudaMemcpyHostToDevice, stream2_)); + cudaCheckError(cudaMemcpyAsync(A_rows_dev_, A_rows_, (m_ + 1) * sizeof(int64_t), cudaMemcpyHostToDevice, stream3_)); + cudaCheckError(cudaMemcpyAsync(B_dev_, B_, (k_ * n_) * sizeof(T), cudaMemcpyHostToDevice, stream4_)); + cudaCheckError(cudaMemcpyAsync(C_dev_, C_, (m_ * n_) * sizeof(T), cudaMemcpyHostToDevice, stream5_)); + + // Set up descriptors + cusparseCheckError(cusparseCreateCsr(&A_descr_, + m_, + k_, + nnz_, + A_rows_dev_, + A_cols_dev_, + A_vals_dev_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateDnMat(&B_descr_, + k_, + n_, + n_, + B_dev_, + dataType_, + B_order_)); + cusparseCheckError(cusparseCreateDnMat(&C_descr_, + m_, + n_, + n_, + C_dev_, + dataType_, + C_order_)); + + // Set up temporary buffers + void* dBuffer = nullptr; + size_t bufferSize = 0; + + // Begin matrix-matrix multiplication + cusparseCheckError(cusparseSpMM_bufferSize(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + &bufferSize)); + + // Allocate the temporary buffer + cudaCheckError(cudaMalloc((void**)&dBuffer, bufferSize)); + + cusparseCheckError(cusparseSpMM_preprocess(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + dBuffer)); + + cusparseCheckError(cusparseSpMM(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + dBuffer)); + cudaCheckError(cudaDeviceSynchronize()); + + // Clean up descriptors + cusparseCheckError(cusparseDestroySpMat(A_descr_)); + cusparseCheckError(cusparseDestroyDnMat(B_descr_)); + cusparseCheckError(cusparseDestroyDnMat(C_descr_)); + + // Free up the temporary buffer + cudaCheckError(cudaFree(dBuffer)); + + // Move result back to CPU + cudaCheckError(cudaMemcpyAsync(C_, C_dev_, (sizeof(T) * m_ * n_), + cudaMemcpyDeviceToHost, stream1_)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } + case gpuOffloadType::once: { + // Set up descriptors + cusparseCheckError(cusparseCreateCsr(&A_descr_, + m_, + k_, + nnz_, + A_rows_dev_, + A_cols_dev_, + A_vals_dev_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateDnMat(&B_descr_, + k_, + n_, + n_, + B_dev_, + dataType_, + B_order_)); + cusparseCheckError(cusparseCreateDnMat(&C_descr_, + m_, + n_, + n_, + C_dev_, + dataType_, + C_order_)); + + size_t bufferSize = 0; + // Begin matrix-matrix multiplication + cusparseCheckError(cusparseSpMM_bufferSize(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + &bufferSize)); + + // Allocate the temporary buffer + void* dBuffer = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer, bufferSize)); + cusparseCheckError(cusparseSpMM_preprocess(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + dBuffer)); + + cusparseCheckError(cusparseSpMM(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + dBuffer)); + + // Clean up descriptors + cusparseCheckError(cusparseDestroySpMat(A_descr_)); + cusparseCheckError(cusparseDestroyDnMat(B_descr_)); + cusparseCheckError(cusparseDestroyDnMat(C_descr_)); + + // Free up the temporary buffer + cudaCheckError(cudaFree(dBuffer)); + } + case gpuOffloadType::unified: { + // Create 
descriptors for the matrices + cusparseCheckError(cusparseCreateCsr(&A_descr_, + m_, + k_, + nnz_, + A_rows_, + A_cols_, + A_vals_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateDnMat(&B_descr_, + k_, + n_, + n_, + B_, + dataType_, + B_order_)); + cusparseCheckError(cusparseCreateDnMat(&C_descr_, + m_, + n_, + n_, + C_, + dataType_, + C_order_)); + + // Set up temporary buffers + void* dBuffer = nullptr; + size_t bufferSize = 0; + + // Begin matrix-matrix multiplication + cusparseCheckError(cusparseSpMM_bufferSize(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + &bufferSize)); + + // Allocate the temporary buffer + cudaCheckError(cudaMalloc((void**)&dBuffer, bufferSize)); + + cusparseCheckError(cusparseSpMM_preprocess(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + dBuffer)); + cudaCheckError(cudaDeviceSynchronize()); + + cusparseCheckError(cusparseSpMM(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + dBuffer)); + cudaCheckError(cudaDeviceSynchronize()); + + // Clean up descriptors + cusparseCheckError(cusparseDestroySpMat(A_descr_)); + cusparseCheckError(cusparseDestroyDnMat(B_descr_)); + cusparseCheckError(cusparseDestroyDnMat(C_descr_)); + cudaCheckError(cudaDeviceSynchronize()); + + // Free up the temporary buffer + cudaCheckError(cudaFree(dBuffer)); + break; + } + } + } + + void postLoopRequirements() override { + switch (offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + // Move result back to CPU + cudaCheckError(cudaMemcpyAsync(C_, C_dev_, (sizeof(T) * m_ * n_), + cudaMemcpyDeviceToHost, stream1_)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + // Move result back to CPU + cudaCheckError(cudaMemPrefetchAsync(C_, sizeof(T) * m_ * n_, + cudaCpuDeviceId, stream1_)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } + } + } + + void postCallKernelCleanup() override { + if (offload_ == gpuOffloadType::unified) { + cudaCheckError(cudaFree(A_vals_)); + cudaCheckError(cudaFree(A_cols_)); + cudaCheckError(cudaFree(A_rows_)); + cudaCheckError(cudaFree(B_)); + cudaCheckError(cudaFree(C_)); + free(A_vals_store_); + free(A_cols_store_); + free(A_rows_store_); + } else { + free(A_vals_); + free(A_cols_); + free(A_rows_); + free(B_); + free(C_); + cudaCheckError(cudaFree(A_vals_dev_)); + cudaCheckError(cudaFree(A_cols_dev_)); + cudaCheckError(cudaFree(A_rows_dev_)); + cudaCheckError(cudaFree(B_dev_)); + cudaCheckError(cudaFree(C_dev_)); + } + } + + bool alreadyInitialised_ = false; + + /** Handle used when calling cuBLAS. */ + cusparseHandle_t handle_; + + /** CUDA Streams - used to asynchronously move data between host and device. */ + cudaStream_t stream1_; + cudaStream_t stream2_; + cudaStream_t stream3_; + cudaStream_t stream4_; + cudaStream_t stream5_; + + /** The ID of the target GPU Device. */ + int gpuDevice_; + + /** The constant value Alpha. */ + const T alpha = ALPHA; + + /** The constant value Beta. 
*/ + const T beta = BETA; + + // cuSPARSE parameters + cusparseOperation_t opA_; + cusparseOperation_t opB_; + cusparseSpMMAlg_t alg_; + cusparseIndexType_t index_; + cusparseIndexBase_t base_; + cudaDataType_t dataType_; + + /** + * ___________ Host data ______________ + */ + /** CSR format vectors for matrix A */ + cusparseSpMatDescr_t A_descr_; + T* A_vals_; + int64_t* A_cols_; + int64_t* A_rows_; + int64_t A_num_rows_; + int64_t A_num_cols_; + + /** dense format values for matrices B and C */ + cusparseDnMatDescr_t B_descr_; + int64_t B_num_rows_; + int64_t B_num_cols_; + int64_t B_leading_dim_; + cusparseOrder_t B_order_; + + cusparseDnMatDescr_t C_descr_; + int64_t C_num_rows_; + int64_t C_num_cols_; + int64_t C_leading_dim_; + cusparseOrder_t C_order_; + + /** + * _____________ Device data ________________ + */ + T* A_vals_dev_; + int64_t* A_cols_dev_; + int64_t* A_rows_dev_; + + T* B_dev_; + + T* C_dev_; + + T* A_vals_store_; + int64_t* A_cols_store_; + int64_t* A_rows_store_; +}; +}; + + +#endif diff --git a/cuBLAS/spmdnv.hh b/cuBLAS/spmdnv.hh new file mode 100644 index 0000000..4d0317e --- /dev/null +++ b/cuBLAS/spmdnv.hh @@ -0,0 +1,484 @@ +#pragma once + +#ifdef GPU_CUBLAS +#include +#include +#include +#include +#include + +#include "../include/kernels/GPU/spmdnv.hh" +#include "../include/utilities.hh" +#include "common.hh" + +namespace gpu { +/** A class for SpMDnV GPU BLAS kernels. */ +template +class spmdnv_gpu : public spmdnv { + public: + using spmdnv::spmdnv; + using spmdnv::initInputMatrixVector; + using spmdnv::nnz_; + using spmdnv::m_; + using spmdnv::n_; + using spmdnv::x_; + using spmdnv::y_; + using spmdnv::offload_; + using spmdnv::sparsity_; + using spmdnv::type_; + + ~spmdnv_gpu() { + if (initialised_) { + cusparseCheckError(cusparseDestroy(handle_)); + + cudaCheckError(cudaStreamDestroy(s1_)); + cudaCheckError(cudaStreamDestroy(s2_)); + cudaCheckError(cudaStreamDestroy(s3_)); + cudaCheckError(cudaStreamDestroy(s4_)); + cudaCheckError(cudaStreamDestroy(s5_)); + + initialised_ = false; + } + } + + void initialise(gpuOffloadType offload, int m, int n, + double sparsity, matrixType type) override { + if (!initialised_) { + initialised_ = true; + cusparseCheckError(cusparseCreate(&handle_)); + + cudaCheckError(cudaStreamCreate(&s1_)); + cudaCheckError(cudaStreamCreate(&s2_)); + cudaCheckError(cudaStreamCreate(&s3_)); + cudaCheckError(cudaStreamCreate(&s4_)); + cudaCheckError(cudaStreamCreate(&s5_)); + + cusparseCheckError(cusparseSetStream(handle_, s1_)); + + // Get device identifier + cudaCheckError(cudaGetDevice(&gpuDevice_)); + } + + offload_ = offload; + sparsity_ = sparsity; + type_ = type; + + + // Setting cusparse metadata + if (std::is_same_v) { + dataType_ = CUDA_R_32F; + } else if (std::is_same_v) { + dataType_ = CUDA_R_64F; + } else { + std::cerr << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; + exit(1); + } + opA_ = opB_ = CUSPARSE_OPERATION_NON_TRANSPOSE; + alg_ = CUSPARSE_SPMV_ALG_DEFAULT; + index_ = CUSPARSE_INDEX_64I; + base_ = CUSPARSE_INDEX_BASE_ZERO; + + m_ = m; + n_ = n; + nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_)); + + // Allocate dense data structures + if (offload_ == gpuOffloadType::unified) { + cudaCheckError(cudaMallocManaged(&x_, n_ * sizeof(T))); + cudaCheckError(cudaMallocManaged(&y_, m_ * sizeof(T))); + cudaCheckError(cudaDeviceSynchronize()); + } else { + x_ = (T*)malloc(n_ * sizeof(T)); + y_ = (T*)malloc(m_ * sizeof(T)); + + cudaCheckError(cudaMalloc((void**)&x_dev_, n_ * sizeof(T))); + 
cudaCheckError(cudaMalloc((void**)&y_dev_, m_ * sizeof(T))); + cudaCheckError(cudaDeviceSynchronize()); + } + + initInputMatrixVector(); + } + +protected: + + void toSparseFormat() override { + if (offload_ == gpuOffloadType::always) { + A_vals_store_ = (T*)malloc(sizeof(T) * nnz_); + A_cols_store_ = (int64_t*)malloc(sizeof(int64_t) * nnz_); + A_rows_store_ = (int64_t*)malloc(sizeof(int64_t) * (m_ + 1)); + + if (type_ == matrixType::random) { + randomCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, n_, nnz_); + } else if (type_ == matrixType::rmat) { + rMatCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, n_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, n_, nnz_); + } else { + std::cerr << "Matrix type not supported" << std::endl; + exit(1); + } + } + + + if (offload_ == gpuOffloadType::unified) { + cudaCheckError(cudaMallocManaged(&A_vals_, nnz_ * sizeof(T))); + cudaCheckError(cudaMallocManaged(&A_cols_, nnz_ * sizeof(int64_t))); + cudaCheckError(cudaMallocManaged(&A_rows_, (m_ + 1) * sizeof(int64_t))); + } else { + A_vals_ = (T*)malloc(nnz_ * sizeof(T)); + A_cols_ = (int64_t*)malloc(nnz_ * sizeof(int64_t)); + A_rows_ = (int64_t*)malloc((m_ + 1) * sizeof(int64_t)); + cudaCheckError(cudaMalloc((void**)&A_vals_dev_, nnz_ * sizeof(T))); + cudaCheckError(cudaMalloc((void**)&A_cols_dev_, nnz_ * sizeof(int64_t))); + cudaCheckError(cudaMalloc((void**)&A_rows_dev_, (m_ + 1) * sizeof(int64_t))); + } + cudaCheckError(cudaDeviceSynchronize()); + + memcpy(A_vals_, A_vals_store_, sizeof(T) * nnz_); + memcpy(A_cols_, A_cols_store_, sizeof(int64_t) * nnz_); + memcpy(A_rows_, A_rows_store_, sizeof(int64_t) * (m_ + 1)); + cudaCheckError(cudaDeviceSynchronize()); + } + + private: + void preLoopRequirements() override { + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + cudaCheckError(cudaMemcpyAsync(A_vals_dev_, A_vals_, nnz_ * sizeof(T), cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(A_cols_dev_, A_cols_, nnz_ * sizeof(int64_t), cudaMemcpyHostToDevice, s2_)); + cudaCheckError(cudaMemcpyAsync(A_rows_dev_, A_rows_, (m_ + 1) * sizeof(int64_t), cudaMemcpyHostToDevice, s3_)); + cudaCheckError(cudaMemcpyAsync(x_dev_, x_, n_ * sizeof(T), cudaMemcpyHostToDevice, s4_)); + cudaCheckError(cudaMemcpyAsync(y_dev_, y_, m_ * sizeof(T), cudaMemcpyHostToDevice, s5_)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + // Prefetch memory to device + cudaCheckError(cudaMemPrefetchAsync(A_vals_, nnz_ * sizeof(T), gpuDevice_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_cols_, nnz_ * sizeof(int64_t), gpuDevice_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(A_rows_, (m_ + 1) * sizeof(int64_t), gpuDevice_, s3_)); + cudaCheckError(cudaMemPrefetchAsync(x_, n_ * sizeof(T), gpuDevice_, s4_)); + cudaCheckError(cudaMemPrefetchAsync(y_, m_ * sizeof(T), gpuDevice_, s5_)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } + } + } + + /** Make a call to the BLAS Library Kernel. 
*/ + void callSpMDnV() override { + switch(offload_) { + case gpuOffloadType::always: { + cudaCheckError(cudaMemcpyAsync(A_vals_dev_, A_vals_, nnz_ * sizeof(T), cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(A_cols_dev_, A_cols_, nnz_ * sizeof(int64_t), cudaMemcpyHostToDevice, s2_)); + cudaCheckError(cudaMemcpyAsync(A_rows_dev_, A_rows_, (m_ + 1) * sizeof(int64_t), cudaMemcpyHostToDevice, s3_)); + cudaCheckError(cudaMemcpyAsync(x_dev_, x_, n_ * sizeof(T), cudaMemcpyHostToDevice, s4_)); + cudaCheckError(cudaMemcpyAsync(y_dev_, y_, m_ * sizeof(T), cudaMemcpyHostToDevice, s5_)); + + cusparseCheckError(cusparseCreateCsr(&A_descr_, + m_, + n_, + nnz_, + A_rows_dev_, + A_cols_dev_, + A_vals_dev_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateDnVec(&x_descr_, + n_, + x_dev_, + dataType_)); + cusparseCheckError(cusparseCreateDnVec(&y_descr_, + m_, + y_dev_, + dataType_)); + cudaCheckError(cudaDeviceSynchronize()); + + size_t bufferSize; + void* dBuffer = nullptr; + cusparseCheckError(cusparseSpMV_bufferSize(handle_, + opA_, + &alpha, + A_descr_, + x_descr_, + &beta, + y_descr_, + dataType_, + alg_, + &bufferSize)); + cudaCheckError(cudaDeviceSynchronize()); + + if (bufferSize > 0) cudaCheckError(cudaMalloc(&dBuffer, bufferSize)); + cudaCheckError(cudaDeviceSynchronize()); + + cusparseCheckError(cusparseSpMV(handle_, + opA_, + &alpha, + A_descr_, + x_descr_, + &beta, + y_descr_, + dataType_, + alg_, + dBuffer)); + cudaCheckError(cudaDeviceSynchronize()); + + cusparseCheckError(cusparseDestroySpMat(A_descr_)); + cusparseCheckError(cusparseDestroyDnVec(x_descr_)); + cusparseCheckError(cusparseDestroyDnVec(y_descr_)); + + cudaCheckError(cudaDeviceSynchronize()); + if (dBuffer != nullptr) cudaCheckError(cudaFree(dBuffer)); + + cudaCheckError(cudaMemcpyAsync(y_, y_dev_, m_ * sizeof(T), cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } + case gpuOffloadType::once: { + cusparseCheckError(cusparseCreateCsr(&A_descr_, + m_, + n_, + nnz_, + A_rows_dev_, + A_cols_dev_, + A_vals_dev_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateDnVec(&x_descr_, + n_, + x_dev_, + dataType_)); + cusparseCheckError(cusparseCreateDnVec(&y_descr_, + m_, + y_dev_, + dataType_)); + cudaCheckError(cudaDeviceSynchronize()); + + size_t bufferSize; + void* dBuffer = nullptr; + cusparseCheckError(cusparseSpMV_bufferSize(handle_, + opA_, + &alpha, + A_descr_, + x_descr_, + &beta, + y_descr_, + dataType_, + alg_, + &bufferSize)); + cudaCheckError(cudaDeviceSynchronize()); + + if (bufferSize > 0) cudaCheckError(cudaMalloc(&dBuffer, bufferSize)); + cudaCheckError(cudaDeviceSynchronize()); + + cusparseCheckError(cusparseSpMV(handle_, + opA_, + &alpha, + A_descr_, + x_descr_, + &beta, + y_descr_, + dataType_, + alg_, + dBuffer)); + cudaCheckError(cudaDeviceSynchronize()); + + cusparseCheckError(cusparseDestroySpMat(A_descr_)); + cusparseCheckError(cusparseDestroyDnVec(x_descr_)); + cusparseCheckError(cusparseDestroyDnVec(y_descr_)); + cudaCheckError(cudaDeviceSynchronize()); + if (dBuffer != nullptr) cudaCheckError(cudaFree(dBuffer)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + cusparseCheckError(cusparseCreateCsr(&A_descr_, + m_, + n_, + nnz_, + A_rows_, + A_cols_, + A_vals_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateDnVec(&x_descr_, + n_, + x_, + dataType_)); + cusparseCheckError(cusparseCreateDnVec(&y_descr_, + m_, + y_, + dataType_)); + 
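+        // Expanding on the workflow note below: cusparseSpMV computes
+        //   y = alpha * op(A) * x + beta * y,
+        // with alpha/beta taken from the benchmark's ALPHA/BETA constants. The
+        // call pair used in every branch of this method is (illustrative names):
+        //   size_t bytes = 0;  void* work = nullptr;
+        //   cusparseSpMV_bufferSize(handle, opA, &alpha, A, x, &beta, y,
+        //                           dataType, CUSPARSE_SPMV_ALG_DEFAULT, &bytes);
+        //   if (bytes > 0) cudaMalloc(&work, bytes);
+        //   cusparseSpMV(handle, opA, &alpha, A, x, &beta, y,
+        //                dataType, CUSPARSE_SPMV_ALG_DEFAULT, work);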
cudaCheckError(cudaDeviceSynchronize()); + /* + * Workflow is : + * cusparseSpMV_bufferSize + * cusparseSpMV + */ + size_t bufferSize; + void* dBuffer = nullptr; + cusparseCheckError(cusparseSpMV_bufferSize(handle_, + opA_, + &alpha, + A_descr_, + x_descr_, + &beta, + y_descr_, + dataType_, + alg_, + &bufferSize)); + cudaCheckError(cudaDeviceSynchronize()); + + // TODO -- cusparseSpMV_preprocess() + + if (bufferSize > 0) cudaCheckError(cudaMalloc(&dBuffer, bufferSize)); + cudaCheckError(cudaDeviceSynchronize()); + + cusparseCheckError(cusparseSpMV(handle_, + opA_, + &alpha, + A_descr_, + x_descr_, + &beta, + y_descr_, + dataType_, + alg_, + dBuffer)); + + cusparseCheckError(cusparseDestroySpMat(A_descr_)); + cusparseCheckError(cusparseDestroyDnVec(x_descr_)); + cusparseCheckError(cusparseDestroyDnVec(y_descr_)); + cudaCheckError(cudaDeviceSynchronize()); + if (dBuffer != nullptr) cudaCheckError(cudaFree(dBuffer)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } + } + } + + /** Perform any required steps after calling the SpMDnV kernel that should + * be timed. */ + void postLoopRequirements() override { + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + cudaCheckError(cudaMemcpyAsync(y_, y_dev_, sizeof(T) * m_, cudaMemcpyDeviceToHost, s3_)); + break; + } + case gpuOffloadType::unified: { + cudaCheckError(cudaMemPrefetchAsync(y_, m_ * sizeof(T), cudaCpuDeviceId, s3_)); + break; + } + } + cudaCheckError(cudaDeviceSynchronize()); + } + + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + void postCallKernelCleanup() override { + if (offload_ == gpuOffloadType::unified) { + cudaCheckError(cudaFree(A_vals_)); + cudaCheckError(cudaFree(A_cols_)); + cudaCheckError(cudaFree(A_rows_)); + cudaCheckError(cudaFree(x_)); + cudaCheckError(cudaFree(y_)); + free(A_vals_store_); + free(A_cols_store_); + free(A_rows_store_); + } else { + free(A_vals_); + free(A_cols_); + free(A_rows_); + free(x_); + free(y_); + cudaCheckError(cudaFree(A_vals_dev_)); + cudaCheckError(cudaFree(A_cols_dev_)); + cudaCheckError(cudaFree(A_rows_dev_)); + cudaCheckError(cudaFree(x_dev_)); + cudaCheckError(cudaFree(y_dev_)); + } + } + + bool initialised_ = false; + + /** + * ################################ + * CUSPARSE STUFF + * ################################ + */ + /** Handle used when calling cuBLAS. */ + cusparseHandle_t handle_; + + /** CUDA Streams - used to asynchronously move data between host and device. + */ + cudaStream_t s1_; + cudaStream_t s2_; + cudaStream_t s3_; + cudaStream_t s4_; + cudaStream_t s5_; + + /** The ID of the target GPU Device. */ + int gpuDevice_; + + // Create descriptors for matrices A->C + cusparseSpMatDescr_t A_descr_; + cusparseDnVecDescr_t x_descr_, y_descr_; + + // cusparse metadata variables + cudaDataType_t dataType_; + cusparseOperation_t opA_; + cusparseOperation_t opB_; + cusparseSpMVAlg_t alg_; + cusparseIndexType_t index_; + cusparseIndexBase_t base_; + + /** The constant value Alpha. */ + const T alpha = ALPHA; + + /** The constant value Beta. 
*/ + const T beta = BETA; + + /** + * ################################ + * Matrix A parameters + * ################################ + */ + /** CSR format vectors for storage of matrix between offload type runs */ + T* A_vals_store_; + int64_t* A_cols_store_; + int64_t* A_rows_store_; + + /** CSR format vectors on the host (also used for USM) */ + T* A_vals_; + int64_t* A_cols_; + int64_t* A_rows_; + /** CSR format vectors on the device. */ + T* A_vals_dev_; + int64_t* A_cols_dev_; + int64_t* A_rows_dev_; + + /** + * ################################ + * Vectors x and y parameters + * ################################ + */ + /** Vectors on the host (also used for USM) */ + T* x_host_; + T* y_host_; + /** Vectors on the device */ + T* x_dev_; + T* y_dev_; +}; +} // namespace gpu +#endif \ No newline at end of file diff --git a/cuBLAS/spmspm.hh b/cuBLAS/spmspm.hh new file mode 100644 index 0000000..5ec847c --- /dev/null +++ b/cuBLAS/spmspm.hh @@ -0,0 +1,907 @@ +#pragma once + +#ifdef GPU_CUBLAS +#include +#include +#include +#include +#include + +#include "../include/kernels/GPU/spmspm.hh" +#include "../include/utilities.hh" +#include "common.hh" + +namespace gpu { +/** A class for sparse GEMM GPU BLAS kernels. */ +template +class spmspm_gpu : public spmspm { + public: + using spmspm::spmspm; + using spmspm::initInputMatrices; + using spmspm::A_nnz_; + using spmspm::B_nnz_; + using spmspm::m_; + using spmspm::n_; + using spmspm::k_; + using spmspm::offload_; + using spmspm::sparsity_; + using spmspm::type_; + using spmspm::C_nnz_; + using spmspm::C_rows_; + using spmspm::C_cols_; + using spmspm::C_vals_; + + ~spmspm_gpu() { + if (alreadyInitialised_) { + cusparseCheckError(cusparseDestroy(handle_)); + + cudaCheckError(cudaStreamDestroy(s1_)); + cudaCheckError(cudaStreamDestroy(s2_)); + cudaCheckError(cudaStreamDestroy(s3_)); + cudaCheckError(cudaStreamDestroy(s4_)); + cudaCheckError(cudaStreamDestroy(s5_)); + cudaCheckError(cudaStreamDestroy(s6_)); + + alreadyInitialised_ = false; + } + } + + /** Initialise the required data structures. 
+ * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + void initialise(gpuOffloadType offload, int n, int m, int k, + double sparsity, matrixType type, + bool binary = false) override { + if (!alreadyInitialised_) { + alreadyInitialised_ = true; + cusparseCheckError(cusparseCreate(&handle_)); + + cudaCheckError(cudaStreamCreate(&s1_)); + cudaCheckError(cudaStreamCreate(&s2_)); + cudaCheckError(cudaStreamCreate(&s3_)); + cudaCheckError(cudaStreamCreate(&s4_)); + cudaCheckError(cudaStreamCreate(&s5_)); + cudaCheckError(cudaStreamCreate(&s6_)); + + cusparseCheckError(cusparseSetStream(handle_, s1_)); + + // Get device identifier + cudaCheckError(cudaGetDevice(&gpuDevice_)); + } + + offload_ = offload; + sparsity_ = sparsity; + type_ = type; + + m_ = m; + n_ = n; + k_ = k; + + /** Determine the number of nnz elements in A and B */ + A_nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + B_nnz_ = 1 + (uint64_t)((double)k_ * (double)n_ * (1.0 - sparsity_)); + + opA_ = CUSPARSE_OPERATION_NON_TRANSPOSE; + opB_ = CUSPARSE_OPERATION_NON_TRANSPOSE; + alg_ = CUSPARSE_SPGEMM_ALG2; + index_ = CUSPARSE_INDEX_32I; + base_ = CUSPARSE_INDEX_BASE_ZERO; + if (std::is_same_v) dataType_ = CUDA_R_32F; + else if (std::is_same_v) dataType_ = CUDA_R_64F; + else { + std::cerr << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; + exit(1); + } + + initInputMatrices(); + } + + protected: + void toSparseFormat() override { + if (offload_ == gpuOffloadType::always) { + A_vals_store_ = (T*)malloc(sizeof(T) * A_nnz_); + A_cols_store_ = (int32_t*)malloc(sizeof(int32_t) * A_nnz_); + A_rows_store_ = (int32_t*)malloc(sizeof(int32_t) * (m_ + 1)); + B_vals_store_ = (T*)malloc(sizeof(T) * B_nnz_); + B_cols_store_ = (int32_t*)malloc(sizeof(int32_t) * B_nnz_); + B_rows_store_ = (int32_t*)malloc(sizeof(int32_t) * (k_ + 1)); + + int seedOffset = 0; + do { + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, A_nnz_, SEED + seedOffset++); + rMatCSR(B_vals_store_, B_cols_store_, B_rows_store_, k_, n_, B_nnz_, SEED + seedOffset++); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, A_nnz_, SEED + seedOffset++); + randomCSR(B_vals_store_, B_cols_store_, B_rows_store_, k_, n_, B_nnz_, SEED + seedOffset++); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, A_nnz_, SEED + seedOffset++); + finiteElementCSR(B_vals_store_, B_cols_store_, B_rows_store_, k_, n_, B_nnz_, SEED + seedOffset++); + } else { + std::cerr << "Matrix type not supported" << std::endl; + exit(1); + } + } while (calcCNNZ(m_, A_nnz_, A_rows_store_, A_cols_store_, k_, B_nnz_, B_rows_store_, B_cols_store_) == 0); + } + + // Allocate CSR arrays + if (offload_ == gpuOffloadType::unified) { + cudaCheckError(cudaMallocManaged(&A_vals_, sizeof(T) * A_nnz_)); + cudaCheckError(cudaMallocManaged(&A_cols_, sizeof(int32_t) * A_nnz_)); + cudaCheckError(cudaMallocManaged(&A_rows_, sizeof(int32_t) * (m_ + 1))); + cudaCheckError(cudaMallocManaged(&B_vals_, sizeof(T) * B_nnz_)); + cudaCheckError(cudaMallocManaged(&B_cols_, sizeof(int32_t) * B_nnz_)); + cudaCheckError(cudaMallocManaged(&B_rows_, sizeof(int32_t) * (k_ + 
1))); + cudaCheckError(cudaMallocManaged(&C_rows_32_, sizeof(int32_t) * (m_ + 1))); + C_vals_ = nullptr; + C_cols_32_ = nullptr; + } else { + A_vals_ = (T*)malloc(sizeof(T) * A_nnz_); + A_cols_ = (int32_t*)malloc(sizeof(int32_t) * A_nnz_); + A_rows_ = (int32_t*)malloc(sizeof(int32_t) * (m_ + 1)); + B_vals_ = (T*)malloc(sizeof(T) * B_nnz_); + B_cols_ = (int32_t*)malloc(sizeof(int32_t) * B_nnz_); + B_rows_ = (int32_t*)malloc(sizeof(int32_t) * (k_ + 1)); + C_rows_32_ = (int32_t*)malloc(sizeof(int32_t) * (m_ + 1)); + C_vals_ = nullptr; + C_cols_32_ = nullptr; + + cudaCheckError(cudaMalloc((void**)&A_vals_dev_, sizeof(T) * A_nnz_)); + cudaCheckError(cudaMalloc((void**)&A_cols_dev_, sizeof(int32_t) * A_nnz_)); + cudaCheckError(cudaMalloc((void**)&A_rows_dev_, sizeof(int32_t) * (m_ + 1))); + cudaCheckError(cudaMalloc((void**)&B_vals_dev_, sizeof(T) * B_nnz_)); + cudaCheckError(cudaMalloc((void**)&B_cols_dev_, sizeof(int32_t) * B_nnz_)); + cudaCheckError(cudaMalloc((void**)&B_rows_dev_, sizeof(int32_t) * (k_ + 1))); + cudaCheckError(cudaMalloc((void**)&C_rows_dev_, sizeof(int32_t) * (m_ + 1))); + C_vals_dev_ = nullptr; + C_cols_dev_ = nullptr; + } + + // Move data into the correct arrays + memcpy(A_vals_, A_vals_store_, sizeof(T) * A_nnz_); + memcpy(A_cols_, A_cols_store_, sizeof(int32_t) * A_nnz_); + memcpy(A_rows_, A_rows_store_, sizeof(int32_t) * (m_ + 1)); + memcpy(B_vals_, B_vals_store_, sizeof(T) * B_nnz_); + memcpy(B_cols_, B_cols_store_, sizeof(int32_t) * B_nnz_); + memcpy(B_rows_, B_rows_store_, sizeof(int32_t) * (k_ + 1)); + } + + private: + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override { + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + cudaCheckError(cudaMemcpyAsync(A_vals_dev_, A_vals_, sizeof(T) * A_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(A_cols_dev_, A_cols_, sizeof(int32_t) * A_nnz_, cudaMemcpyHostToDevice, s2_)); + cudaCheckError(cudaMemcpyAsync(A_rows_dev_, A_rows_, sizeof(int32_t) * (m_ + 1), cudaMemcpyHostToDevice, s3_)); + cudaCheckError(cudaMemcpyAsync(B_vals_dev_, B_vals_, sizeof(T) * B_nnz_, cudaMemcpyHostToDevice, s4_)); + cudaCheckError(cudaMemcpyAsync(B_cols_dev_, B_cols_, sizeof(int32_t) * B_nnz_, cudaMemcpyHostToDevice, s5_)); + cudaCheckError(cudaMemcpyAsync(B_rows_dev_, B_rows_, sizeof(int32_t) * (k_ + 1), cudaMemcpyHostToDevice, s6_)); + break; + } + case gpuOffloadType::unified: { + // Prefetch memory to device + cudaCheckError(cudaMemPrefetchAsync(A_vals_, sizeof(T) * A_nnz_, gpuDevice_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_cols_, sizeof(int32_t) * A_nnz_, gpuDevice_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(A_rows_, sizeof(int32_t) * (m_ + 1), gpuDevice_, s3_)); + cudaCheckError(cudaMemPrefetchAsync(B_vals_, sizeof(T) * B_nnz_, gpuDevice_, s4_)); + cudaCheckError(cudaMemPrefetchAsync(B_cols_, sizeof(int32_t) * B_nnz_, gpuDevice_, s5_)); + cudaCheckError(cudaMemPrefetchAsync(B_rows_, sizeof(int32_t) * (k_ + 1), gpuDevice_, s6_)); + break; + } + } + } + + /** Make a call to the BLAS Library Kernel. 
*/ + void callSpmspm() override { + switch(offload_) { + case gpuOffloadType::always: { + if (C_allocated) { + free(C_vals_); + free(C_cols_32_); + C_allocated = false; + } + + cudaCheckError(cudaMemcpyAsync(A_vals_dev_, A_vals_, sizeof(T) * A_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(A_cols_dev_, A_cols_, sizeof(int32_t) * A_nnz_, cudaMemcpyHostToDevice, s2_)); + cudaCheckError(cudaMemcpyAsync(A_rows_dev_, A_rows_, sizeof(int32_t) * (m_ + 1), cudaMemcpyHostToDevice, s3_)); + cudaCheckError(cudaMemcpyAsync(B_vals_dev_, B_vals_, sizeof(T) * B_nnz_, cudaMemcpyHostToDevice, s4_)); + cudaCheckError(cudaMemcpyAsync(B_cols_dev_, B_cols_, sizeof(int32_t) * B_nnz_, cudaMemcpyHostToDevice, s5_)); + cudaCheckError(cudaMemcpyAsync(B_rows_dev_, B_rows_, sizeof(int32_t) * (k_ + 1), cudaMemcpyHostToDevice, s6_)); + cudaCheckError(cudaDeviceSynchronize()); + + // Make matrix descriptors + cusparseCheckError(cusparseCreateCsr(&A_descr_, + m_, + k_, + A_nnz_, + A_rows_dev_, + A_cols_dev_, + A_vals_dev_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateCsr(&B_descr_, + k_, + n_, + B_nnz_, + B_rows_dev_, + B_cols_dev_, + B_vals_dev_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateCsr(&C_descr_, + m_, + n_, + 0, + nullptr, + nullptr, + nullptr, + index_, + index_, + base_, + dataType_)); + + cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDescr_)); + + size_t bufferSize1 = 0; + cusparseCheckError(cusparseSpGEMM_workEstimation(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + &bufferSize1, + nullptr)); + + void* dBuffer1 = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer1, bufferSize1)); + + cusparseCheckError(cusparseSpGEMM_workEstimation(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + &bufferSize1, + dBuffer1)); + + int64_t num_prods; + cusparseCheckError(cusparseSpGEMM_getNumProducts(spgemmDescr_, &num_prods)); + + size_t bufferSize3 = 0; + cusparseCheckError(cusparseSpGEMM_estimateMemory(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + 0.2, + &bufferSize3, + nullptr, + nullptr)); + + void* dBuffer3 = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer3, bufferSize3)); + size_t bufferSize2 = 0; + cusparseCheckError(cusparseSpGEMM_estimateMemory(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + 0.2, + &bufferSize3, + dBuffer3, + &bufferSize2)); + + void* dBuffer2 = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer2, bufferSize2)); + + cusparseCheckError(cusparseSpGEMM_compute(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + &bufferSize2, + dBuffer2)); + + cusparseCheckError(cusparseSpMatGetSize(C_descr_, + &C_num_rows_, + &C_num_cols_, + &C_nnz_)); + + cudaCheckError(cudaMalloc(&C_vals_dev_, sizeof(T) * C_nnz_)); + cudaCheckError(cudaMalloc(&C_cols_dev_, sizeof(int32_t) * C_nnz_)); + + cusparseCheckError(cusparseCsrSetPointers(C_descr_, + C_rows_dev_, + C_cols_dev_, + C_vals_dev_)); + + cusparseCheckError(cusparseSpGEMM_copy(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_)); + + // Freeing memory + cudaCheckError(cudaFree(dBuffer1)); + cudaCheckError(cudaFree(dBuffer2)); + 
cudaCheckError(cudaFree(dBuffer3)); + cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDescr_)); + cusparseCheckError(cusparseDestroySpMat(A_descr_)); + cusparseCheckError(cusparseDestroySpMat(B_descr_)); + cusparseCheckError(cusparseDestroySpMat(C_descr_)); + + C_vals_ = (T*)malloc(sizeof(T) * C_nnz_); + C_cols_32_ = (int32_t*)malloc(sizeof(int32_t) * C_nnz_); + C_allocated = true; + + cudaCheckError(cudaMemcpyAsync(C_rows_32_, C_rows_dev_, sizeof(int32_t) * (m_ + 1), cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(C_cols_32_, C_cols_dev_, sizeof(int32_t) * C_nnz_, cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(C_vals_, C_vals_dev_, sizeof(T) * C_nnz_, cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaDeviceSynchronize()); + + cudaCheckError(cudaFree(C_vals_dev_)); + cudaCheckError(cudaFree(C_cols_dev_)); + break; + } + case gpuOffloadType::once: { + if (C_allocated) { + cudaCheckError(cudaFree(C_vals_dev_)); + cudaCheckError(cudaFree(C_cols_dev_)); + C_allocated = false; + } + // Make matrix descriptors + cusparseCheckError(cusparseCreateCsr(&A_descr_, + m_, + k_, + A_nnz_, + A_rows_dev_, + A_cols_dev_, + A_vals_dev_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateCsr(&B_descr_, + k_, + n_, + B_nnz_, + B_rows_dev_, + B_cols_dev_, + B_vals_dev_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateCsr(&C_descr_, + m_, + n_, + 0, + nullptr, + nullptr, + nullptr, + index_, + index_, + base_, + dataType_)); + + cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDescr_)); + + size_t bufferSize1 = 0; + cusparseCheckError(cusparseSpGEMM_workEstimation(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + &bufferSize1, + nullptr)); + + void* dBuffer1 = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer1, bufferSize1)); + + cusparseCheckError(cusparseSpGEMM_workEstimation(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + &bufferSize1, + dBuffer1)); + + int64_t num_prods; + cusparseCheckError(cusparseSpGEMM_getNumProducts(spgemmDescr_, &num_prods)); + + size_t bufferSize3 = 0; + cusparseCheckError(cusparseSpGEMM_estimateMemory(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + 0.2, + &bufferSize3, + nullptr, + nullptr)); + + void* dBuffer3 = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer3, bufferSize3)); + size_t bufferSize2 = 0; + cusparseCheckError(cusparseSpGEMM_estimateMemory(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + 0.2, + &bufferSize3, + dBuffer3, + &bufferSize2)); + + void* dBuffer2 = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer2, bufferSize2)); + + cusparseCheckError(cusparseSpGEMM_compute(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + &bufferSize2, + dBuffer2)); + + cusparseCheckError(cusparseSpMatGetSize(C_descr_, + &C_num_rows_, + &C_num_cols_, + &C_nnz_)); + + cudaCheckError(cudaMalloc(&C_vals_dev_, sizeof(T) * C_nnz_)); + cudaCheckError(cudaMalloc(&C_cols_dev_, sizeof(int32_t) * C_nnz_)); + C_allocated = true; + + cusparseCheckError(cusparseCsrSetPointers(C_descr_, + C_rows_dev_, + C_cols_dev_, + C_vals_dev_)); + + cusparseCheckError(cusparseSpGEMM_copy(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + 
C_descr_, + dataType_, + alg_, + spgemmDescr_)); + + // Freeing memory + cudaCheckError(cudaFree(dBuffer1)); + cudaCheckError(cudaFree(dBuffer2)); + cudaCheckError(cudaFree(dBuffer3)); + cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDescr_)); + cusparseCheckError(cusparseDestroySpMat(A_descr_)); + cusparseCheckError(cusparseDestroySpMat(B_descr_)); + cusparseCheckError(cusparseDestroySpMat(C_descr_)); + break; + } + case gpuOffloadType::unified: { + if (C_allocated) { + cudaCheckError(cudaFree(C_cols_32_)); + cudaCheckError(cudaFree(C_vals_)); + C_allocated = false; + } + + // Make matrix descriptors + cusparseCheckError(cusparseCreateCsr(&A_descr_, + m_, + k_, + A_nnz_, + A_rows_, + A_cols_, + A_vals_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateCsr(&B_descr_, + k_, + n_, + B_nnz_, + B_rows_, + B_cols_, + B_vals_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateCsr(&C_descr_, + m_, + n_, + 0, + nullptr, + nullptr, + nullptr, + index_, + index_, + base_, + dataType_)); + + cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDescr_)); + + size_t bufferSize1 = 0; + cusparseCheckError(cusparseSpGEMM_workEstimation(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + &bufferSize1, + nullptr)); + + void* dBuffer1 = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer1, bufferSize1)); + + cusparseCheckError(cusparseSpGEMM_workEstimation(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + &bufferSize1, + dBuffer1)); + + int64_t num_prods; + cusparseCheckError(cusparseSpGEMM_getNumProducts(spgemmDescr_, &num_prods)); + + size_t bufferSize3 = 0; + cusparseCheckError(cusparseSpGEMM_estimateMemory(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + 0.2, + &bufferSize3, + nullptr, + nullptr)); + + void* dBuffer3 = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer3, bufferSize3)); + size_t bufferSize2 = 0; + cusparseCheckError(cusparseSpGEMM_estimateMemory(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + 0.2, + &bufferSize3, + dBuffer3, + &bufferSize2)); + + void* dBuffer2 = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer2, bufferSize2)); + + cusparseCheckError(cusparseSpGEMM_compute(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + &bufferSize2, + dBuffer2)); + + cusparseCheckError(cusparseSpMatGetSize(C_descr_, + &C_num_rows_, + &C_num_cols_, + &C_nnz_)); + + cudaCheckError(cudaMallocManaged(&C_vals_, sizeof(T) * C_nnz_)); + cudaCheckError(cudaMallocManaged(&C_cols_32_, sizeof(int32_t) * C_nnz_)); + C_allocated = true; + + cusparseCheckError(cusparseCsrSetPointers(C_descr_, + C_rows_32_, + C_cols_32_, + C_vals_)); + + cusparseCheckError(cusparseSpGEMM_copy(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_)); + + // Freeing memory + cudaCheckError(cudaFree(dBuffer1)); + cudaCheckError(cudaFree(dBuffer2)); + cudaCheckError(cudaFree(dBuffer3)); + cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDescr_)); + cusparseCheckError(cusparseDestroySpMat(A_descr_)); + cusparseCheckError(cusparseDestroySpMat(B_descr_)); + cusparseCheckError(cusparseDestroySpMat(C_descr_)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } 
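+      // Each branch above follows the memory-constrained cusparseSpGEMM (ALG2)
+      // workflow: cusparseSpGEMM_workEstimation twice (size query, then with the
+      // buffer), cusparseSpGEMM_estimateMemory with a chunk fraction of 0.2,
+      // cusparseSpGEMM_compute, cusparseSpMatGetSize to discover C's nnz,
+      // allocation of C's value/column arrays, cusparseCsrSetPointers, and
+      // finally cusparseSpGEMM_copy to materialise C. C cannot be sized up
+      // front because its nnz is only known once the compute step finishes.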
+ } + } + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. */ + void postLoopRequirements() override { + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + C_vals_ = (T*)malloc(sizeof(T) * C_nnz_); + C_cols_32_ = (int32_t*)malloc(sizeof(int32_t) * C_nnz_); + + cudaCheckError(cudaMemcpyAsync(C_rows_32_, C_rows_dev_, sizeof(int32_t) * (m_ + 1), cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(C_cols_32_, C_cols_dev_, sizeof(int32_t) * C_nnz_, cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(C_vals_, C_vals_dev_, sizeof(T) * C_nnz_, cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaDeviceSynchronize()); + + if (C_allocated) { + cudaCheckError(cudaFree(C_vals_dev_)); + cudaCheckError(cudaFree(C_cols_dev_)); + C_allocated = false; + } + break; + } + case gpuOffloadType::unified: { + cudaCheckError(cudaMemPrefetchAsync(C_vals_, sizeof(T) * C_nnz_, cudaCpuDeviceId, s1_)); + cudaCheckError(cudaMemPrefetchAsync(C_cols_32_, sizeof(int32_t) * C_nnz_, cudaCpuDeviceId, s2_)); + cudaCheckError(cudaMemPrefetchAsync(C_rows_32_, sizeof(int32_t) * (m_ + 1), cudaCpuDeviceId, s3_)); + break; + } + } + } + + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + void postCallKernelCleanup() override { + switch (offload_) { + case gpuOffloadType::always: { + if (C_allocated) { + free(C_vals_); + free(C_cols_32_); + C_allocated = false; + } + free(A_vals_); + free(A_cols_); + free(A_rows_); + free(B_vals_); + free(B_cols_); + free(B_rows_); + free(C_rows_32_); + + cudaCheckError(cudaFree(A_vals_dev_)); + cudaCheckError(cudaFree(A_cols_dev_)); + cudaCheckError(cudaFree(A_rows_dev_)); + cudaCheckError(cudaFree(B_vals_dev_)); + cudaCheckError(cudaFree(B_cols_dev_)); + cudaCheckError(cudaFree(B_rows_dev_)); + cudaCheckError(cudaFree(C_rows_dev_)); + break; + } + case gpuOffloadType::once: { + free(A_vals_); + free(A_cols_); + free(A_rows_); + free(B_vals_); + free(B_cols_); + free(B_rows_); + free(C_vals_); + free(C_cols_32_); + free(C_rows_32_); + + cudaCheckError(cudaFree(A_vals_dev_)); + cudaCheckError(cudaFree(A_cols_dev_)); + cudaCheckError(cudaFree(A_rows_dev_)); + cudaCheckError(cudaFree(B_vals_dev_)); + cudaCheckError(cudaFree(B_cols_dev_)); + cudaCheckError(cudaFree(B_rows_dev_)); + cudaCheckError(cudaFree(C_rows_dev_)); + break; + } + case gpuOffloadType::unified: { + if (C_allocated) { + cudaCheckError(cudaFree(C_vals_)); + cudaCheckError(cudaFree(C_cols_32_)); + C_allocated = false; + } + cudaCheckError(cudaFree(A_vals_)); + cudaCheckError(cudaFree(A_cols_)); + cudaCheckError(cudaFree(A_rows_)); + cudaCheckError(cudaFree(B_vals_)); + cudaCheckError(cudaFree(B_cols_)); + cudaCheckError(cudaFree(B_rows_)); + cudaCheckError(cudaFree(C_rows_32_)); + + free(A_vals_store_); + free(A_cols_store_); + free(A_rows_store_); + free(B_vals_store_); + free(B_cols_store_); + free(B_rows_store_); + break; + } + } + } + + bool alreadyInitialised_ = false; + + /** Handle used when calling cuBLAS. */ + cusparseHandle_t handle_; + + /** CUDA Streams - used to asynchronously move data between host and device. */ + cudaStream_t s1_; + cudaStream_t s2_; + cudaStream_t s3_; + cudaStream_t s4_; + cudaStream_t s5_; + cudaStream_t s6_; + + /** The ID of the target GPU Device. 
*/ + int gpuDevice_; + + /** Storage for matrices between offload type calls */ + T* A_vals_store_; + int32_t* A_cols_store_; + int32_t* A_rows_store_; + T* B_vals_store_; + int32_t* B_cols_store_; + int32_t* B_rows_store_; + + /** CSR format vectors for matrices A, B and C on the host */ + T* A_vals_; + int32_t* A_cols_; + int32_t* A_rows_; + + T* B_vals_; + int32_t* B_cols_; + int32_t* B_rows_; + + int64_t C_num_rows_; + int64_t C_num_cols_; + + /** CSR format vectors for matrices A, B and C on the device. */ + T* A_vals_dev_; + T* B_vals_dev_; + T* C_vals_dev_; + int32_t* A_cols_dev_; + int32_t* A_rows_dev_; + int32_t* B_cols_dev_; + int32_t* B_rows_dev_; + int32_t* C_cols_dev_; + int32_t* C_rows_dev_; + + int32_t* C_cols_32_; + int32_t* C_rows_32_; + + bool C_allocated = false; + + /** The constant value Alpha. */ + const T alpha = ALPHA; + + /** The constant value Beta. */ + const T beta = BETA; + + // Create descriptors for matrices A->C + cusparseSpMatDescr_t A_descr_, B_descr_, C_descr_; + + cusparseSpGEMMDescr_t spgemmDescr_; + + cusparseOperation_t opA_; + cusparseOperation_t opB_; + cusparseSpGEMMAlg_t alg_; + cusparseIndexType_t index_; + cusparseIndexBase_t base_; + cudaDataType_t dataType_; +}; +} // namespace gpu +#endif \ No newline at end of file diff --git a/include/.DS_Store b/include/.DS_Store new file mode 100644 index 0000000..869e02c Binary files /dev/null and b/include/.DS_Store differ diff --git a/include/doGemm.hh b/include/doGemm.hh index c1aa742..55a6384 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -32,21 +32,22 @@ template class doGemm { public: doGemm(const std::string csvDir, const int iters, const int startDim, - const int upperLimit, const bool cpuEnabled = true, + const int upperLimit, const int step, const bool cpuEnabled = true, const bool gpuEnabled = true) : CSV_DIR(csvDir), iterations_(iters), startDimention_(startDim), upperLimit_(upperLimit), + step_(step), doCPU_(cpuEnabled), doGPU_(gpuEnabled) #if CPU_ENABLED , - gemmCpu_(iterations_) + cpu_(iterations_) #endif #if GPU_ENABLED , - gemmGpu_(iterations_) + gpu_(iterations_) #endif { static_assert((std::is_same_v || std::is_same_v) && @@ -57,7 +58,7 @@ class doGemm { /** Run all problem types and write data to CSV files. */ void collectData() { // Square Problem Sizes... 
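+    // Note on the new step_ parameter: every sweep below now advances by
+    // step_ instead of 1, and the 16:1 rectangular sweeps scale their
+    // increments by 16 * step_ so the aspect ratio is preserved. A caller
+    // (values illustrative only) would now construct the class as:
+    //   doGemm<double> gemm("csv_out", /*iters=*/10, /*startDim=*/1,
+    //                       /*upperLimit=*/4096, /*step=*/32);
+    //   gemm.collectData();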
- // Re-initialise offload threshold structures & previous results + // Re-initialise offload threshold structures cpuGpu_always_ = cpuGpu_offloadThreshold(); cpuGpu_once_ = cpuGpu_offloadThreshold(); cpuGpu_unified_ = cpuGpu_offloadThreshold(); @@ -66,7 +67,7 @@ class doGemm { prev_gpuResult_unified = time_checksum_gflop(); std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_square_M=N=K.csv"); - for (int dim = startDimention_; dim <= upperLimit_; dim++) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { // M = dim, N = dim, K = dim; callKernels(csvFile, dim, dim, dim); } @@ -79,218 +80,217 @@ class doGemm { } #endif - // Rectangular Problem Sizes: - // Tall and thin x Short and wide - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_tall-thin_short-wide_M=N_M=16K.csv"); - int K = startDimention_; - int M = 16 * K; - int N = 16 * K; - while (M <= upperLimit_) { - callKernels(csvFile, M, N, K); - M += 16; - N += 16; - K++; - } - // Close file - csvFile.close(); + // Rectangular Problem Sizes: + // Tall and thin x Short and wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_short-wide_M=N_M=16K.csv"); + int K = startDimention_; + int M = 16 * K; + int N = 16 * K; + while (M <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += 16 * step_; + N += 16 * step_; + K += step_; + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, M=16K)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, M=16K)"); + } #endif - // Tall and thin x Short and wide - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_tall-thin_short-wide_M=N_K=32.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = dim, N = dim, K = 32; - callKernels(csvFile, dim, dim, 32); - } + // Tall and thin x Short and wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + 
"_tall-thin_short-wide_M=N_K=32.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = dim, K = 32; + callKernels(csvFile, dim, dim, 32); } - // Close file - csvFile.close(); + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, K=32)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, K=32)"); + } #endif - // Short and wide x Tall and thin - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_short-wide_tall-thin_M=N_K=16M.csv"); - M = startDimention_; - N = startDimention_; - K = 16 * M; - while (K <= upperLimit_) { - callKernels(csvFile, M, N, K); - M++; - N++; - K += 16; - } - // Close file - csvFile.close(); + // Short and wide x Tall and thin + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_tall-thin_M=N_K=16M.csv"); + M = startDimention_; + N = startDimention_; + K = 16 * M; + while (K <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += step_; + N += step_; + K += 16 * step_; + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N, K=16M)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N, K=16M)"); + } #endif - // Short and wide x Tall and thin - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_short-wide_tall-thin_M=N=32_K.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = 32, N = 32, K = dim; - callKernels(csvFile, 32, 32, dim); - } + // Short and wide x Tall and thin + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_tall-thin_M=N=32_K.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = 32, N = 32, K = dim; + 
callKernels(csvFile, 32, 32, dim); } - // Close file - csvFile.close(); + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N=32, K)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N=32, K)"); + } #endif - // Tall and Thin x Square - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_tall-thin_square_K=N_M=16K.csv"); - K = startDimention_; - N = startDimention_; - M = 16 * K; - while (M <= upperLimit_) { - callKernels(csvFile, M, N, K); - M += 16; - N++; - K++; - } - // Close file - csvFile.close(); + // Tall and Thin x Square + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_square_K=N_M=16K.csv"); + K = startDimention_; + N = startDimention_; + M = 16 * K; + while (M <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += 16 * step_; + N += step_; + K += step_; + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Tall-and-Thin x Square (K=N, M=16K)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Square (K=N, M=16K)"); + } #endif - // Tall and Thin x Square - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_tall-thin_square_K=N=32_M.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = dim, N = 32, K = 32; - callKernels(csvFile, dim, 32, 32); - } + // Tall and Thin x Square + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_square_K=N=32_M.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = 32, K = 32; + callKernels(csvFile, dim, 32, 32); } - // Close file - csvFile.close(); + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Tall-and-Thin x 
Square (M, K=N=32)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Square (M, K=N=32)"); + } #endif - // Square x Short and Wide - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_square_short-wide_M=K_N=16K.csv"); - M = startDimention_; - K = startDimention_; - N = 16 * K; - while (N <= upperLimit_) { - callKernels(csvFile, M, N, K); - M++; - N += 16; - K++; - } - // Close file - csvFile.close(); + // Square x Short and Wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_short-wide_M=K_N=16K.csv"); + M = startDimention_; + K = startDimention_; + N = 16 * K; + while (N <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += step_; + N += 16 * step_; + K += step_; + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)"); + } #endif - - // Square x Short and Wide - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_square_short-wide_M=K=32_N.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = 32, N = dim, K = 32; - callKernels(csvFile, 32, dim, 32); - } + // Square x Short and Wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_short-wide_M=K=32_N.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = 32, N = dim, K = 32; + callKernels(csvFile, 32, dim, 32); } - // Close file - csvFile.close(); + } #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)"); + } #endif + // Close file + csvFile.close(); } private: @@ -301,55 +301,54 @@ class doGemm { const uint64_t flops = calcFlops(M, N, K); 
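+    // For reference: a dense GEMM C = alpha*A*B + beta*C is conventionally
+    // counted as flops ~= 2 * M * N * K (one multiply and one add per term),
+    // with a working set of roughly (M*K + K*N + M*N) * sizeof(T) / 1024 KiB,
+    // which is what calcFlops()/calcKib() are assumed to implement here.
+    // The extra 0.0 passed to writeLineToCsv below fills the new sparsity
+    // column; it is always zero for the dense GEMM kernels.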
std::string kernelName = getKernelName(); - time_checksum_gflop cpuResult; - time_checksum_gflop gpuResult_once; - time_checksum_gflop gpuResult_always; - time_checksum_gflop gpuResult_unified; - // Perform CPU kernel #if CPU_ENABLED + time_checksum_gflop cpuResult; if (doCPU_) { - gemmCpu_.initialise(M, N, K); - cpuResult = gemmCpu_.compute(); + cpu_.initialise(M, N, K); + cpuResult = cpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); // Write result to CSV file - writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize, iterations_, - cpuResult.runtime, cpuResult.gflops); + writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize, + 0.0, iterations_, cpuResult.runtime, cpuResult.gflops); } #endif // Perform the GPU kernels #if GPU_ENABLED + time_checksum_gflop gpuResult_once; + time_checksum_gflop gpuResult_always; + time_checksum_gflop gpuResult_unified; if (doGPU_) { // - ONCE : Offload to/from GPU once before all iterations and once // after - gemmGpu_.initialise(gpuOffloadType::once, M, N, K); - gpuResult_once = gemmGpu_.compute(); + gpu_.initialise(gpuOffloadType::once, M, N, K); + gpuResult_once = gpu_.compute(); gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); // - ALWAYS: Offload to/from GPU every iteration - gemmGpu_.initialise(gpuOffloadType::always, M, N, K); - gpuResult_always = gemmGpu_.compute(); + gpu_.initialise(gpuOffloadType::always, M, N, K); + gpuResult_always = gpu_.compute(); gpuResult_always.gflops = calcGflops(flops, iterations_, gpuResult_always.runtime); // - UNIFIED : data passed from host to device (and device to host) as // needed - gemmGpu_.initialise(gpuOffloadType::unified, M, N, K); - gpuResult_unified = gemmGpu_.compute(); + gpu_.initialise(gpuOffloadType::unified, M, N, K); + gpuResult_unified = gpu_.compute(); gpuResult_unified.gflops = calcGflops(flops, iterations_, gpuResult_unified.runtime); // Write results to CSV file writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, K, probSize, - iterations_, gpuResult_once.runtime, + 0.0, iterations_, gpuResult_once.runtime, gpuResult_once.gflops); writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, K, - probSize, iterations_, gpuResult_always.runtime, + probSize, 0.0, iterations_, gpuResult_always.runtime, gpuResult_always.gflops); writeLineToCsv(csvFile, "gpu_unified", kernelName, M, N, K, probSize, - iterations_, gpuResult_unified.runtime, + 0.0, iterations_, gpuResult_unified.runtime, gpuResult_unified.gflops); } #endif @@ -386,8 +385,8 @@ class doGemm { void checkChecksums(time_checksum_gflop cpuResult, time_checksum_gflop gpuResult_once, time_checksum_gflop gpuResult_always, - time_checksum_gflop gpuResult_unified, const int M, - const int N, const int K) { + time_checksum_gflop gpuResult_unified, + const int M, const int N, const int K) { // Ensure that each checksum difference is less than 0.1% double hundredOverChecksum = 100 / std::fabs(cpuResult.checksum); if (((std::fabs(cpuResult.checksum - gpuResult_once.checksum) * @@ -396,21 +395,12 @@ class doGemm { hundredOverChecksum)) > 0.1 && ((std::fabs(cpuResult.checksum - gpuResult_unified.checksum) * hundredOverChecksum)) > 0.1) { - std::cerr << "ERROR - " << getKernelName() - << " kernel checksums do not match:\n\tInput " - "dimensions: M=" - << M << ", N=" << N << ", K=" << K << std::endl; - std::cerr << std::setprecision(10) - << "\tCPU Checksum = " << cpuResult.checksum << std::endl; - std::cerr << std::setprecision(10) - << "\tGPU (Once) Checksum = " << 
gpuResult_once.checksum - << std::endl; - std::cerr << std::setprecision(10) - << "\tGPU (Always) Checksum = " << gpuResult_always.checksum - << std::endl; - std::cerr << std::setprecision(10) - << "\tGPU (Unified) Checksum = " << gpuResult_unified.checksum - << std::endl; + std::cerr << "ERROR - " << getKernelName() << " kernel checksums do not match:\n\tInput " + "dimensions: M=" << M << ", N=" << N << ", K=" << K << std::endl; + std::cerr << std::setprecision(10) << "\tCPU Checksum = " << cpuResult.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Once) Checksum = " << gpuResult_once.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Always) Checksum = " << gpuResult_always.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Unified) Checksum = " << gpuResult_unified.checksum << std::endl; exit(1); } } @@ -524,7 +514,7 @@ class doGemm { } /** Print to stdout the offload thresholds. */ - void printOffloadThreshold(std::string problemName) const { + void printOffloadThreshold(const std::string& problemName) const { std::vector header = { "Device", "M", "N", "K", "Total Prob. Size (KiB)", "GFLOP/s", "CPU GFLOP/s"}; @@ -534,8 +524,7 @@ class doGemm { std::stringstream probSize_o; std::stringstream gpuGflops_o; std::stringstream cpuGflops_o; - probSize_o << std::fixed << std::setprecision(2) - << cpuGpu_once_.probSize_kib; + probSize_o << std::fixed << std::setprecision(2) << cpuGpu_once_.probSize_kib; gpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.gpuGflops; cpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.cpuGflops; if (cpuGpu_once_.M == 0) { @@ -554,12 +543,9 @@ class doGemm { std::stringstream probSize_a; std::stringstream gpuGflops_a; std::stringstream cpuGflops_a; - probSize_a << std::fixed << std::setprecision(2) - << cpuGpu_always_.probSize_kib; - gpuGflops_a << std::fixed << std::setprecision(2) - << cpuGpu_always_.gpuGflops; - cpuGflops_a << std::fixed << std::setprecision(2) - << cpuGpu_always_.cpuGflops; + probSize_a << std::fixed << std::setprecision(2) << cpuGpu_always_.probSize_kib; + gpuGflops_a << std::fixed << std::setprecision(2) << cpuGpu_always_.gpuGflops; + cpuGflops_a << std::fixed << std::setprecision(2) << cpuGpu_always_.cpuGflops; if (cpuGpu_always_.M == 0) { // No offload threshold found rows.push_back({"GPU (Offload Always)", std::to_string(0), @@ -576,12 +562,9 @@ class doGemm { std::stringstream probSize_u; std::stringstream gpuGflops_u; std::stringstream cpuGflops_u; - probSize_u << std::fixed << std::setprecision(2) - << cpuGpu_unified_.probSize_kib; - gpuGflops_u << std::fixed << std::setprecision(2) - << cpuGpu_unified_.gpuGflops; - cpuGflops_u << std::fixed << std::setprecision(2) - << cpuGpu_unified_.cpuGflops; + probSize_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.probSize_kib; + gpuGflops_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.gpuGflops; + cpuGflops_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.cpuGflops; if (cpuGpu_unified_.M == 0) { // No offload threshold found rows.push_back({"GPU (Unified Memory)", std::to_string(0), @@ -606,12 +589,15 @@ class doGemm { /** The number of iterations to perform per problem size. */ const int iterations_; - /** The value of the first probelm size dimention run. */ + /** The value of the first problem size dimension run. */ const int startDimention_; - /** The maximum value of the largest problem size dimention. */ + /** The maximum value of the largest problem size dimension. 
*/ const int upperLimit_; + /** The step size between each problem size dimension. */ + const int step_; + /** Whether the CPU kernels should be run. */ const bool doCPU_ = true; @@ -620,12 +606,12 @@ class doGemm { #if CPU_ENABLED /** The GEMM CPU kernel. */ - cpu::gemm_cpu gemmCpu_; + cpu::gemm_cpu cpu_; #endif #if GPU_ENABLED /** The GEMM GPU kernel. */ - gpu::gemm_gpu gemmGpu_; + gpu::gemm_gpu gpu_; #endif /** The point at which offloading to GPU (offload once) becomes worthwhile. */ diff --git a/include/doGemv.hh b/include/doGemv.hh index b86aad6..c5edcd1 100644 --- a/include/doGemv.hh +++ b/include/doGemv.hh @@ -32,21 +32,22 @@ template class doGemv { public: doGemv(const std::string csvDir, const int iters, const int startDim, - const int upperLimit, const bool cpuEnabled = true, + const int upperLimit, const int step, const bool cpuEnabled = true, const bool gpuEnabled = true) : CSV_DIR(csvDir), iterations_(iters), startDimention_(startDim), upperLimit_(upperLimit), + step_(step), doCPU_(cpuEnabled), doGPU_(gpuEnabled) #if CPU_ENABLED , - gemvCpu_(iterations_) + cpu_(iterations_) #endif #if GPU_ENABLED , - gemvGpu_(iterations_) + gpu_(iterations_) #endif { static_assert((std::is_same_v || std::is_same_v) && @@ -66,7 +67,7 @@ class doGemv { prev_gpuResult_unified = time_checksum_gflop(); std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv"); - for (int dim = startDimention_; dim <= upperLimit_; dim++) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { // M = dim, N = dim; callKernels(csvFile, dim, dim); } @@ -94,8 +95,8 @@ class doGemv { int M = 16 * N; while (M <= upperLimit_) { callKernels(csvFile, M, N); - M += 16; - N++; + M += 16 * step_; + N += step_; } // Close file csvFile.close(); @@ -117,7 +118,7 @@ class doGemv { csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + "_tall-thin_vector_M_N=32.csv"); if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { // M = dim, N = 32; callKernels(csvFile, dim, 32); } @@ -145,8 +146,8 @@ class doGemv { N = 16 * M; while (N <= upperLimit_) { callKernels(csvFile, M, N); - M++; - N += 16; + M += step_; + N += 16 * step_; } // Close file csvFile.close(); @@ -168,7 +169,7 @@ class doGemv { csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + "_short-wide_vector_M=32_N.csv"); if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { // M = 32, N = dim; callKernels(csvFile, 32, dim); } @@ -190,55 +191,54 @@ class doGemv { const uint64_t flops = calcFlops(M, N); std::string kernelName = getKernelName(); - time_checksum_gflop cpuResult; - time_checksum_gflop gpuResult_once; - time_checksum_gflop gpuResult_always; - time_checksum_gflop gpuResult_unified; - // Perform CPU kernel #if CPU_ENABLED + time_checksum_gflop cpuResult; if (doCPU_) { - gemvCpu_.initialise(M, N); - cpuResult = gemvCpu_.compute(); + cpu_.initialise(M, N); + cpuResult = cpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); // Write result to CSV file - writeLineToCsv(csvFile, "cpu", kernelName, M, N, 0, probSize, iterations_, - cpuResult.runtime, cpuResult.gflops); + writeLineToCsv(csvFile, "cpu", kernelName, M, N, 0, probSize, 0.0, + iterations_, cpuResult.runtime, cpuResult.gflops); } #endif // Perform the GPU kernels #if GPU_ENABLED + time_checksum_gflop gpuResult_once; + 
time_checksum_gflop gpuResult_always; + time_checksum_gflop gpuResult_unified; if (doGPU_) { // - ONCE : Offload to/from GPU once before all iterations and once // after - gemvGpu_.initialise(gpuOffloadType::once, M, N); - gpuResult_once = gemvGpu_.compute(); + gpu_.initialise(gpuOffloadType::once, M, N); + gpuResult_once = gpu_.compute(); gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); // - ALWAYS: Offload to/from GPU every iteration - gemvGpu_.initialise(gpuOffloadType::always, M, N); - gpuResult_always = gemvGpu_.compute(); + gpu_.initialise(gpuOffloadType::always, M, N); + gpuResult_always = gpu_.compute(); gpuResult_always.gflops = calcGflops(flops, iterations_, gpuResult_always.runtime); // - UNIFIED : data passed from host to device (and device to host) as // needed - gemvGpu_.initialise(gpuOffloadType::unified, M, N); - gpuResult_unified = gemvGpu_.compute(); + gpu_.initialise(gpuOffloadType::unified, M, N); + gpuResult_unified = gpu_.compute(); gpuResult_unified.gflops = calcGflops(flops, iterations_, gpuResult_unified.runtime); // Write results to CSV file writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, 0, probSize, - iterations_, gpuResult_once.runtime, + 0.0, iterations_, gpuResult_once.runtime, gpuResult_once.gflops); writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, 0, - probSize, iterations_, gpuResult_always.runtime, + probSize, 0.0, iterations_, gpuResult_always.runtime, gpuResult_always.gflops); writeLineToCsv(csvFile, "gpu_unified", kernelName, M, N, 0, probSize, - iterations_, gpuResult_unified.runtime, + 0.0, iterations_, gpuResult_unified.runtime, gpuResult_unified.gflops); } #endif @@ -488,6 +488,9 @@ class doGemv { /** The maximum value of the largest problem size dimention. */ const int upperLimit_; + /** The step size between each problem size dimension. */ + const int step_; + /** Whether the CPU kernels should be run. */ const bool doCPU_ = true; @@ -496,12 +499,12 @@ class doGemv { #if CPU_ENABLED /** The GEMV CPU kernel. */ - cpu::gemv_cpu gemvCpu_; + cpu::gemv_cpu cpu_; #endif #if GPU_ENABLED /** The GEMV GPU kernel. */ - gpu::gemv_gpu gemvGpu_; + gpu::gemv_gpu gpu_; #endif /** The point at which offloading to GPU (offload once) becomes worthwhile. */ diff --git a/include/doSpmdnm.hh b/include/doSpmdnm.hh new file mode 100644 index 0000000..89ab8b1 --- /dev/null +++ b/include/doSpmdnm.hh @@ -0,0 +1,645 @@ +#pragma once +#include +#include +#include + +#include "helpers.hh" +#include "tablePrinter.hh" +#include "utilities.hh" + +#if defined CPU_ARMPL +#include "../ArmPL/spmdnm.hh" +#elif defined CPU_ONEMKL +#include "../oneMKL/CPU/spmdnm.hh" +#elif defined CPU_AOCL +#include "../AOCL/spmdnm.hh" +#endif + +#if defined GPU_CUBLAS +#include "../cuBLAS/spmdnm.hh" +#elif defined GPU_ONEMKL +#include "../oneMKL/GPU/spmdnm.hh" +#elif defined GPU_ROCBLAS +#include "../rocBLAS/spmdnm.hh" +#endif + + +/** +* 'T represents the type of the sparse GEMM kernel that will be run. 
E.g., + * T=float is for SSpMDnM +*/ +template +class doSpmdnm { +public: + doSpmdnm(const std::string csvDir, const int iters, const int startDim, + const int upperLimit, const int step, const double sparsity, const matrixType type, + const bool cpuEnabled = true, const bool gpuEnabled = true) + : CSV_DIR(csvDir), + iterations_(iters), + startDimention_(startDim), + upperLimit_(upperLimit), + step_(step), + sparsity_(sparsity), + type_(type), + doCPU_(cpuEnabled), + doGPU_(gpuEnabled) +#if CPU_ENABLED + , + cpu_(iterations_) +#endif +#if GPU_ENABLED + , + gpu_(iterations_) +#endif + { + static_assert((std::is_same_v || std::is_same_v) && + "ERROR - doSpmdnm can only be constructed using one of the " + "following types: [float, double]."); + } + + void collectData() { + // Square Problem Sizes... + // Re-initialise offload threshold structures + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_square_M=N=K.csv"); + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = dim, K = dim; + callKernels(csvFile, dim, dim, dim); + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Square (M=N=K)"); + } +#endif + + // Rectangular Problem Sizes: + // Tall and thin x Short and wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_short-wide_M=N_M=16K.csv"); + int K = startDimention_; + int M = 16 * K; + int N = 16 * K; + while (M <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += 16 * step_; + N += 16 * step_; + K += step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, M=16K)"); + } +#endif + + // Tall and thin x Short and wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_short-wide_M=N_K=32.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = dim, K = 32; + callKernels(csvFile, dim, dim, 32); + } + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, K=32)"); + } +#endif + + // Short and wide x Tall and thin + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = 
cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_tall-thin_M=N_K=16M.csv"); + M = startDimention_; + N = startDimention_; + K = 16 * M; + while (K <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += step_; + N += step_; + K += 16 * step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N, K=16M)"); + } +#endif + + // Short and wide x Tall and thin + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_tall-thin_M=N=32_K.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = 32, N = 32, K = dim; + callKernels(csvFile, 32, 32, dim); + } + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N=32, K)"); + } +#endif + + // Tall and Thin x Square + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_square_K=N_M=16K.csv"); + K = startDimention_; + N = startDimention_; + M = 16 * K; + while (M <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += 16 * step_; + N += step_; + K += step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Square (K=N, M=16K)"); + } +#endif + + // Tall and Thin x Square + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_square_K=N=32_M.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = 32, K = 32; + callKernels(csvFile, dim, 32, 32); + } + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Square (M, K=N=32)"); + } +#endif + + // Square x Short and Wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = 
time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_short-wide_M=K_N=16K.csv"); + M = startDimention_; + K = startDimention_; + N = 16 * K; + while (N <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += step_; + N += 16 * step_; + K += step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)"); + } +#endif + // Square x Short and Wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_short-wide_M=K=32_N.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = 32, N = dim, K = 32; + callKernels(csvFile, 32, dim, 32); + } + } +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)"); + } +#endif + // Close file + csvFile.close(); + } + +private: + /** Call the appropriate CPU and GPU GEMM kernels. */ + void callKernels(std::ofstream& csvFile, const int M, const int N, + const int K) { + const double probSize = calcKib(M, N, K); + const uint64_t flops = calcFlops(M, N, K); + std::string kernelName = getKernelName(); + +// Perform CPU kernel +#if CPU_ENABLED + time_checksum_gflop cpuResult; + if (doCPU_) { + cpu_.initialise(M, N, K, sparsity_, type_); + cpuResult = cpu_.compute(); + cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); + // Write result to CSV file + writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize, + sparsity_, iterations_, cpuResult.runtime, cpuResult + .gflops); + } +#endif + +// Perform the GPU kernels +#if GPU_ENABLED + time_checksum_gflop gpuResult_once; + time_checksum_gflop gpuResult_always; + time_checksum_gflop gpuResult_unified; + /* + * We run three different offload types: + * - ALWAYS: Offload to/from GPU every iteration + * - ONCE : Offload to/from GPU once before all iterations and once after + * - UNIFIED : data passed from host to device (and device to host) as needed + * THE ORDER OF THESE IS IMPORTANT -- To reduce time spent generating matrices, we + * generate once during the ALWAYS offload, and then re-use the same matrices for + * the ONCE and UNIFIED offload tests. Deleting them after UNIFIED. Therefore, + * changing the order here will require this logic within the spmm GPU classes to + * be updated. 
+ */ + if (doGPU_) { + // - ALWAYS: Offload to/from GPU every iteration + gpu_.initialise(gpuOffloadType::always, M, N, K, sparsity_, type_); + gpuResult_always = gpu_.compute(); + gpuResult_always.gflops = calcGflops(flops, iterations_, gpuResult_always.runtime); + writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, K, + probSize, sparsity_, iterations_, gpuResult_always.runtime, + gpuResult_always.gflops); + + // - ONCE : Offload to/from GPU once before all iterations and once + // after + gpu_.initialise(gpuOffloadType::once, M, N, K, sparsity_, type_); + gpuResult_once = gpu_.compute(); + gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); + writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, K, probSize, + sparsity_, iterations_, gpuResult_once.runtime, + gpuResult_once.gflops); + + // - UNIFIED : data passed from host to device (and device to host) as + // needed + gpu_.initialise(gpuOffloadType::unified, M, N, K, sparsity_, type_); + gpuResult_unified = gpu_.compute(); + gpuResult_unified.gflops = calcGflops(flops, iterations_, gpuResult_unified.runtime); + writeLineToCsv(csvFile, "gpu_unified", kernelName, M, N, K, probSize, + sparsity_, iterations_, gpuResult_unified.runtime, + gpuResult_unified.gflops); + } +#endif + +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Make sure all checksums match if CPU and GPU kernels are run. + // - The majority of BLAS Libraries guarentee the same result if a + // function + // is called multiple times. Given all input matrices are identical for + // each GPU offload type, we need only to compare the CPU and GPU + // checksums. + checkChecksums(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified, M, N, K); + + // Check if offload structs should be reset + checkOffloadStructReset(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified); + + // Check if offload threshold has been achieved for each GPU offload type. + updateOffloadStructs(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified, M, N, K, probSize); + + // Update previous results + prev_gpuResult_once = gpuResult_once; + prev_gpuResult_always = gpuResult_always; + prev_gpuResult_unified = gpuResult_unified; + } +#endif + } + + + /** Ensure all CPU and GPU checksums are within the permitted limit of + * eachother. 
*/ + void checkChecksums(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified, + const int M, const int N, const int K) { + // Ensure that each checksum difference is less than 0.1% + double hundredOverChecksum = 100 / std::fabs(cpuResult.checksum); + if (((std::fabs(cpuResult.checksum - gpuResult_once.checksum) * + hundredOverChecksum)) > 0.1 && + ((std::fabs(cpuResult.checksum - gpuResult_always.checksum) * + hundredOverChecksum)) > 0.1 && + ((std::fabs(cpuResult.checksum - gpuResult_unified.checksum) * + hundredOverChecksum)) > 0.1) { + std::cerr << "ERROR - " << getKernelName() << " kernel checksums do not match:\n\tInput " + "dimensions: M=" << M << ", N=" << N << ", K=" << K << std::endl; + std::cerr << std::setprecision(10) << "\tCPU Checksum = " << cpuResult.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Once) Checksum = " << gpuResult_once.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Always) Checksum = " << gpuResult_always.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Unified) Checksum = " << gpuResult_unified.checksum << std::endl; + exit(1); + } + } + + /** Check whether the offload structures need to be reset; and doing so if + * required. + * - If CPU.gflops >= GPU.gflops for last two problem sizes, then reset + * offload structures as GPU may not necessarily have reached the offload + * threshold. */ + void checkOffloadStructReset(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified) { + if ((cpuGpu_once_.M != 0) && (cpuResult.gflops >= gpuResult_once.gflops) && + (cpuResult.gflops >= prev_gpuResult_once.gflops)) { + cpuGpu_once_.cpuGflops = 0.0; + cpuGpu_once_.gpuGflops = 0.0; + cpuGpu_once_.probSize_kib = 0.0; + cpuGpu_once_.M = 0; + cpuGpu_once_.N = 0; + cpuGpu_once_.K = 0; + } + if ((cpuGpu_always_.M != 0) && + (cpuResult.gflops >= gpuResult_always.gflops) && + (cpuResult.gflops >= prev_gpuResult_always.gflops)) { + cpuGpu_always_.cpuGflops = 0.0; + cpuGpu_always_.gpuGflops = 0.0; + cpuGpu_always_.probSize_kib = 0.0; + cpuGpu_always_.M = 0; + cpuGpu_always_.N = 0; + cpuGpu_always_.K = 0; + } + if ((cpuGpu_unified_.M != 0) && + (cpuResult.gflops >= gpuResult_unified.gflops) && + (cpuResult.gflops >= prev_gpuResult_unified.gflops)) { + cpuGpu_unified_.cpuGflops = 0.0; + cpuGpu_unified_.gpuGflops = 0.0; + cpuGpu_unified_.probSize_kib = 0.0; + cpuGpu_unified_.M = 0; + cpuGpu_unified_.N = 0; + cpuGpu_unified_.K = 0; + } + } + + /** Update the offload threshold structs if GPU.gflops > CPU.gflops. 
*/ + void updateOffloadStructs(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified, const int M, + const int N, const int K, const double probSize) { + if ((cpuGpu_once_.M == 0) && cpuResult.gflops < gpuResult_once.gflops) { + cpuGpu_once_.cpuGflops = cpuResult.gflops; + cpuGpu_once_.gpuGflops = gpuResult_once.gflops; + cpuGpu_once_.probSize_kib = probSize; + cpuGpu_once_.M = M; + cpuGpu_once_.N = N; + cpuGpu_once_.K = K; + } + if ((cpuGpu_always_.M == 0) && cpuResult.gflops < gpuResult_always.gflops) { + cpuGpu_always_.cpuGflops = cpuResult.gflops; + cpuGpu_always_.gpuGflops = gpuResult_always.gflops; + cpuGpu_always_.probSize_kib = probSize; + cpuGpu_always_.M = M; + cpuGpu_always_.N = N; + cpuGpu_always_.K = K; + } + if ((cpuGpu_unified_.M == 0) && + cpuResult.gflops < gpuResult_unified.gflops) { + cpuGpu_unified_.cpuGflops = cpuResult.gflops; + cpuGpu_unified_.gpuGflops = gpuResult_unified.gflops; + cpuGpu_unified_.probSize_kib = probSize; + cpuGpu_unified_.M = M; + cpuGpu_unified_.N = N; + cpuGpu_unified_.K = K; + } + } + + /** A function for calculating FLOPs performed by a SpMDnM. + * C = alpha*AB + beta*C */ + constexpr uint64_t calcFlops(const int M, const int N, const int K) const { + // Sparse Matrix x Dense Matrix is just a series of SpMDnV - one for each column + uint64_t NNZ = (uint64_t)((double)M * (double)K * (1.0 - sparsity_)); + return 2 * NNZ * N; + } + + /** A function for calculating the total GEMM problem size in KiB. + Uses a single CSR format matrix: (M+1) + 2NNZ; and two dense matrices */ + constexpr double calcKib(const int M, const int N, const int K) const { + uint64_t NNZ = 1 + (uint64_t)((double)M * (double)K * (1.0 - sparsity_)); + uint64_t M_ = (uint64_t)M, N_ = (uint64_t)N, K_ = (uint64_t)K; + uint64_t probSize = (M_ + 1) + (2 * NNZ) + (K_ * N_) + (M_ * N_); + return ((double)(probSize * (sizeof(T))) / 1024); + } + + /** Get the name of the kernel being run. */ + std::string getKernelName() const { + switch (sizeof(T)) { + case 4: + return "sspmdnm"; + case 8: + return "dspmdnm"; + default: + return "unknown"; + } + } + + /** Print to stdout the offload thresholds. */ + void printOffloadThreshold(const std::string& problemName) const { + std::vector header = { + "Device", "M", "N", "K", "Total Prob. 
Size (KiB)", + "GFLOP/s", "CPU GFLOP/s"}; + + std::vector> rows; + // Initialise GPU_Once row + std::stringstream probSize_o; + std::stringstream gpuGflops_o; + std::stringstream cpuGflops_o; + probSize_o << std::fixed << std::setprecision(2) << cpuGpu_once_.probSize_kib; + gpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.gpuGflops; + cpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.cpuGflops; + if (cpuGpu_once_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Offload Once)", std::to_string(0), + std::to_string(0), std::to_string(0), probSize_o.str(), + "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Offload Once)", std::to_string(cpuGpu_once_.M), + std::to_string(cpuGpu_once_.N), + std::to_string(cpuGpu_once_.K), probSize_o.str(), + gpuGflops_o.str(), cpuGflops_o.str()}); + } + + // Initialise GPU_always row + std::stringstream probSize_a; + std::stringstream gpuGflops_a; + std::stringstream cpuGflops_a; + probSize_a << std::fixed << std::setprecision(2) << cpuGpu_always_.probSize_kib; + gpuGflops_a << std::fixed << std::setprecision(2) << cpuGpu_always_.gpuGflops; + cpuGflops_a << std::fixed << std::setprecision(2) << cpuGpu_always_.cpuGflops; + if (cpuGpu_always_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Offload Always)", std::to_string(0), + std::to_string(0), std::to_string(0), probSize_a.str(), + "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Offload Always)", std::to_string(cpuGpu_always_.M), + std::to_string(cpuGpu_always_.N), + std::to_string(cpuGpu_always_.K), probSize_a.str(), + gpuGflops_a.str(), cpuGflops_a.str()}); + } + + // Initialise GPU_unified row + std::stringstream probSize_u; + std::stringstream gpuGflops_u; + std::stringstream cpuGflops_u; + probSize_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.probSize_kib; + gpuGflops_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.gpuGflops; + cpuGflops_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.cpuGflops; + if (cpuGpu_unified_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Unified Memory)", std::to_string(0), + std::to_string(0), std::to_string(0), probSize_u.str(), + "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Unified Memory)", std::to_string(cpuGpu_unified_.M), + std::to_string(cpuGpu_unified_.N), + std::to_string(cpuGpu_unified_.K), probSize_u.str(), + gpuGflops_u.str(), cpuGflops_u.str()}); + } + + // Print table + tablePrinter tPrinter( + problemName + " Problem Domian GPU Offload Thresholds:", header, rows); + tPrinter.print(1); + } + + /** The output directory where CSV files should be saved to. */ + const std::string CSV_DIR; + + /** The number of iterations to perform per problem size. */ + const int iterations_; + + /** The value of the first problem size dimension run. */ + const int startDimention_; + + /** The maximum value of the largest problem size dimension. */ + const int upperLimit_; + + /** The step size between each problem size dimension. */ + const int step_; + + /** The sparsity value of the sparse matrix. */ + const double sparsity_; + + const matrixType type_ = matrixType::rmat; + + /** Whether the CPU kernels should be run. */ + const bool doCPU_ = true; + + /** Whether the GPU kernels should be run. */ + const bool doGPU_ = true; + + +#if CPU_ENABLED + /** The SpMDnM CPU kernel. */ + cpu::spmdnm_cpu cpu_; +#endif + +#if GPU_ENABLED + /** The SpMDnM GPU kernel. 
*/ + gpu::spmdnm_gpu gpu_; +#endif + + /** The point at which offloading to GPU (offload once) becomes worthwhile. */ + cpuGpu_offloadThreshold cpuGpu_once_; + + /** The point at which offloading to GPU (offload always) becomes worthwhile. + */ + cpuGpu_offloadThreshold cpuGpu_always_; + + /** The point at which offloading to GPU (unified memory) becomes worthwhile. + */ + cpuGpu_offloadThreshold cpuGpu_unified_; + + /** The previous problem size's GPU (offload once) performance results. */ + time_checksum_gflop prev_gpuResult_once; + + /** The previous problem size's GPU (offload always) performance results. */ + time_checksum_gflop prev_gpuResult_always; + + /** The previous problem size's GPU (unified memory) performance results. */ + time_checksum_gflop prev_gpuResult_unified; +}; \ No newline at end of file diff --git a/include/doSpmdnv.hh b/include/doSpmdnv.hh new file mode 100644 index 0000000..2871c88 --- /dev/null +++ b/include/doSpmdnv.hh @@ -0,0 +1,522 @@ +#pragma once +#include +#include + +#include "helpers.hh" +#include "tablePrinter.hh" +#include "utilities.hh" + +#if defined CPU_ARMPL +#include "../ArmPL/spmdnv.hh" +#elif defined CPU_ONEMKL +#include "../oneMKL/CPU/spmdnv.hh" +#elif defined CPU_AOCL +#include "../AOCL/spmdnv.hh" +#elif defined CPU_NVPL +// Todo #include "../NVPL/spmdnv.hh" +#endif + +#if defined GPU_CUBLAS +#include "../cuBLAS/spmdnv.hh" +#elif defined GPU_ONEMKL +#include "../oneMKL/GPU/spmdnv.hh" +#elif defined GPU_ROCBLAS +#include "../rocBLAS/spmdnv.hh" +#endif + +/** `T` represents the type of kernel that will be run - i.e. T=float is for + * SSpMDnV. */ +template +class doSpmdnv { +public: + doSpmdnv(const std::string csvDir, const int iters, const int startDim, + const int upperLimit, const int step, const double sparsity, const matrixType type, + const bool cpuEnabled =true, const bool gpuEnabled = true) + : CSV_DIR(csvDir), + iterations_(iters), + startDimention_(startDim), + upperLimit_(upperLimit), + step_(step), + sparsity_(sparsity), + type_(type), + doCPU_(cpuEnabled), + doGPU_(gpuEnabled) +#if CPU_ENABLED + , + cpu_(iterations_) +#endif +#if GPU_ENABLED + , + gpu_(iterations_) +#endif + { + static_assert((std::is_same_v || std::is_same_v) && + "ERROR - doSpMDnV can only be constructed using one of the " + "following types: [float, double]."); + } + + /** Run all problem types and write data to CSV files. */ + void collectData() { + // Square Problem Sizes... 
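+  /* Overview of the sweeps below: each block re-initialises the offload
+   * threshold structs, runs one family of problem shapes (square M=N,
+   * tall-and-thin M=16N, M with N=32, short-and-wide N=16M, M=32 with N),
+   * writes one CSV per family and, when both CPU and GPU are enabled,
+   * prints an offload-threshold table. A caller (argument values are
+   * illustrative only) would look like:
+   *   doSpmdnv<double> spmdnv("csv_out", 10, 64, 8192, 64, 0.99,
+   *                           matrixType::rmat);
+   *   spmdnv.collectData();
+   */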
+ // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + std::ofstream csvFile = + initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv"); + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = dim; + callKernels(csvFile, dim, dim); + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Vector (M=N)"); + } +#endif + + // Rectangular Problem Sizes: + // Tall and thin x Vector + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_vector_M=16N.csv"); + int N = startDimention_; + int M = 16 * N; + while (M <= upperLimit_) { + callKernels(csvFile, M, N); + M += 16 * step_; + N += step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Vector (M=16N)"); + } +#endif + + // Tall and thin x Vector + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_vector_M_N=32.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = 32; + callKernels(csvFile, dim, 32); + } + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Vector (M, N=32)"); + } +#endif + + // Short and wide x Vector + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_vector_N=16M.csv"); + M = startDimention_; + N = 16 * M; + while (N <= upperLimit_) { + callKernels(csvFile, M, N); + M += step_; + N += 16 * step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Vector (N=16M)"); + } +#endif + + // Short and wide x Vector + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + 
prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_vector_M=32_N.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = 32, N = dim; + callKernels(csvFile, 32, dim); + } + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Vector (M=32, N)"); + } +#endif + } + +private: + /** Call the appropriate CPU and GPU SPGEMV kernels. */ + void callKernels(std::ofstream& csvFile, const int M, const int N) { + const double probSize = calcKib(M, N, sparsity_); + const uint64_t flops = calcFlops(M, N, sparsity_); + std::string kernelName = getKernelName(); + +// Perform CPU kernel +#if CPU_ENABLED + time_checksum_gflop cpuResult; + if (doCPU_) { + cpu_.initialise(M, N, sparsity_, type_); + cpuResult = cpu_.compute(); + cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); + // Write result to CSV file + writeLineToCsv(csvFile, "cpu", kernelName, M, N, 0, probSize, sparsity_, + iterations_, cpuResult.runtime, cpuResult.gflops); + } +#endif + +// Perform the GPU kernels +#if GPU_ENABLED + time_checksum_gflop gpuResult_always; + time_checksum_gflop gpuResult_once; + time_checksum_gflop gpuResult_unified; + /* + * We run three different offload types: + * - ALWAYS: Offload to/from GPU every iteration + * - ONCE : Offload to/from GPU once before all iterations and once after + * - UNIFIED : data passed from host to device (and device to host) as needed + * THE ORDER OF THESE IS IMPORTANT -- To reduce time spent generating matrices, we + * generate once during the ALWAYS offload, and then re-use the same matrices for + * the ONCE and UNIFIED offload tests. Deleting them after UNIFIED. Therefore, + * changing the order here will require this logic within the spmm GPU classes to + * be updated. + */ + if (doGPU_) { + // - ALWAYS: Offload to/from GPU every iteration + gpu_.initialise(gpuOffloadType::always, M, N, sparsity_, type_); + gpuResult_always = gpu_.compute(); + gpuResult_always.gflops = + calcGflops(flops, iterations_, gpuResult_always.runtime); + writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, 0, + probSize, sparsity_, iterations_, gpuResult_always.runtime, + gpuResult_always.gflops); + + // - ONCE : Offload to/from GPU once before all iterations and once + // after + gpu_.initialise(gpuOffloadType::once, M, N, sparsity_, type_); + gpuResult_once = gpu_.compute(); + gpuResult_once.gflops = + calcGflops(flops, iterations_, gpuResult_once.runtime); + writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, 0, probSize, + sparsity_, iterations_, gpuResult_once.runtime, + gpuResult_once.gflops); + + // - UNIFIED : data passed from host to device (and device to host) as + // needed + gpu_.initialise(gpuOffloadType::unified, M, N, sparsity_, type_); + gpuResult_unified = gpu_.compute(); + gpuResult_unified.gflops = + calcGflops(flops, iterations_, gpuResult_unified.runtime); + writeLineToCsv(csvFile, "gpu_unified", kernelName, M, N, 0, probSize, + sparsity_, iterations_, gpuResult_unified.runtime, + gpuResult_unified.gflops); + } +#endif + +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Make sure all checksums match if CPU and GPU kernels are run. + // - The majority of BLAS Libraries guarentee the same result if a + // function + // is called multiple times. 
Given all input matrices are identical for + // each GPU offload type, we need only to compare the CPU and GPU + // checksums. + checkChecksums(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified, M, N); + + // Check if offload structs should be reset + checkOffloadStructReset(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified); + + // Check if offload threshold has been achieved for each GPU offload type. + updateOffloadStructs(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified, M, N, probSize); + + // Update previous results + prev_gpuResult_once = gpuResult_once; + prev_gpuResult_always = gpuResult_always; + prev_gpuResult_unified = gpuResult_unified; + } +#endif + } + + /** Todo -- find a sensible way to do this for sparse */ + void checkChecksums(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified, const int M, + const int N) { + // Ensure that each checksum difference is less than 0.1% + double hundredOverChecksum = 100 / std::fabs(cpuResult.checksum); + if (((std::fabs(cpuResult.checksum - gpuResult_once.checksum) * hundredOverChecksum)) > 0.1 && + ((std::fabs(cpuResult.checksum - gpuResult_always.checksum) * hundredOverChecksum)) > 0.1 && + ((std::fabs(cpuResult.checksum - gpuResult_unified.checksum) * hundredOverChecksum)) > 0.1) { + std::cerr << "ERROR - " << getKernelName() << " kernel checksums do not match:\n\tInput " + "dimensions: M=" << M << ", N=" << N << std::endl; + std::cerr << std::setprecision(10) << "\tCPU Checksum = " << cpuResult.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Once) Checksum = " << gpuResult_once.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Always) Checksum = " << gpuResult_always.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Unified) Checksum = " << gpuResult_unified.checksum << std::endl; + exit(1); + } + } + + + /** Check whether the offload structures need to be reset; and doing so if + * required. + * - If CPU.gflops >= GPU.gflops for last two problem sizes, then reset + * offload structures as GPU may not necessarily have reached the offload + * threshold. + */ + void checkOffloadStructReset(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified) { + if ((cpuGpu_once_.M != 0) && (cpuResult.gflops >= gpuResult_once.gflops) && + (cpuResult.gflops >= prev_gpuResult_once.gflops)) { + cpuGpu_once_.cpuGflops = 0.0; + cpuGpu_once_.gpuGflops = 0.0; + cpuGpu_once_.probSize_kib = 0.0; + cpuGpu_once_.M = 0; + cpuGpu_once_.N = 0; + } + if ((cpuGpu_always_.M != 0) && + (cpuResult.gflops >= gpuResult_always.gflops) && + (cpuResult.gflops >= prev_gpuResult_always.gflops)) { + cpuGpu_always_.cpuGflops = 0.0; + cpuGpu_always_.gpuGflops = 0.0; + cpuGpu_always_.probSize_kib = 0.0; + cpuGpu_always_.M = 0; + cpuGpu_always_.N = 0; + } + if ((cpuGpu_unified_.M != 0) && + (cpuResult.gflops >= gpuResult_unified.gflops) && + (cpuResult.gflops >= prev_gpuResult_unified.gflops)) { + cpuGpu_unified_.cpuGflops = 0.0; + cpuGpu_unified_.gpuGflops = 0.0; + cpuGpu_unified_.probSize_kib = 0.0; + cpuGpu_unified_.M = 0; + cpuGpu_unified_.N = 0; + } + } + + /** Update the offload threshold structs if GPU.gflops > CPU.gflops. 
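To make the 0.1% tolerance used in checkChecksums above concrete, here is the same
relative-difference test in isolation (a minimal sketch; relDiffPercent is an illustrative helper
name, not part of the benchmark):

    double relDiffPercent(double cpuChecksum, double gpuChecksum) {
      // |cpu - gpu| expressed as a percentage of |cpu|, exactly as computed via
      // hundredOverChecksum in checkChecksums.
      return std::fabs(cpuChecksum - gpuChecksum) * (100.0 / std::fabs(cpuChecksum));
    }
    // relDiffPercent(1000.0, 1000.5) == 0.05 -> within the 0.1% limit
    // relDiffPercent(1000.0, 1002.0) == 0.20 -> if all three GPU checksums drift this far from
    //                                           the CPU checksum, the benchmark reports an error
    //                                           and exits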
*/ + void updateOffloadStructs(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified, const int M, + const int N, const double probSize) { + if ((cpuGpu_once_.M == 0) && cpuResult.gflops < gpuResult_once.gflops) { + cpuGpu_once_.cpuGflops = cpuResult.gflops; + cpuGpu_once_.gpuGflops = gpuResult_once.gflops; + cpuGpu_once_.probSize_kib = probSize; + cpuGpu_once_.M = M; + cpuGpu_once_.N = N; + } + if ((cpuGpu_always_.M == 0) && cpuResult.gflops < gpuResult_always.gflops) { + cpuGpu_always_.cpuGflops = cpuResult.gflops; + cpuGpu_always_.gpuGflops = gpuResult_always.gflops; + cpuGpu_always_.probSize_kib = probSize; + cpuGpu_always_.M = M; + cpuGpu_always_.N = N; + } + if ((cpuGpu_unified_.M == 0) && + cpuResult.gflops < gpuResult_unified.gflops) { + cpuGpu_unified_.cpuGflops = cpuResult.gflops; + cpuGpu_unified_.gpuGflops = gpuResult_unified.gflops; + cpuGpu_unified_.probSize_kib = probSize; + cpuGpu_unified_.M = M; + cpuGpu_unified_.N = N; + } + } + + /** Todo -- work out how tis can be determined for a sparse problem with + * an unknown algorithm + * A function for calculating FLOPs performed by a GEMV. + * y = alpha*Ax + beta*y */ + constexpr uint64_t calcFlops(const int M, const int N, const double SPARSITY) const { + // There are two flops per non-zero element in the sparse matrix + uint64_t NNZ = 1 + (uint64_t)((double)M * (double)N * (1.0 - SPARSITY)); + return 2 * NNZ; + } + + /** A function for calculating the total GEMV problem size in KiB. */ + constexpr double calcKib(const int M, const int N, const double SPARSITY) const { + // Needs a CSR format matrix (one array of ints size m + 1 (row pointers), one array of ints size nnz (column indices), and one array of fps of size nnz (values)) + // Also needs two vectors x and y, of sizes n and m, respectively + uint64_t NNZ = 1 + (uint64_t)((double)M * (double)N * (1.0 - SPARSITY)); + uint64_t intSize = (M + 1) + NNZ; + uint64_t fpSize = NNZ + N + M; + return (((double)(fpSize * (sizeof(T))) + (double)(intSize * sizeof(int64_t)))/ 1024); + } + + /** Get the name of the kernel being run. */ + std::string getKernelName() const { + switch (sizeof(T)) { + case 4: + return "sspmdnv"; + case 8: + return "dspmdnv"; + default: + return "unknown"; + } + } + + /** Print to stdout the offload thresholds. */ + void printOffloadThreshold(std::string problemName) const { + std::vector header = { + "Device", "M", "N", "Total Prob. 
Size (KiB)", "GFLOP/s", "CPU GFLOP/s"}; + + std::vector> rows; + // Initialise GPU_Once row + std::stringstream probSize_o; + std::stringstream gpuGflops_o; + std::stringstream cpuGflops_o; + probSize_o << std::fixed << std::setprecision(2) << cpuGpu_once_.probSize_kib; + gpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.gpuGflops; + cpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.cpuGflops; + if (cpuGpu_once_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Offload Once)", std::to_string(0), + std::to_string(0), probSize_o.str(), "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Offload Once)", std::to_string(cpuGpu_once_.M), + std::to_string(cpuGpu_once_.N), probSize_o.str(), + gpuGflops_o.str(), cpuGflops_o.str()}); + } + + // Initialise GPU_always row + std::stringstream probSize_a; + std::stringstream gpuGflops_a; + std::stringstream cpuGflops_a; + probSize_a << std::fixed << std::setprecision(2) << cpuGpu_always_.probSize_kib; + gpuGflops_a << std::fixed << std::setprecision(2) << cpuGpu_always_.gpuGflops; + cpuGflops_a << std::fixed << std::setprecision(2) << cpuGpu_always_.cpuGflops; + if (cpuGpu_always_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Offload Always)", std::to_string(0), + std::to_string(0), probSize_a.str(), "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Offload Always)", std::to_string(cpuGpu_always_.M), + std::to_string(cpuGpu_always_.N), probSize_a.str(), + gpuGflops_a.str(), cpuGflops_a.str()}); + } + + // Initialise GPU_unified row + std::stringstream probSize_u; + std::stringstream gpuGflops_u; + std::stringstream cpuGflops_u; + probSize_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.probSize_kib; + gpuGflops_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.gpuGflops; + cpuGflops_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.cpuGflops; + if (cpuGpu_unified_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Unified Memory)", std::to_string(0), + std::to_string(0), probSize_u.str(), "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Unified Memory)", std::to_string(cpuGpu_unified_.M), + std::to_string(cpuGpu_unified_.N), probSize_u.str(), + gpuGflops_u.str(), cpuGflops_u.str()}); + } + + // Print table + tablePrinter tPrinter( + problemName + " Problem Domian GPU Offload Thresholds:", header, rows); + tPrinter.print(1); + } + + /** The output directory where CSV files should be saved to. */ + const std::string CSV_DIR; + + /** The number of iterations to perform per problem size. */ + const int iterations_; + + /** The value of the first probelm size dimention run. */ + const int startDimention_; + + /** The maximum value of the largest problem size dimention. */ + const int upperLimit_; + + /** The step size between each problem size dimension. */ + const int step_; + + /** The sparsity value of the sparse matrix. */ + const double sparsity_; + + const matrixType type_; + + /** Whether the CPU kernels should be run. */ + const bool doCPU_ = true; + + /** Whether the GPU kernels should be run. */ + const bool doGPU_ = true; + +#if CPU_ENABLED + /** The SpMDnV CPU kernel. */ + cpu::spmdnv_cpu cpu_; +#endif + +#if GPU_ENABLED + /** The SpMDnV GPU kernel. */ + gpu::spmdnv_gpu gpu_; +#endif + + /** The point at which offloading to GPU (offload once) becomes worthwhile. */ + cpuGpu_offloadThreshold cpuGpu_once_; + + /** The point at which offloading to GPU (offload always) becomes worthwhile. 
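A worked example of the calcFlops and calcKib estimates defined above, for double precision with
M = N = 10000 and sparsity = 0.999 (illustrative numbers only):

    NNZ     = 1 + (uint64_t)(10000.0 * 10000.0 * 0.001)  = 100001 non-zero elements
    flops   = 2 * NNZ                                     = 200002
    intSize = (M + 1) + NNZ = 10001 + 100001              = 110002 int64_t entries
    fpSize  = NNZ + N + M   = 100001 + 10000 + 10000      = 120001 doubles
    KiB     = (120001 * 8 + 110002 * 8) / 1024            = approx. 1797 KiB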
+ */ + cpuGpu_offloadThreshold cpuGpu_always_; + + /** The point at which offloading to GPU (unified memory) becomes worthwhile. + */ + cpuGpu_offloadThreshold cpuGpu_unified_; + + /** The previous problem size's GPU (offload once) performance results. */ + time_checksum_gflop prev_gpuResult_once; + + /** The previous problem size's GPU (offload always) performance results. */ + time_checksum_gflop prev_gpuResult_always; + + /** The previous problem size's GPU (unified memory) performance results. */ + time_checksum_gflop prev_gpuResult_unified; +}; \ No newline at end of file diff --git a/include/doSpmspm.hh b/include/doSpmspm.hh new file mode 100644 index 0000000..731c449 --- /dev/null +++ b/include/doSpmspm.hh @@ -0,0 +1,650 @@ +#pragma once +#include +#include +#include + +#include "helpers.hh" +#include "tablePrinter.hh" +#include "utilities.hh" + +#if defined CPU_ARMPL +#include "../ArmPL/spmspm.hh" +#elif defined CPU_ONEMKL +#include "../oneMKL/CPU/spmspm.hh" +#elif defined CPU_AOCL +#include "../AOCL/spmspm.hh" +#endif + +#if defined GPU_CUBLAS +#include "../cuBLAS/spmspm.hh" +#elif defined GPU_ONEMKL +#include "../oneMKL/GPU/spmspm.hh" +#elif defined GPU_ROCBLAS +#include "../rocBLAS/spmspm.hh" +#endif + +/** `T` represents the type of kernel that will be run - i.e. T=float is for + * SGEMM. */ +template +class doSpmspm { +public: + doSpmspm(const std::string csvDir, const int iters, const int startDim, + const int upperLimit, const int step, const double sparsity, const matrixType type, + const bool cpuEnabled = true, const bool gpuEnabled = true) + : CSV_DIR(csvDir), + iterations_(iters), + startDimention_(startDim), + upperLimit_(upperLimit), + step_(step), + sparsity_(sparsity), + type_(type), + doCPU_(cpuEnabled), + doGPU_(gpuEnabled) +#if CPU_ENABLED + , + cpu_(iterations_) +#endif +#if GPU_ENABLED + , + gpu_(iterations_) +#endif + { + static_assert((std::is_same_v || std::is_same_v) && + "ERROR - doSpmspm can only be constructed using one of the " + "following types: [float, double]."); + } + + /** Run all problem types and write data to CSV files. */ + void collectData() { + // Square Problem Sizes... 
+ // Re-initialise offload threshold structures + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_square_M=N=K.csv"); + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = dim, K = dim; + callKernels(csvFile, dim, dim, dim); + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Square (M=N=K)"); + } +#endif + + // Rectangular Problem Sizes: + // Tall and thin x Short and wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_short-wide_M=N_M=16K.csv"); + int K = startDimention_; + int M = 16 * K; + int N = 16 * K; + while (M <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += 16 * step_; + N += 16 * step_; + K += step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, M=16K)"); + } +#endif + + // Tall and thin x Short and wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_short-wide_M=N_K=32.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = dim, K = 32; + callKernels(csvFile, dim, dim, 32); + } + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, K=32)"); + } +#endif + + // Short and wide x Tall and thin + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_tall-thin_M=N_K=16M.csv"); + M = startDimention_; + N = startDimention_; + K = 16 * M; + while (K <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += step_; + N += step_; + K += 16 * step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N, K=16M)"); + } +#endif + + // Short and wide x Tall and thin + // Re-initialise offload threshold structures & previous results + 
cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_tall-thin_M=N=32_K.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = 32, N = 32, K = dim; + callKernels(csvFile, 32, 32, dim); + } + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N=32, K)"); + } +#endif + + // Tall and Thin x Square + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_square_K=N_M=16K.csv"); + K = startDimention_; + N = startDimention_; + M = 16 * K; + while (M <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += 16 * step_; + N += step_; + K += step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Square (K=N, M=16K)"); + } +#endif + + // Tall and Thin x Square + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_square_K=N=32_M.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = 32, K = 32; + callKernels(csvFile, dim, 32, 32); + } + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Square (M, K=N=32)"); + } +#endif + + // Square x Short and Wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_short-wide_M=K_N=16K.csv"); + M = startDimention_; + K = startDimention_; + N = 16 * K; + while (N <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += step_; + N += 16 * step_; + K += step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)"); + } +#endif + // Square x Short and Wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = 
cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_short-wide_M=K=32_N.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = 32, N = dim, K = 32; + callKernels(csvFile, 32, dim, 32); + } + } +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)"); + } +#endif + // Close file + csvFile.close(); + } + +private: + /** Ensure all CPU and GPU checksums are within the permitted limit of + * eachother. */ + void checkChecksums(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified, const int M, + const int N, const int K) { + // Ensure that each checksum difference is less than 0.1% + double hundredOverChecksum = 100 / std::fabs(cpuResult.checksum); + if (((std::fabs(cpuResult.checksum - gpuResult_once.checksum) * + hundredOverChecksum)) > 0.1 && + ((std::fabs(cpuResult.checksum - gpuResult_always.checksum) * + hundredOverChecksum)) > 0.1 && + ((std::fabs(cpuResult.checksum - gpuResult_unified.checksum) * + hundredOverChecksum)) > 0.1) { + std::cerr << "ERROR - " << getKernelName() << " kernel checksums do not match:\n\tInput " + "dimensions: M=" << M << ", N=" << N << ", K=" << K << std::endl; + std::cerr << std::setprecision(10) << "\tCPU Checksum = " << cpuResult.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Once) Checksum = " << gpuResult_once.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Always) Checksum = " << gpuResult_always.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Unified) Checksum = " << gpuResult_unified.checksum << std::endl; + exit(1); + } + } + + /** Check whether the offload structures need to be reset; and doing so if + * required. + * - If CPU.gflops >= GPU.gflops for last two problem sizes, then reset + * offload structures as GPU may not necessarily have reached the offload + * threshold. */ + void checkOffloadStructReset(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified) { + if ((cpuGpu_once_.M != 0) && (cpuResult.gflops >= gpuResult_once.gflops) && + (cpuResult.gflops >= prev_gpuResult_once.gflops)) { + cpuGpu_once_.cpuGflops = 0.0; + cpuGpu_once_.gpuGflops = 0.0; + cpuGpu_once_.probSize_kib = 0.0; + cpuGpu_once_.M = 0; + cpuGpu_once_.N = 0; + cpuGpu_once_.K = 0; + } + if ((cpuGpu_always_.M != 0) && + (cpuResult.gflops >= gpuResult_always.gflops) && + (cpuResult.gflops >= prev_gpuResult_always.gflops)) { + cpuGpu_always_.cpuGflops = 0.0; + cpuGpu_always_.gpuGflops = 0.0; + cpuGpu_always_.probSize_kib = 0.0; + cpuGpu_always_.M = 0; + cpuGpu_always_.N = 0; + cpuGpu_always_.K = 0; + } + if ((cpuGpu_unified_.M != 0) && + (cpuResult.gflops >= gpuResult_unified.gflops) && + (cpuResult.gflops >= prev_gpuResult_unified.gflops)) { + cpuGpu_unified_.cpuGflops = 0.0; + cpuGpu_unified_.gpuGflops = 0.0; + cpuGpu_unified_.probSize_kib = 0.0; + cpuGpu_unified_.M = 0; + cpuGpu_unified_.N = 0; + cpuGpu_unified_.K = 0; + } + } + + /** Update the offload threshold structs if GPU.gflops > CPU.gflops. 
*/ + void updateOffloadStructs(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified, const int M, + const int N, const int K, const double probSize) { + if ((cpuGpu_once_.M == 0) && cpuResult.gflops < gpuResult_once.gflops) { + cpuGpu_once_.cpuGflops = cpuResult.gflops; + cpuGpu_once_.gpuGflops = gpuResult_once.gflops; + cpuGpu_once_.probSize_kib = probSize; + cpuGpu_once_.M = M; + cpuGpu_once_.N = N; + cpuGpu_once_.K = K; + } + if ((cpuGpu_always_.M == 0) && cpuResult.gflops < gpuResult_always.gflops) { + cpuGpu_always_.cpuGflops = cpuResult.gflops; + cpuGpu_always_.gpuGflops = gpuResult_always.gflops; + cpuGpu_always_.probSize_kib = probSize; + cpuGpu_always_.M = M; + cpuGpu_always_.N = N; + cpuGpu_always_.K = K; + } + if ((cpuGpu_unified_.M == 0) && + cpuResult.gflops < gpuResult_unified.gflops) { + cpuGpu_unified_.cpuGflops = cpuResult.gflops; + cpuGpu_unified_.gpuGflops = gpuResult_unified.gflops; + cpuGpu_unified_.probSize_kib = probSize; + cpuGpu_unified_.M = M; + cpuGpu_unified_.N = N; + cpuGpu_unified_.K = K; + } + } + + void callKernels(std::ofstream& csvFile, const int N, const int M, + const int K) { + const double probSize = calcKib(N, N, N, sparsity_); + const uint64_t flops = calcFlops(N, N, N, sparsity_); + std::string kernelName = getKernelName(); + +#if CPU_ENABLED + time_checksum_gflop cpuResult; + if (doCPU_) { + cpu_.initialise(N, M, K, sparsity_, type_); + cpuResult = cpu_.compute(); + cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); + writeLineToCsv(csvFile, "cpu", kernelName, N, M, K, probSize, + sparsity_, iterations_, cpuResult.runtime, + cpuResult.gflops); + } +#endif +#if GPU_ENABLED + // Perform the GPU kernels + time_checksum_gflop gpuResult_always; + time_checksum_gflop gpuResult_once; + time_checksum_gflop gpuResult_unified; + /* + * We run three different offload types: + * - ALWAYS: Offload to/from GPU every iteration + * - ONCE : Offload to/from GPU once before all iterations and once after + * - UNIFIED : data passed from host to device (and device to host) as needed + * THE ORDER OF THESE IS IMPORTANT -- To reduce time spent generating matrices, we + * generate once during the ALWAYS offload, and then re-use the same matrices for + * the ONCE and UNIFIED offload tests. Deleting them after UNIFIED. Therefore, + * changing the order here will require this logic within the spmspm GPU classes to + * be updated. 
+ */ + if (doGPU_) { + // - ALWAYS: Offload to/from GPU every iteration + gpu_.initialise(gpuOffloadType::always, N, M, K, sparsity_, type_); + gpuResult_always = gpu_.compute(); + gpuResult_always.gflops = + calcGflops(flops, iterations_, gpuResult_always.runtime); + writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, M, K, + probSize, sparsity_, iterations_, gpuResult_always.runtime, + gpuResult_always.gflops); + + // - ONCE : Offload to/from GPU once before all iterations and once + // after + gpu_.initialise(gpuOffloadType::once, N, M, K, sparsity_, type_); + gpuResult_once = gpu_.compute(); + gpuResult_once.gflops = + calcGflops(flops, iterations_, gpuResult_once.runtime); + writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, M, K, probSize, + sparsity_, iterations_, gpuResult_once.runtime, + gpuResult_once.gflops); + + // - UNIFIED : data passed from host to device (and device to host) as + // needed + gpu_.initialise(gpuOffloadType::unified, N, M, K, sparsity_, type_); + gpuResult_unified = gpu_.compute(); + gpuResult_unified.gflops = + calcGflops(flops, iterations_, gpuResult_unified.runtime); + writeLineToCsv(csvFile, "gpu_unified", kernelName, N, M, K, probSize, + sparsity_, iterations_, gpuResult_unified.runtime, + gpuResult_unified.gflops); + } +#endif +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Check that all checksums are within the permitted limit + checkChecksums(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified, N, M, K); + // Check whether offload structs need to be reset + checkOffloadStructReset(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified); + // Update offload structs if required + updateOffloadStructs(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified, N, M, K, probSize); + // Update previous GPU results + prev_gpuResult_once = gpuResult_once; + prev_gpuResult_always = gpuResult_always; + prev_gpuResult_unified = gpuResult_unified; + } +#endif + } + + /** A function for calculating FLOPs performed by a GEMM. + * C = alpha*AB + beta*C */ + constexpr uint64_t calcFlops(const int M, const int N, const int K, const double SPARSITY) const { + // The number of scalar multiplications is nnz(Ak)*nnz(Bk) for each inner index k + // Therefore, the expectation is to have NNZA * NNZB / K, as each K index would + // on average have NNZA/K * NNZB/K. This assumes a uniform distribution of non-zero elements + uint64_t NNZA = 1 + (uint64_t)((double)M * (double)K * (1.0 - SPARSITY)); + uint64_t NNZB = 1 + (uint64_t)((double)K * (double)N * (1.0 - SPARSITY)); + return (NNZA * NNZB) / K; + } + + /** A function for calculating the total GEMM problem size in KiB. + Each matrix is stored in CSR, and so needs (nRows + 1) + 2NNZ space. + For A and B, this is easy, but for C we do not know its size ahead of time + (we know nRows but not NNZ). However, we can estimate the NNZ, on average. + + Each value of C is the sum of the products of the corresponding row of A + and column of B. + As each value of A and B has a probability of (1 - SPARSITY) if being non-zero, + the probability that both A and B are non-zero (and thus that the product is + non-zero)is (1 - SPARSITY)^2. + There are K products that are summed together. If any one of these products is + non-zero, so too shall the sum be. 
Therefore, the estimated sparsity of C is + (1 - (1 - SPARSITY)^2)^K + */ + constexpr double calcKib(const int M, const int N, const int K, const double SPARSITY) const { + uint64_t M_ = (uint64_t)M, K_ = (uint64_t)K; + uint64_t NNZA = 1 + (uint64_t)((double)M * (double)K * (1.0 - SPARSITY)); + uint64_t NNZB = 1 + (uint64_t)((double)K * (double)N * (1.0 - SPARSITY)); + double CSPARSITY = 1 - pow(pow(1.0 - SPARSITY, 2), K); + uint64_t NNZC = 1 + (uint64_t)((double)M * (double)N * CSPARSITY); + + uint64_t probSize = (M_ + 1) + (2 * NNZA) + (K_ + 1) + (2 * NNZB) + (M_ + 1) + (2 * NNZC); + return ((double)(probSize * (sizeof(T))) / 1024); + } + + /** Get the name of the kernel being run. */ + std::string getKernelName() const { + switch (sizeof(T)) { + case 4: + return "sspmspm"; + case 8: + return "dspmspm"; + default: + return "unknown"; + } + } + + /** Print to stdout the offload thresholds. */ + void printOffloadThreshold(const std::string& problemName) const { + std::vector header = { + "Device", "M", "N", "K", "Total Prob. Size (KiB)", + "GFLOP/s", "CPU GFLOP/s"}; + + std::vector> rows; + // Initialise GPU_Once row + std::stringstream probSize_o; + std::stringstream gpuGflops_o; + std::stringstream cpuGflops_o; + probSize_o << std::fixed << std::setprecision(2) << cpuGpu_once_.probSize_kib; + gpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.gpuGflops; + cpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.cpuGflops; + if (cpuGpu_once_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Offload Once)", std::to_string(0), + std::to_string(0), std::to_string(0), probSize_o.str(), + "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Offload Once)", std::to_string(cpuGpu_once_.M), + std::to_string(cpuGpu_once_.N), + std::to_string(cpuGpu_once_.K), probSize_o.str(), + gpuGflops_o.str(), cpuGflops_o.str()}); + } + + // Initialise GPU_always row + std::stringstream probSize_a; + std::stringstream gpuGflops_a; + std::stringstream cpuGflops_a; + probSize_a << std::fixed << std::setprecision(2) << cpuGpu_always_.probSize_kib; + gpuGflops_a << std::fixed << std::setprecision(2) << cpuGpu_always_.gpuGflops; + cpuGflops_a << std::fixed << std::setprecision(2) << cpuGpu_always_.cpuGflops; + if (cpuGpu_always_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Offload Always)", std::to_string(0), + std::to_string(0), std::to_string(0), probSize_a.str(), + "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Offload Always)", std::to_string(cpuGpu_always_.M), + std::to_string(cpuGpu_always_.N), + std::to_string(cpuGpu_always_.K), probSize_a.str(), + gpuGflops_a.str(), cpuGflops_a.str()}); + } + + // Initialise GPU_unified row + std::stringstream probSize_u; + std::stringstream gpuGflops_u; + std::stringstream cpuGflops_u; + probSize_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.probSize_kib; + gpuGflops_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.gpuGflops; + cpuGflops_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.cpuGflops; + if (cpuGpu_unified_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Unified Memory)", std::to_string(0), + std::to_string(0), std::to_string(0), probSize_u.str(), + "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Unified Memory)", std::to_string(cpuGpu_unified_.M), + std::to_string(cpuGpu_unified_.N), + std::to_string(cpuGpu_unified_.K), probSize_u.str(), + gpuGflops_u.str(), cpuGflops_u.str()}); + } + + // Print table + tablePrinter tPrinter( + problemName + " 
Problem Domian GPU Offload Thresholds:", header, rows); + tPrinter.print(1); + } + + /** The output directory where CSV files should be saved to. */ + const std::string CSV_DIR; + + /** The number of iterations to perform per problem size. */ + const int iterations_; + + /** The value of the first probelm size dimention run. */ + const int startDimention_; + + /** The maximum value of the largest problem size dimention. */ + const int upperLimit_; + + /** The step size between problem sizes. */ + const int step_; + + /** The sparsity value of the sparse matrices. */ + const double sparsity_; + + const matrixType type_; + + /** Whether the CPU kernels should be run. */ + const bool doCPU_ = true; + + /** Whether the GPU kernels should be run. */ + const bool doGPU_ = true; + +#if CPU_ENABLED + /** The CPU kernel. */ + cpu::spmspm_cpu cpu_; +#endif + +#if GPU_ENABLED + /** The GPU kernel. */ + gpu::spmspm_gpu gpu_; +#endif + + /** The point at which offloading to GPU (offload once) becomes worthwhile. */ + cpuGpu_offloadThreshold cpuGpu_once_; + + /** The point at which offloading to GPU (offload always) becomes worthwhile. + */ + cpuGpu_offloadThreshold cpuGpu_always_; + + /** The point at which offloading to GPU (unified memory) becomes worthwhile. + */ + cpuGpu_offloadThreshold cpuGpu_unified_; + + /** The previous problem size's GPU (offload once) performance results. */ + time_checksum_gflop prev_gpuResult_once; + + /** The previous problem size's GPU (offload always) performance results. */ + time_checksum_gflop prev_gpuResult_always; + + /** The previous problem size's GPU (unified memory) performance results. */ + time_checksum_gflop prev_gpuResult_unified; +}; \ No newline at end of file diff --git a/include/helpers.hh b/include/helpers.hh index 5618557..db8df69 100644 --- a/include/helpers.hh +++ b/include/helpers.hh @@ -17,9 +17,8 @@ std::ofstream initCSVFile(const std::string filename) { std::ofstream newFile(filename); - newFile << "Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total " - "Seconds,GFLOP/s" - << std::endl; + newFile << "Device,Kernel,M,N,K,Total Problem Size (KiB),sparsity,Iterations," + "Total Seconds,GFLOP/s" << std::endl; return newFile; } @@ -28,22 +27,17 @@ std::ofstream initCSVFile(const std::string filename) { * Function does not close the file. */ void writeLineToCsv(std::ofstream& file, const std::string device, const std::string kernel, const int M, const int N, - const int K, const double totalProbSize, const int iters, - const double totalTime, const double gflops) { + const int K, const double totalProbSize, const float + sparsity, const int iters, const double totalTime, + const double gflops) { if (!file.is_open()) { - std::cout << "ERROR - Attempted to write line to a closed CSV file." - << std::endl; + std::cout << "ERROR - Attempted to write line to a closed CSV file." << std::endl; exit(1); } - file << device << "," << kernel << "," << M << "," << N << "," << K << "," - << std::fixed << std::setprecision(3) << totalProbSize << "," << iters - << "," << std::fixed << std::setprecision(5) << totalTime << "," - << std::fixed << std::setprecision(3) << gflops << std::endl; + file << device << "," << kernel << "," << M << "," << N << "," << K << "," << std::fixed << std::setprecision(3) << totalProbSize << "," << std::fixed << std::setprecision(8) << sparsity << "," << iters << "," << std::fixed << std::setprecision(5) << totalTime << "," << std::fixed << std::setprecision(3) << gflops << std::endl; } /** Calculate average GFLOPs. 
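For example (illustrative numbers), calcGflops(2000000000, 10, 0.5) = (2e9 * 10 / 0.5) * 1e-9
= 40.0 GFLOP/s; a total elapsed time of exactly zero returns 0.0 rather than dividing by zero.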
*/ double calcGflops(const uint64_t flops, const int iters, const double seconds) { - return (seconds == 0.0 || seconds == INFINITY) - ? 0.0 - : ((double)(flops * iters) / seconds) * 1e-9; + return (seconds == 0.0) ? 0.0 : ((double)(flops * iters) / seconds) * 1e-9; } \ No newline at end of file diff --git a/include/kernels/.DS_Store b/include/kernels/.DS_Store new file mode 100644 index 0000000..9cc84b2 Binary files /dev/null and b/include/kernels/.DS_Store differ diff --git a/include/kernels/CPU/gemm.hh b/include/kernels/CPU/gemm.hh index 6b4c93e..6dd4786 100644 --- a/include/kernels/CPU/gemm.hh +++ b/include/kernels/CPU/gemm.hh @@ -7,7 +7,7 @@ namespace cpu { /** An abstract class for GEMM BLAS kernels. */ template class gemm : public ::gemm { - public: +public: using ::gemm::gemm; using ::gemm::initInputMatrices; using ::gemm::m_; @@ -17,7 +17,7 @@ class gemm : public ::gemm { using ::gemm::B_; using ::gemm::C_; - public: +public: /** Initialise the required data structures. */ void initialise(int m, int n, int k) { m_ = m; @@ -32,7 +32,7 @@ class gemm : public ::gemm { initInputMatrices(); } - private: +private: /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. */ void postCallKernelCleanup() override { diff --git a/include/kernels/CPU/spmdnm.hh b/include/kernels/CPU/spmdnm.hh new file mode 100644 index 0000000..b383fe7 --- /dev/null +++ b/include/kernels/CPU/spmdnm.hh @@ -0,0 +1,61 @@ +#pragma once + +#include "../spmdnm.hh" + +namespace cpu { + +/** + * An abstract class for sparse matrix-dense matrix BLAS kernels + */ +template +class spmdnm : public :: spmdnm { +public: + using ::spmdnm::spmdnm; + using ::spmdnm::initInputMatrices; + using ::spmdnm::iterations_; + using ::spmdnm::nnz_; + using ::spmdnm::sparsity_; + using ::spmdnm::type_; + using ::spmdnm::m_; + using ::spmdnm::n_; + using ::spmdnm::k_; + using ::spmdnm::B_; + using ::spmdnm::C_; + +public: + /** + * Initialise the required data structures. + */ + void initialise(int m, int n, int k, double sparsity, + matrixType type, bool binary = false) { + m_ = m; + n_ = n; + k_ = k; + sparsity_ = sparsity; + type_ = type; + + nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + + // Allocate memory for dense matrices + B_ = (T*)calloc(k_ * n_, sizeof(T)); + C_ = (T*)calloc(m_ * n_, sizeof(T)); + + // Check for allocation failures + if (!B_ || !C_) { + std::cerr << "ERROR: Memory allocation failed in spmdnm initialization" << std::endl; + exit(1); + } + + initInputMatrices(); + } + +private: + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + void postCallKernelCleanup() { + free(B_); + free(C_); + } +}; + +} \ No newline at end of file diff --git a/include/kernels/CPU/spmdnv.hh b/include/kernels/CPU/spmdnv.hh new file mode 100644 index 0000000..ab02207 --- /dev/null +++ b/include/kernels/CPU/spmdnv.hh @@ -0,0 +1,52 @@ +#pragma once + +#include "../spmdnv.hh" + +#include +#include + +namespace cpu { + +/** An abstract class for SpMDnV BLAS kernels. */ +template +class spmdnv : public ::spmdnv { +public: + using ::spmdnv::spmdnv; + using ::spmdnv::initInputMatrixVector; + using ::spmdnv::m_; + using ::spmdnv::n_; + using ::spmdnv::x_; + using ::spmdnv::y_; + using ::spmdnv::sparsity_; + using ::spmdnv::nnz_; + using ::spmdnv::type_; + +public: + /** Initialise the required data structures. 
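How the doSpmdnv driver exercises this interface (a sketch; spmdnv_cpu is the library-specific
subclass selected at build time, e.g. the ArmPL or AOCL one, and the values are illustrative):

    cpu::spmdnv_cpu<double> kernel(10);                        // 10 timed iterations
    kernel.initialise(8192, 8192, 0.999, matrixType::random);  // M, N, sparsity, matrix type
    time_checksum_gflop result = kernel.compute();             // runtime and checksum are filled in;
                                                               // the driver computes gflops afterwards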
*/ + void initialise(int m, int n, double sparsity, matrixType type) { + m_ = m; + n_ = n; + sparsity_ = sparsity; + type_ = type; + + // Note that the below should be the same as the edges calculation + // used in the initInputMatricesSparse function. If changed here, + // change there + nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_)); + + x_ = (T*)malloc(sizeof(T) * n_); + y_ = (T*)malloc(sizeof(T) * m_); + + // Initialise the matrix and vectors + initInputMatrixVector(); + } + +private: + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + void postCallKernelCleanup() { + free(x_); + free(y_); + } +}; +} // namespace cpu \ No newline at end of file diff --git a/include/kernels/CPU/spmspm.hh b/include/kernels/CPU/spmspm.hh new file mode 100644 index 0000000..e0bce32 --- /dev/null +++ b/include/kernels/CPU/spmspm.hh @@ -0,0 +1,53 @@ +#pragma once + +#include "../spmspm.hh" + +#include +#include +#include + +namespace cpu { + +/** An abstract class for sparse matrix-sparse matrix BLAS kernels. */ +template +class spmspm : public ::spmspm { +public: + using ::spmspm::spmspm; + using ::spmspm::initInputMatrices; + using ::spmspm::iterations_; + using ::spmspm::A_nnz_; + using ::spmspm::B_nnz_; + using ::spmspm::sparsity_; + using ::spmspm::type_; + using ::spmspm::m_; + using ::spmspm::n_; + using ::spmspm::k_; + using ::spmspm::C_rows_; + using ::spmspm::C_cols_; + using ::spmspm::C_vals_; + using ::spmspm::C_nnz_; + +public: + /** Initialise the required data structures. */ + void initialise(int n, int m, int k, double sparsity, + matrixType type, bool binary = false) { + n_ = n; + m_ = m; + k_ = k; + + sparsity_ = sparsity; + type_ = type; + + /** Determine the number of nnz elements in A and B */ + A_nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + B_nnz_ = 1 + (uint64_t)((double)k_ * (double)n_ * (1.0 - sparsity_)); + + initInputMatrices(); + } + +private: + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + void postCallKernelCleanup() {} +}; +} // namespace cpu diff --git a/include/kernels/GPU/spmdnm.hh b/include/kernels/GPU/spmdnm.hh new file mode 100644 index 0000000..b817280 --- /dev/null +++ b/include/kernels/GPU/spmdnm.hh @@ -0,0 +1,29 @@ +#pragma once + +#include "../spmdnm.hh" + +namespace gpu { + +/** An abstract class for sparse matrix-dense matrix BLAS kernels. */ + template + class spmdnm : public ::spmdnm { + public: + using ::spmdnm::spmdnm; + + /** Initialise the required data structures. + * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + virtual void initialise(gpuOffloadType offload, int m, int n, int k, + double sparsity, matrixType type, + bool binary = false) = 0; + + protected: + /** Whether data should be offloaded to/from the GPU each iteration, or just + * before & after. 
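A sketch of how the benchmark drives an implementation of this interface, once per offload
strategy (spmdnm_gpu is the vendor-specific subclass, e.g. the one under cuBLAS/; values are
illustrative):

    gpu::spmdnm_gpu<float> kernel(10);
    for (auto offload : {gpuOffloadType::always, gpuOffloadType::once, gpuOffloadType::unified}) {
      kernel.initialise(offload, 4096, 4096, 4096, 0.99, matrixType::rmat);
      time_checksum_gflop result = kernel.compute();
    }
    // The drivers run the ALWAYS case first so that the ONCE and UNIFIED cases can reuse the
    // matrices generated for it.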
*/ + gpuOffloadType offload_ = gpuOffloadType::always; + }; +} // namespace gpu \ No newline at end of file diff --git a/include/kernels/GPU/spmdnv.hh b/include/kernels/GPU/spmdnv.hh new file mode 100644 index 0000000..41e46ac --- /dev/null +++ b/include/kernels/GPU/spmdnv.hh @@ -0,0 +1,28 @@ +#pragma once + +#include "../spmdnv.hh" + +namespace gpu { + +/** An abstract class for SpMDnV BLAS kernels. */ + template + class spmdnv : public ::spmdnv { + public: + using ::spmdnv::spmdnv; + + /** Initialise the required data structures. + * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + virtual void initialise(gpuOffloadType offload, int m, int n, + double sparsity, matrixType type) = 0; + + protected: + /** Whether data should be offloaded to/from the GPU each iteration, or just + * before & after. */ + gpuOffloadType offload_ = gpuOffloadType::always; + }; +} // namespace gpu \ No newline at end of file diff --git a/include/kernels/GPU/spmspm.hh b/include/kernels/GPU/spmspm.hh new file mode 100644 index 0000000..e36d470 --- /dev/null +++ b/include/kernels/GPU/spmspm.hh @@ -0,0 +1,29 @@ +#pragma once + +#include "../spmspm.hh" + +namespace gpu { + +/** An abstract class for sparse matrix-sparse matrix BLAS kernels. */ +template +class spmspm : public ::spmspm { +public: + using ::spmspm::spmspm; + + /** Initialise the required data structures. + * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + virtual void initialise(gpuOffloadType offload, int m, int n, int k, + double sparsity, matrixType type, + bool binary = false) = 0; + +protected: + /** Whether data should be offloaded to/from the GPU each iteration, or just + * before & after.
*/ + gpuOffloadType offload_ = gpuOffloadType::always; +}; +} // namespace gpu \ No newline at end of file diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index 4eda90f..3f0aece 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -1,9 +1,15 @@ #pragma once +#ifdef CPU_ONEMKL +#include +#endif + #include #include #include #include +#include +#include #include "../utilities.hh" diff --git a/include/kernels/gemv.hh b/include/kernels/gemv.hh index ba12d02..a64b19c 100644 --- a/include/kernels/gemv.hh +++ b/include/kernels/gemv.hh @@ -4,6 +4,7 @@ #include #include #include +#include #include "../utilities.hh" @@ -82,6 +83,83 @@ class gemv { } } + void initInputMatrixVectorSparse() { + // Initialise sparse matrix + for (int i = 0; i < (n_ * n_); i++) { + A_[i] = 0.0; + } + + // Random number generator objects for use in descent + std::default_random_engine gen; + gen.seed(std::chrono::system_clock::now() + .time_since_epoch().count()); + std::uniform_real_distribution dist(0.0, 1.0); + + uint64_t edges = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - + sparsity_)); + + // Using a=0.45 and b=c=0.22 as default probabilities + for (uint64_t i = 0; i < edges; i++) { + while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, + false)) {} + } + + // Initialise the input and output vectors + for (int y = 0; y < n_; y++) { + x_[y] = (T)((double)(rand() % 100) / 3.0); + } + for (int y = 0; y < m_; y++) { + y_[y] = (T)0.0; + } + } + + /** Recursive function to populate sparse matrices */ + bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b, + float c, std::default_random_engine* gen, + std::uniform_real_distribution dist, bool bin) { + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + // Needed to avoid overfloe segfaults with large problem sizes + uint64_t index = (((uint64_t)y1 * (uint64_t)n) + (uint64_t)x1); + if (abs(M[index]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[index] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // ToDo -- add some noise to these values between iterations + float newA = a; + float newB = b; + float newC = c; + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); + } else { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, + gen, dist, bin); + } + } + return true; + } + /** Call the extern consume() function. */ void callConsume() { consume((void*)A_, (void*)x_, (void*)y_); } @@ -105,4 +183,6 @@ class gemv { /** The distance between two vector elements. 
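To illustrate the quadrant selection in rMat above: with the default probabilities a = 0.45 and
b = c = 0.22 (leaving an implicit d = 1 - a - b - c = 0.11), a uniform draw r in [0, 1) recurses
into the top-left quadrant when r < 0.45, the top-right when 0.45 <= r < 0.67, the bottom-left
when 0.67 <= r < 0.89, and the bottom-right otherwise. The recursion repeats on the chosen
quadrant until a 1x1 cell is reached; if that cell is already occupied the call returns false and
initInputMatrixVectorSparse retries until the requested number of edges has been placed.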
*/ const int vecIncrement_ = 1; + + double sparsity_ = 0.0; }; diff --git a/include/kernels/spmdnm.hh b/include/kernels/spmdnm.hh new file mode 100644 index 0000000..92ea1ad --- /dev/null +++ b/include/kernels/spmdnm.hh @@ -0,0 +1,117 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "../utilities.hh" + +/** +* A generic abstract class defining the operation of timing a sparse GEMM + * BLAS kernel for n iterations +*/ +template +class spmdnm { +public: + spmdnm(const int iters) : iterations_(iters) {} + + /** Call the kernel n times. Returns the time elapsed for all n calls + * in seconds */ + time_checksum_gflop compute() { + // Start the timer + std::chrono::time_point startTime = + std::chrono::high_resolution_clock::now(); + + // perform the SPMM calls + preLoopRequirements(); + for (int i = 0; i < iterations_; i++) { + callSpmdnm(); + } + postLoopRequirements(); + + // Stop the timer + std::chrono::time_point endTime = + std::chrono::high_resolution_clock::now(); + std::chrono::duration time_s = endTime - startTime; + + double checksum = calcChecksum(); + + postCallKernelCleanup(); + + return {time_s.count(), checksum, 0.0}; + } + + int64_t nnz_ = 0; + +private: + /** Performs the steps required before calling the SPMM kernel that + * should be timed */ + virtual void preLoopRequirements() = 0; + + /** Perform the sparse GEMM kernel. */ + virtual void callSpmdnm() = 0; + + /** Perform any steps required after calling the SPMM kernel that should + * be timed */ + virtual void postLoopRequirements() = 0; + + /** Do the necessary cleanup after the kernel has been finished that + * should not be timed */ + virtual void postCallKernelCleanup() = 0; + + /** Calculate a checksum from the result matrix C. */ + constexpr double calcChecksum() { + return 0; + // Checksum for GEMM calculated by summing all four corners of C together + return ((double)C_[0] + (double)C_[m_ - 1] + (double)C_[(m_ * (n_ - 1))] + + (double)C_[m_ * n_ - 1]); + } + +protected: + /** Set up the starting matrices */ + void initInputMatrices() { + // Initialize B with random values + srand(SEED); + for (int i = 0; i < (k_ * n_); i++) { + B_[i] = (T)((double)(rand() % 100) / 7.0); + } + + // Initialize C to zero + for (int i = 0; i < (m_ * n_); i++) { + C_[i] = (T)0.0; + } + + toSparseFormat(); + } + + /** Move matrices into the sparse representation of for the given library */ + virtual void toSparseFormat() = 0; + + /** Call the external consume() function on the matrices */ + void callConsume() {}/** Recursive function to populate sparse matrices */ + + /** The number of iterations to perform per problem size. */ + const int iterations_; + + /** Matrix dimension M. */ + int64_t m_ = 0; + + /** Matrix dimension N. */ + int64_t n_ = 0; + + /** Matrix dimension K. */ + int64_t k_ = 0; + + /** Dense representation of input matrix B. */ + T* B_; + + /** Dense representation of output matrix C. */ + T* C_; + + double sparsity_; + + matrixType type_; +}; \ No newline at end of file diff --git a/include/kernels/spmdnv.hh b/include/kernels/spmdnv.hh new file mode 100644 index 0000000..b887e29 --- /dev/null +++ b/include/kernels/spmdnv.hh @@ -0,0 +1,118 @@ + +#pragma once + +#include +#include +#include +#include +#include + +#include "../utilities.hh" + +/** A generic abstract class defining the operation of timing an SPGEMM BLAS + * kernel for n iterations. */ +template +class spmdnv { +public: + spmdnv(const int iters) : iterations_(iters) {} + + /** Call the BLAS kernel n times. 
+ * Returns the time elapsed for n BLAS calls in seconds. */ + time_checksum_gflop compute() { + // Start timer + std::chrono::time_point startTime = + std::chrono::high_resolution_clock::now(); + + // Perform all SPGEMM calls + preLoopRequirements(); + for (int i = 0; i < iterations_; i++) { + callSpMDnV(); + } + postLoopRequirements(); + + // Stop Timer + std::chrono::time_point endTime = + std::chrono::high_resolution_clock::now(); + // Get time elapsed in seconds + std::chrono::duration time_s = endTime - startTime; + + double checksum = calcChecksum(); + + postCallKernelCleanup(); + + return {time_s.count(), checksum, 0.0}; + } + + int64_t nnz_ = 0; + +private: + /** Perform any required steps before calling the SpMDnV kernel that should + * be timed. */ + virtual void preLoopRequirements() = 0; + + /** Perform the SpMDnV kernel. */ + virtual void callSpMDnV() = 0; + + /** Perform any required steps after calling the SpMDnV kernel that should + * be timed. */ + virtual void postLoopRequirements() = 0; + + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + virtual void postCallKernelCleanup() = 0; + + /** Calculate a checksum from the result vector y. */ + // Todo -- work out how to sensibly do this for sparse + constexpr double calcChecksum() { + // Checksum for SpMDnV calculated by summing max and min element of output + // vector + return ((double)y_[0] + (double)y_[m_ - 1]); + } + +protected: + void initInputMatrixVector() { + // Set the seed to allow checksum to work + srand(SEED); + + // Initialise the input and output vectors + for (int y = 0; y < n_; y++) { + x_[y] = (T)((double)(rand() % 100) / 3.0); + } + for (int y = 0; y < m_; y++) { + y_[y] = (T)0.0; + } + + toSparseFormat(); + } + + bool print_ = false; + + /** Move starting matrix into the sparse representation of for the given + * library */ + virtual void toSparseFormat() = 0; + + /** Call the extern consume() function. */ + void callConsume() {} + + /** The number of iterations to perform per problem size. */ + const int iterations_; + + /** Matrix dimension M. */ + int m_ = 0; + + /** Matrix / vector dimension N. */ + int n_ = 0; + + /** Input vector x. */ + T* x_; + + /** Input vector y. */ + T* y_; + + /** The distance between two vector elements. */ + const int vecIncrement_ = 1; + + double sparsity_ = 0.0; + + matrixType type_; +}; diff --git a/include/kernels/spmspm.hh b/include/kernels/spmspm.hh new file mode 100644 index 0000000..87f55b3 --- /dev/null +++ b/include/kernels/spmspm.hh @@ -0,0 +1,117 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "../utilities.hh" + +/** A generic abstract class defining the operation of timing a SpMSpM BLAS + * kernel for n iterations */ +template +class spmspm { +public: + spmspm(const int iters) : iterations_(iters) {} + + /** Call the kernel n times. 
Returns the time elapsed for all n calls + * in seconds */ + time_checksum_gflop compute() { + // Start the timer + std::chrono::time_point startTime = + std::chrono::high_resolution_clock::now(); + + // perform the SpMSpM calls + preLoopRequirements(); + for (int i = 0; i < iterations_; i++) { + callSpmspm(); + } + postLoopRequirements(); + + // Stop the timer + std::chrono::time_point endTime = + std::chrono::high_resolution_clock::now(); + std::chrono::duration time_s = endTime - startTime; + + double checksum = calcChecksum(); + + postCallKernelCleanup(); + + return {time_s.count(), checksum, 0.0}; + } + +private: + /** Performs the steps required before calling the SpMSpM kernel that + * should be timed */ + virtual void preLoopRequirements() = 0; + + /** Perform the SpMSpM kernel. */ + virtual void callSpmspm() = 0; + + /** Perform any steps required after calling the SpMSpM kernel that should + * be timed */ + virtual void postLoopRequirements() = 0; + + /** Do the necessary cleanup after the kernel has been finished that + * should not be timed */ + virtual void postCallKernelCleanup() = 0; + + /** Calculate a checksum from the result matrix C. */ + constexpr double calcChecksum() { + return 0; + if (C_nnz_ == 0) { + return (double)0.0; // No non-zeros, return zero checksum + } else if (C_nnz_ == 1) { + return (double)C_vals_[0]; // Single non-zero, return its value + } else { + return (double)C_vals_[0] + (double)C_vals_[C_nnz_ - 1]; + } + } + +protected: + /** Set up the starting matrices */ + void initInputMatrices() { + toSparseFormat(); + } + + /** Move matrices into the sparse representation of for the given library */ + virtual void toSparseFormat() = 0; + + /** Call the external consume() function on the matrices */ + void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); }/** Recursive function to populate sparse matrices */ + + /** The number of iterations to perform per problem size. */ + const int iterations_; + + /** Matrix dimension M. */ + int m_ = 0; + + /** Matrix dimension N. */ + int n_ = 0; + + /** Matrix dimension K. */ + int k_ = 0; + + /** Dense representation of input matrix A. */ + T* A_; + + /** Dense representation of input matrix B. */ + T* B_; + + /** Dense representation of output matrix C. */ + T* C_; + + /** CSR representation of output matrix C. */ + int64_t C_nnz_; + int64_t* C_rows_; + int64_t* C_cols_; + T* C_vals_; + + int64_t A_nnz_ = 0; + int64_t B_nnz_ = 0; + + double sparsity_; + + matrixType type_; +}; \ No newline at end of file diff --git a/include/main.hh b/include/main.hh index cc0bb8f..37a8d9a 100644 --- a/include/main.hh +++ b/include/main.hh @@ -5,7 +5,10 @@ #include #include "doGemm.hh" +#include "doSpmdnm.hh" +#include "doSpmspm.hh" #include "doGemv.hh" +#include "doSpmdnv.hh" #include "utilities.hh" /** A function which prints standard configuration information to stdout. */ @@ -14,5 +17,5 @@ void printBenchmarkConfig(const int iters, const int upperLimit); /** A function to parse a string to integer. */ int parseInt(const char* str); -/** A function which parsen the runtime arguments. */ -void getParameters(int argc, char* argv[]); \ No newline at end of file +/** A function which parses the runtime arguments. 
*/ +void getParameters(int argc, char** argv); \ No newline at end of file diff --git a/include/utilities.hh b/include/utilities.hh index ac0aeb0..9e7c64a 100644 --- a/include/utilities.hh +++ b/include/utilities.hh @@ -1,5 +1,15 @@ #pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include + // Define CPU related macros #if defined CPU_ARMPL #define CPU_LIB_NAME "Arm Performance Libraries" @@ -53,6 +63,12 @@ enum class gpuOffloadType : uint8_t { unified, }; +enum class matrixType : uint8_t { + rmat = 0, + random, + finiteElements, +}; + // Define struct which contains a runtime, checksum value, and gflop/s value struct time_checksum_gflop { double runtime = 0.0; @@ -76,4 +92,466 @@ struct cpuGpu_offloadThreshold { // performed. extern "C" { int consume(void* a, void* b, void* c); -} \ No newline at end of file +} + + +template +void printCSR(uint64_t nRows, + uint64_t nnz, + const int_type* rows, + const int_type* cols, + const fp_type* vals) { + std::cout << "ROWS:" << std::endl; + std::cout << "\t["; + for (uint64_t i = 0; i <= nRows; i++) { + std::cout << rows[i]; + if (i < nRows) { + std::cout << ", "; + } + } + std::cout << "]" << std::endl; + + std::cout << "COLS:" << std::endl; + std::cout << "\t["; + for (uint64_t i = 0; i < nnz; i++) { + std::cout << cols[i]; + if (i < nnz - 1) { + std::cout << ", "; + } + } + std::cout << "]" << std::endl; + + std::cout << "VALS:" << std::endl; + std::cout << "\t["; + for (uint64_t i = 0; i < nnz; i++) { + std::cout << vals[i]; + if (i < nnz - 1) { + std::cout << ", "; + } + } + std::cout << "]" << std::endl; +} + +template +void checkCSRValid(uint64_t nRows, + uint64_t nCols, + uint64_t nnz, + const int_type* rows, + const int_type* cols, + const fp_type* vals) { + if (rows[0] != 0) { + std::cerr << "[ERROR]: CSR INVALID - row_pointer[0] is not 0" << std::endl; + printCSR(nRows, nnz, rows, cols, vals); + exit(1); + } + + for (uint64_t r = 0; r < nRows; r++) { + if (rows[r] > rows[r + 1]) { + std::cerr << "[ERROR]: CSR INVALID - row_pointer[" << r << "] > row_pointer[" << (r + 1) << "]" << std::endl; + printCSR(nRows, nnz, rows, cols, vals); + exit(1); + } + } + + if (rows[nRows] != (int_type)nnz) { + std::cerr << "[ERROR]: CSR INVALID - row_pointer[nRows] != nnz" << std::endl; + printCSR(nRows, nnz, rows, cols, vals); + exit(1); + } + + for (uint64_t i = 0; i < nnz; i++) { + if (cols[i] < 0 || cols[i] >= (int_type)nCols) { + std::cerr << "[ERROR]: CSR INVALID - column index out of bounds" << std::endl; + printCSR(nRows, nnz, rows, cols, vals); + exit(1); + } + } + + for (uint64_t r = 0; r < nRows; r++) { + for (int_type j = rows[r]; j + 1 < (rows[r + 1]); j++) { + if (cols[j] > cols[j + 1]) { + std::cerr << "[ERROR]: CSR INVALID - column indices not sorted in row " << r << std::endl; + printCSR(nRows, nnz, rows, cols, vals); + exit(1); + } + if (cols[j] == cols[j + 1]) { + std::cerr << "[ERROR]: CSR INVALID - duplicate column indices in row " << r << std::endl; + printCSR(nRows, nnz, rows, cols, vals); + exit(1); + } + } + } +} + + +/** + * @brief Generate an R-MAT matrix directly in CSR format. 
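+ *
+ * Typical call (a hedged sketch with hypothetical sizes; the value and index
+ * types are deduced from the pointer and size arguments):
+ * @code
+ *   const int64_t nrows = 1 << 10, ncols = 1 << 10, nnz = 1 << 14;
+ *   std::vector<double>  vals(nnz);
+ *   std::vector<int64_t> cols(nnz), rows(nrows + 1);
+ *   rMatCSR(vals.data(), cols.data(), rows.data(),
+ *           nrows, ncols, nnz);  // default SEED and a/b/c/d quadrant weights
+ * @endcode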
+ * + * This function samples `nnz` edges (nonzeros) from the R-MAT distribution and + * writes the result directly into CSR arrays: + * - vals[k] : value of the k-th nonzero (here set to 1 by default) + * - cols[k] : column index of the k-th nonzero + * - rows[i] : starting offset in (vals, cols) for row i + * (classic CSR row pointer of length nrows+1) + * + * Memory usage is O(nnz) (plus a temporary edge list), avoiding any dense + * matrix construction. + * + * IMPORTANT BEHAVIOR: + * - Undirected: When `undirected == true`, this code only enforces u <= v + * during sampling (so edges are oriented consistently). It does NOT insert + * the symmetric counterpart (v,u). If you want a symmetric matrix, you must + * explicitly duplicate edges (except diagonal) before CSR conversion. + * - Seeding: Uses a global or external `SEED` to make the generator + * deterministic/reproducible. Ensure `SEED` is defined in your translation unit. + * + * Complexity: + * - Sampling: O(nnz * log(max(nrows,ncols))) bit-decisions per edge + * - Sorting: O(nnz log nnz) (by row, then col) + * - CSR build: O(nnz + nrows) + * + * @tparam T Numeric type for values (e.g., float, double, int) + * @tparam int_type Integer type for indices (e.g., int, int32_t, int64_t) + * + * @param vals Output array of length nnz (nonzero values) + * @param cols Output array of length nnz (column indices) + * @param rows Output array of length nrows+1 (row pointer) + * @param nrows Number of rows in the matrix + * @param ncols Number of columns in the matrix + * @param nnz Number of nonzeros to generate + * @param a,b,c,d R-MAT quadrant probabilities (must sum to 1; typical: 0.57,0.19,0.19,0.05) + * @param noise Optional jitter in probabilities each bit step (0.0 = none) + * @param no_self_loops If true, edges with u == v are discarded and resampled + * @param undirected If true, enforce u <= v in the sampled edge; does NOT mirror edges + * + * @note Typical R-MAT parameters for realistic graphs: + * - a=0.45, b=0.15, c=0.15, d=0.25 (Kronecker-like) + * - a=0.57, b=0.19, c=0.19, d=0.05 (more skewed) + * + * @note For sparse linear algebra benchmarks, this generates matrices with: + * - Irregular sparsity patterns (not banded/block-structured) + * - Variable row/column densities challenging load balancing + * - Realistic cache behavior representative of graph applications + * + * @warning Uses 64-bit indexing to prevent overflow for large matrices (n > 46k) + * @warning Non-thread-safe due to shared random number generator + * + * References: + * - Chakrabarti, D., Zhan, Y., & Faloutsos, C. (2004). R-MAT: A recursive model + * for graph mining. SIAM International Conference on Data Mining. + * - Leskovec, J., et al. (2010). Kronecker graphs: An approach to modeling networks. + * Journal of Machine Learning Research, 11, 985-1042. + */ +template +void rMatCSR(T* vals, int_type* cols, int_type* rows, + int_type nrows, int_type ncols, int_type nnz, + uint64_t seed = SEED, + double a = 0.57, + double b = 0.19, + double c = 0.19, + double d = 0.05, + double noise = 0.0, + bool no_self_loops = false, + bool undirected = false) { + // Number of bits needed to index into the row/col ranges. + // R-MAT decides each bit from MSB→LSB by picking a quadrant. + int row_bits = static_cast(std::ceil(std::log2(nrows))); + int col_bits = static_cast(std::ceil(std::log2(ncols))); + + // Set up RNG. 
Uses srand for value generation, and uniform[0,1) + // for quadrant selection + srand(seed); + std::default_random_engine gen; + std::uniform_real_distribution dist(0.0, 1.0); + gen.seed(seed); + + + // Temporary storage of sampled edges as (row, col) pairs. + // We reserve exactly nnz slots and will push_back exactly nnz valid edges. + std::vector> edges; + edges.reserve(nnz); + + // Keep sampling until we have nnz valid edges. + // Invalid candidates (out-of-bounds due to non-powers-of-two, self-loops, etc.) + // are discarded by continuing the loop without incrementing the edge count. + int edge_idx = 0; + while (edge_idx < nnz) { + int u = 0; // Sampled row index (as int, cast to int_type later) + int v = 0; // Sampled column index + + // Base quadrant probabilities (A,B,C,D). We optionally jitter these + // at each bit decision if 'noise' > 0. + double A = a, B = b, C = c, D = d; + + // For each bit (from most-significant to least), decide which quadrant + // the edge falls into and set the corresponding bit of (u,v). + for (int bit = 0; bit < std::max(row_bits, col_bits); ++bit) { + // Optional noise: perturb A,B,C,D slightly, then renormalize. + if (noise > 0.0) { + auto jitter = [&](double val) { + // Perturb within ±noise, clamp to [0,1] lower bound via max(0,•) + return std::max(0.0, val + (dist(gen) * 2.0 - 1.0) * noise); + }; + A = jitter(a); + B = jitter(b); + C = jitter(c); + D = jitter(d); + double sum = A + B + C + D; + // Guard against degenerate total (shouldn’t happen unless noise is extreme) + A = (sum > 0) ? (A / sum) : 0.25; + B = (sum > 0) ? (B / sum) : 0.25; + C = (sum > 0) ? (C / sum) : 0.25; + D = (sum > 0) ? (D / sum) : 0.25; + } + + // Draw r ~ U(0,1) and select quadrant by cumulative thresholds. + double r = dist(gen); + double t1 = A; + double t2 = A + B; + double t3 = A + B + C; + + int row_bit = 0, col_bit = 0; + if (r < t1) { + // Quadrant 00 + row_bit = 0; col_bit = 0; + } else if (r < t2) { + // Quadrant 01 + row_bit = 0; col_bit = 1; + } else if (r < t3) { + // Quadrant 10 + row_bit = 1; col_bit = 0; + } else { + // Quadrant 11 + row_bit = 1; col_bit = 1; + } + + // Only set bits that are within the bit-width of rows/cols respectively. + if (bit < row_bits) u = (u << 1) | row_bit; + if (bit < col_bits) v = (v << 1) | col_bit; + } + + // If dimensions are not powers of two, some combinations will exceed bounds. + if (u >= nrows) u = u % nrows; + if (v >= ncols) v = v % ncols; + // If undirected, orient edges consistently (store the "upper-triangular" orientation). + // NOTE: This does NOT create symmetric pairs; it only enforces a canonical ordering. + if (undirected && u > v) std::swap(u, v); + // If a duplicate, do not commit edge + if (std::find(edges.begin(), edges.end(), std::make_pair((int_type)u, (int_type)v)) != edges.end()) continue; + + // Commit the sampled edge. + edges.emplace_back((int_type)u, (int_type)v); + ++edge_idx; + } + + + // Sort edges primarily by row, and secondarily by column. + // CSR expects nonzeros grouped by row; sorting also makes columns within + // each row non-decreasing, which is often desirable. + std::sort(edges.begin(), edges.end(), + [](auto& a, auto& b) { + return (a.first < b.first) || + (a.first == b.first && a.second < b.second); + }); + + // Initialize row pointer array with zeros. + // rows[i] will eventually hold the starting index in (vals, cols) of row i. + // rows[nrows] will equal nnz after prefix-sum (the total number of nonzeros). 
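+  // Worked illustration (hypothetical counts): per-row nonzero counts of
+  // [2, 0, 3] become row pointers [0, 2, 2, 5] after the counting pass and
+  // the prefix-sum loop below.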
+ for (size_t i = 0; i < static_cast(nrows + 1); i++) rows[i] = 0; + + // Linear pass over sorted edges to fill cols/vals and count entries per row. + // We write the k-th edge's column into cols[k] and its value into vals[k]. + // Simultaneously, we increment a per-row count into rows[r+1]. + for (size_t i = 0; i < static_cast(nnz); ++i) { + const int_type r = edges[static_cast(i)].first; + const int_type c = edges[static_cast(i)].second; + + cols[static_cast(i)] = c; + vals[static_cast(i)] = (T)((double)(rand() % 100) / 3.0); + + // Count one nonzero in row r by bumping rows[r+1]. + // After this loop, rows[k+1] holds the count of nonzeros in row k. + rows[static_cast(r) + 1]++; + } + for (size_t i = 0; i < static_cast(nrows); i++) { + rows[static_cast(i) + 1] += rows[static_cast(i)]; + } + + checkCSRValid(nrows, ncols, nnz, rows, cols, vals); +} + +template +void randomCSR(T* vals, int_type* cols, int_type* rows, + int nrows, int ncols, int nnz, unsigned int seed = SEED) { + if ((int64_t)nnz >= (int64_t)nrows * (int64_t)ncols) { + std::cerr << "ERROR: nnz exceeds maximum possible non-zeros." << std::endl; + exit(1); + } else if (nnz <= 0) { + std::cerr << "ERROR: nnz must be positive." << std::endl; + exit(1); + } + + srand(seed); + std::default_random_engine gen; + std::uniform_int_distribution col_dist(0, ncols - 1); + gen.seed(seed); + + // Generate number of non-zeros per row + std::vector row_counts(nrows, 0); + int total_nonzeros = 0; + while (total_nonzeros < nnz) { + int_type r = rand() % nrows; + if (row_counts[r] >= ncols) continue; // Skip if row is already full + row_counts[r]++; + total_nonzeros++; + } + + // Create the row pointer array + rows[0] = 0; + for (int r = 0; r < nrows; r++) { + rows[r + 1] = rows[r] + row_counts[r]; + } + + int index = 0; + // Make a bitmap of the columns that are going to be used in this row + std::vector rCols(ncols, false); + for (int r = 0; r < nrows; r++) { + int c = 0; + while (c < row_counts[r]) { + int_type col = col_dist(gen); + if (!rCols[col]) { + rCols[col] = true; + c++; + } + } + // Create the column index array + for (int_type cIndex = 0; cIndex < ncols; cIndex++) { + if (rCols[cIndex]) { + cols[index] = cIndex; + index++; + rCols[cIndex] = false; // Reset the bitmap for the next row + } + } + } + + // Randomise the values array + index = 0; + for (int r = 0; r < nrows; r++) { + for (int j = 0; j < row_counts[r]; j++) { + vals[index] = (T)((double)(rand() % 100) / 3.0); + index++; + } + } + checkCSRValid(nrows, ncols, nnz, rows, cols, vals); +} + +template +int64_t calcCNNZ(int_type A_n_rows, int_type A_nnz, int_type* A_rows, int_type* A_cols, + int_type B_n_cols, int_type B_nnz, int_type* B_rows, int_type* B_cols) { + int64_t C_nnz = 0; + + for (int_type i = 0; i < A_n_rows; i++) { + for (int_type j = A_rows[i]; j < A_rows[i + 1]; j++) { + int_type a_col = A_cols[j]; + if (a_col < 0 || a_col >= B_n_cols) { + std::cerr << "[ERROR]: calcCNNZ - A column index out of bounds for B" << std::endl; + continue; + } + for (int_type k = B_rows[a_col]; k < B_rows[a_col + 1]; k++) { + if (B_cols[k] == i) { + C_nnz++; + break; + } + } + } + } + + return C_nnz; +} + +/** + * @brief Generates a densely-filled banded matrix. + * + * It first calculates the minimum bandwidth 'k' required to store at least + * 'nnz' elements. It then fills the band (diagonals -k to +k) row by row, + * respecting matrix boundaries, until exactly 'nnz' elements are written. 
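+ *
+ * Sizing intuition (the code finds the radius empirically, which also covers
+ * rectangular matrices): for a square matrix a band of radius k holds roughly
+ * nrows * (2k + 1) entries away from the boundaries, so k is grown until that
+ * count first reaches 'nnz'.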
+ */ +template +void finiteElementCSR(T* vals, int_type* cols, int_type* rows, + int nrows, int ncols, int_type nnz, + unsigned int seed = SEED) +{ + long long max_nnz = (long long)nrows * ncols; + if (nnz > max_nnz) { + std::cerr << "Warning: Clamping NNZ." << std::endl; + nnz = max_nnz; + } + + if (nnz == 0) { + for (int r = 0; r <= nrows; r++) rows[r] = 0; + return; + } + + std::mt19937 gen(seed); + std::uniform_real_distribution val_dist(-1.5, 1.5); + + // --- 1. Find the bandwidth 'k' needed to fit 'nnz' --- + int_type k = 0; // k is the "radius" of the band + long long nnz_in_band = 0; + while (nnz_in_band < nnz) { + nnz_in_band = 0; + for (int r = 0; r < nrows; r++) { + int_type c_midpoint = r * ncols / nrows; + int_type c_min = std::max(0, c_midpoint - k); + int_type c_max = std::min(ncols - 1, c_midpoint + k); + nnz_in_band += (c_max - c_min + 1); + } + + if (nnz_in_band >= nnz) break; // Found a big enough band + + k++; + + // Safety break if k grows larger than the matrix + if (k > std::max(nrows, ncols)) { + std::cerr << "Warning: Bandwidth loop failed. Clamping NNZ." << std::endl; + nnz = nnz_in_band; // nnz is now the max possible + break; + } + } + + // --- 2. Fill the CSR arrays using the discovered bandwidth 'k' --- + rows[0] = 0; + int_type current_nnz = 0; + + for (int r = 0; r < nrows; r++) { + // Find the correct column bounds for this row + int_type c_midpoint = r * ncols / nrows; + int_type c_min = std::max(0, c_midpoint - k); + int_type c_max = std::min(ncols - 1, c_midpoint + k); + + // Fill the band for this row + for (int_type c = c_min; c <= c_max; c++) { + // Stop *exactly* at nnz + if (current_nnz >= nnz) { + break; + } + + vals[current_nnz] = val_dist(gen); + cols[current_nnz] = c; + current_nnz++; + } + + rows[r + 1] = current_nnz; + + if (current_nnz >= nnz) { + // We're done. Fill the rest of the row pointers. + for (int rest_r = r + 1; rest_r < nrows; rest_r++) { + rows[rest_r + 1] = nnz; + } + break; // Exit the main row loop + } + } + + // Ensure the final pointer is correct + rows[nrows] = current_nnz; +} diff --git a/oneMKL/CPU/gemm.hh b/oneMKL/CPU/gemm.hh index bdb7ba5..0ae554a 100644 --- a/oneMKL/CPU/gemm.hh +++ b/oneMKL/CPU/gemm.hh @@ -50,8 +50,7 @@ class gemm_cpu : public gemm { std::max(1, m_)); } else { // Un-specialised class will not do any work - print error and exit. - std::cout << "ERROR - Datatype for OneMKL CPU GEMM kernel not supported." - << std::endl; + std::cerr << "ERROR - Datatype for OneMKL CPU GEMM kernel not supported." << std::endl; exit(1); } // Ensure compiler doesn't optimise away the work being done diff --git a/oneMKL/CPU/gemv.hh b/oneMKL/CPU/gemv.hh index b53a83c..8a5a013 100644 --- a/oneMKL/CPU/gemv.hh +++ b/oneMKL/CPU/gemv.hh @@ -47,8 +47,7 @@ class gemv_cpu : public gemv { std::max(1, m_), x_, vecIncrement_, beta, y_, vecIncrement_); } else { // Un-specialised class will not do any work - print error and exit. - std::cout << "ERROR - Datatype for OneMKL CPU GEMV kernel not supported." - << std::endl; + std::cout << "ERROR - Datatype for OneMKL CPU GEMV kernel not supported." 
<< std::endl; exit(1); } // Ensure compiler doesn't optimise away the work being done diff --git a/oneMKL/CPU/spmdnm.hh b/oneMKL/CPU/spmdnm.hh new file mode 100644 index 0000000..6e5a132 --- /dev/null +++ b/oneMKL/CPU/spmdnm.hh @@ -0,0 +1,171 @@ +#pragma once + +#ifdef CPU_ONEMKL +#include + +#include +#include + +#include "../../include/kernels/CPU/spmdnm.hh" +#include "../../include/utilities.hh" + +namespace cpu { +/** A class for sparse matrix-dense matrix BLAS kernels. */ +template +class spmdnm_cpu : public spmdnm { +public: + using spmdnm::spmdnm; + using spmdnm::callConsume; + using spmdnm::initInputMatrices; + using spmdnm::m_; + using spmdnm::n_; + using spmdnm::k_; + using spmdnm::B_; + using spmdnm::C_; + using spmdnm::sparsity_; + using spmdnm::type_; + using spmdnm::nnz_; + + void initialise(int m, int n, int k, double sparsity, + matrixType type, bool binary = false) { + m_ = m; + n_ = n; + k_ = k; + + m_mkl_ = m; + n_mkl_ = n; + k_mkl_ = k; + + sparsity_ = sparsity; + type_ = type; + + /** Determine the number of nnz elements in A and B */ + nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + B_ = (T*)mkl_malloc(sizeof(T) * k_ * n_, 64); + C_ = (T*)mkl_malloc(sizeof(T) * m_ * n_, 64); + + initInputMatrices(); + } + +protected: + void toSparseFormat() override { + A_vals_ = (T*)mkl_malloc(sizeof(T) * nnz_, 64); + A_cols_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * nnz_, 64); + // Make a temporary rows array of the ususal CSR type, to then turn into the two-array MKL version + MKL_INT* A_rows_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (m_ + 1), 64); + A_rowsb_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * m_, 64); + A_rowse_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * m_, 64); + + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_, A_cols_, A_rows_, m_, k_, nnz_); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_, A_cols_, A_rows_, m_, k_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_, A_cols_, A_rows_, m_, k_, nnz_); + } else { + std::cerr << "Unknown matrix type" << std::endl; + exit(1); + } + + for (uint64_t i = 0; i < m_; i++) { + A_rowsb_[i] = A_rows_[i]; + A_rowse_[i] = A_rows_[i + 1]; + } + // Clean up the temporary array + mkl_free(A_rows_); + } + +private: + void preLoopRequirements() override { + if constexpr (std::is_same_v) { + status_ = mkl_sparse_s_create_csr(&A_csr_, + indexing_, + m_, + k_, + A_rowsb_, + A_rowse_, + A_cols_, + A_vals_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } else if constexpr (std::is_same_v) { + status_ = mkl_sparse_d_create_csr(&A_csr_, + indexing_, + m_, + k_, + A_rowsb_, + A_rowse_, + A_cols_, + A_vals_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } + } + + void callSpmdnm() override { + if constexpr (std::is_same_v) { + status_ = mkl_sparse_s_mm(operation_, alpha, A_csr_, description_, + layout_, B_, n_mkl_, n_mkl_, beta, C_, + n_mkl_); + } else if constexpr (std::is_same_v) { + status_ = mkl_sparse_d_mm(operation_, alpha, A_csr_, description_, + layout_, B_, n_mkl_, n_mkl_, beta, C_, + n_mkl_); + } else { + // Un-specialised class will not do any work - print error and exit. + std::cerr << "ERROR - Datatype for OneMKL CPU SpGEMV kernel not " + "supported." 
<< std::endl; + exit(1); + } + + callConsume(); + } + + void postLoopRequirements() override { + status_ = mkl_sparse_destroy(A_csr_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + } + + void postCallKernelCleanup() override { + mkl_free(A_rowsb_); + mkl_free(A_rowse_); + mkl_free(A_cols_); + mkl_free(A_vals_); + mkl_free(B_); + mkl_free(C_); + } + + sparse_status_t status_; + + sparse_index_base_t indexing_ = SPARSE_INDEX_BASE_ZERO; + sparse_operation_t operation_ = SPARSE_OPERATION_NON_TRANSPOSE; + + matrix_descr description_ = {SPARSE_MATRIX_TYPE_GENERAL, + SPARSE_FILL_MODE_LOWER, + SPARSE_DIAG_NON_UNIT}; + sparse_layout_t layout_ = SPARSE_LAYOUT_ROW_MAJOR; + + MKL_INT m_mkl_; + MKL_INT n_mkl_; + MKL_INT k_mkl_; + + T* A_vals_; + MKL_INT* A_cols_; + MKL_INT* A_rowsb_; + MKL_INT* A_rowse_; + + sparse_matrix_t A_csr_; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + +#endif \ No newline at end of file diff --git a/oneMKL/CPU/spmdnv.hh b/oneMKL/CPU/spmdnv.hh new file mode 100644 index 0000000..a5414f9 --- /dev/null +++ b/oneMKL/CPU/spmdnv.hh @@ -0,0 +1,151 @@ +#pragma once + +#ifdef CPU_ONEMKL +#include + +#include + +#include "../../include/kernels/CPU/spmdnv.hh" +#include "../../include/utilities.hh" + +namespace cpu { +template +class spmdnv_cpu : public spmdnv { +public: + using spmdnv::spmdnv; + using spmdnv::callConsume; + using spmdnv::initInputMatrixVector; + using spmdnv::m_; + using spmdnv::n_; + using spmdnv::x_; + using spmdnv::y_; + using spmdnv::sparsity_; + using spmdnv::type_; + using spmdnv::nnz_; + + void initialise(int m, int n, double sparsity, matrixType type, + bool binary = false) { + m_ = m; + n_ = n; + sparsity_ = sparsity; + type_ = type; + + nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_)); + + x_ = (T*)mkl_malloc(sizeof(T) * n_, 64); + y_ = (T*)mkl_malloc(sizeof(T) * m_, 64); + + initInputMatrixVector(); + } + +protected: + void toSparseFormat() override { + A_vals_ = (T*)mkl_malloc(sizeof(T) * nnz_, 64); + A_cols_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * nnz_, 64); + MKL_INT* A_rows_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (m_ + 1), 64); + A_rowsb_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * m_, 64); + A_rowse_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * m_, 64); + + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_); + } else { + std::cerr << "Unknown matrix type" << std::endl; + exit(1); + } + + for (uint64_t i = 0; i < m_; i++) { + A_rowsb_[i] = A_rows_[i]; + A_rowse_[i] = A_rows_[i + 1]; + } + + mkl_free(A_rows_); + } + +private: + void preLoopRequirements() override { + if constexpr (std::is_same_v) { + status_ = mkl_sparse_s_create_csr(&A_csr_, + indexing_, + m_, + n_, + A_rowsb_, + A_rowse_, + A_cols_, + A_vals_); + } else if constexpr (std::is_same_v) { + status_ = mkl_sparse_d_create_csr(&A_csr_, + indexing_, + m_, + n_, + A_rowsb_, + A_rowse_, + A_cols_, + A_vals_); + } + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } + + void callSpMDnV() override { + if constexpr (std::is_same_v) { + status_ = mkl_sparse_s_mv(operation_, alpha, A_csr_, description_, x_, + beta, y_); + } else if constexpr (std::is_same_v) { + status_ = mkl_sparse_d_mv(operation_, alpha, A_csr_, 
description_, x_, + beta, y_); + } + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + callConsume(); + } + + void postLoopRequirements() override { + status_ = mkl_sparse_destroy(A_csr_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + } + + void postCallKernelCleanup() override { + mkl_free(A_rowsb_); + mkl_free(A_rowse_); + mkl_free(A_cols_); + mkl_free(A_vals_); + mkl_free(x_); + mkl_free(y_); + } + + sparse_status_t status_; + + sparse_index_base_t indexing_ = SPARSE_INDEX_BASE_ZERO; + sparse_operation_t operation_ = SPARSE_OPERATION_NON_TRANSPOSE; + matrix_descr description_ = {SPARSE_MATRIX_TYPE_GENERAL, + SPARSE_FILL_MODE_LOWER, + SPARSE_DIAG_NON_UNIT}; + + MKL_INT m_mkl_; + MKL_INT n_mkl_; + + T* A_vals_; + MKL_INT* A_cols_; + MKL_INT* A_rowsb_; + MKL_INT* A_rowse_; + + sparse_matrix_t A_csr_; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + + +#endif diff --git a/oneMKL/CPU/spmspm.hh b/oneMKL/CPU/spmspm.hh new file mode 100644 index 0000000..2671ebb --- /dev/null +++ b/oneMKL/CPU/spmspm.hh @@ -0,0 +1,256 @@ +#pragma once + +#ifdef CPU_ONEMKL +#include +#include + +#include + +#include "../../include/kernels/CPU/spmspm.hh" +#include "../../include/utilities.hh" + +namespace cpu { +/** A class for sparse matrix-sparse matrix CPU BLAS kernels. */ +template +class spmspm_cpu : public spmspm { +public: + using spmspm::spmspm; + using spmspm::initInputMatrices; + using spmspm::callConsume; + using spmspm::m_; + using spmspm::n_; + using spmspm::k_; + using spmspm::sparsity_; + using spmspm::type_; + using spmspm::A_nnz_; + using spmspm::B_nnz_; + using spmspm::C_nnz_; + using spmspm::C_vals_; + + void initialise(int m, int n, int k, double sparsity, + matrixType type, bool binary = false) { + m_ = m; + n_ = n; + k_ = k; + + m_mkl_ = m; + n_mkl_ = n; + k_mkl_ = k; + + sparsity_ = sparsity; + type_ = type; + + /** Determine the number of nnz elements in A and B */ + A_nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + B_nnz_ = 1 + (uint64_t)((double)k_ * (double)n_ * (1.0 - sparsity_)); + initInputMatrices(); + } + +protected: + void toSparseFormat() override { + A_vals_ = (T*)mkl_malloc(sizeof(T) * A_nnz_, 64); + A_cols_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * A_nnz_, 64); + MKL_INT* A_rows = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (m_ + 1), 64); + A_rowsb_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * m_, 64); + A_rowse_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * m_, 64); + B_vals_ = (T*)mkl_malloc(sizeof(T) * B_nnz_, 64); + B_cols_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * B_nnz_, 64); + MKL_INT* B_rows = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (k_ + 1), 64); + B_rowsb_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * k_, 64); + B_rowse_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * k_, 64); + + int seedOffset = 0; + do { + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_, A_cols_, A_rows, m_, k_, A_nnz_, SEED + seedOffset++); + rMatCSR(B_vals_, B_cols_, B_rows, k_, n_, B_nnz_, SEED + seedOffset++); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_, A_cols_, A_rows, m_, k_, A_nnz_, SEED + seedOffset++); + randomCSR(B_vals_, B_cols_, B_rows, k_, n_, B_nnz_, SEED + seedOffset++); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_, A_cols_, A_rows, m_, k_, A_nnz_, SEED + seedOffset++); + finiteElementCSR(B_vals_, B_cols_, B_rows, k_, n_, B_nnz_, SEED + seedOffset++); + } else { + std::cerr << "Unknown matrix type" << 
std::endl; + exit(1); + } + } while (calcCNNZ(m_, A_nnz_, A_rows, A_cols_, k_, B_nnz_, B_rows, B_cols_) == 0); + + for (uint64_t i = 0; i < m_; i++) { + A_rowsb_[i] = A_rows[i]; + A_rowse_[i] = A_rows[i + 1]; + } + + mkl_free(A_rows); + + for (uint64_t i = 0; i < k_; i++) { + B_rowsb_[i] = B_rows[i]; + B_rowse_[i] = B_rows[i + 1]; + } + mkl_free(B_rows); + } + +private: + + void preLoopRequirements() override { + if constexpr (std::is_same_v) { + status_ = mkl_sparse_s_create_csr(&A_csr_, + indexing_, + m_, + k_, + A_rowsb_, + A_rowse_, + A_cols_, + A_vals_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = mkl_sparse_s_create_csr(&B_csr_, + indexing_, + k_, + n_, + B_rowsb_, + B_rowse_, + B_cols_, + B_vals_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + } else if constexpr (std::is_same_v) { + status_ = mkl_sparse_d_create_csr(&A_csr_, + indexing_, + m_, + k_, + A_rowsb_, + A_rowse_, + A_cols_, + A_vals_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + + status_ = mkl_sparse_d_create_csr(&B_csr_, + indexing_, + k_, + n_, + B_rowsb_, + B_rowse_, + B_cols_, + B_vals_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + } + } + + void callSpmspm() override { + status_ = mkl_sparse_spmm(operation_, A_csr_, B_csr_, &C_csr_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = mkl_sparse_order(C_csr_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + callConsume(); + } + + void postLoopRequirements() override { + if constexpr(std::is_same_v) { + status_ = mkl_sparse_s_export_csr(C_csr_, + &indexing_, + &m_mkl_, + &n_mkl_, + &C_rowsb_, + &C_rowse_, + &C_cols_, + &C_vals_); + } else if constexpr (std::is_same_v) { + status_ = mkl_sparse_d_export_csr(C_csr_, + &indexing_, + &m_mkl_, + &n_mkl_, + &C_rowsb_, + &C_rowse_, + &C_cols_, + &C_vals_); + } + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + C_nnz_ = C_rowse_[m_ - 1]; + } + + void postCallKernelCleanup() override { + status_ = mkl_sparse_destroy(A_csr_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = mkl_sparse_destroy(B_csr_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = mkl_sparse_destroy(C_csr_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + mkl_free(A_vals_); + mkl_free(A_cols_); + mkl_free(A_rowsb_); + mkl_free(A_rowse_); + + mkl_free(B_vals_); + mkl_free(B_cols_); + mkl_free(B_rowsb_); + mkl_free(B_rowse_); + } + + sparse_status_t status_; + + sparse_index_base_t indexing_ = SPARSE_INDEX_BASE_ZERO; + sparse_operation_t operation_ = SPARSE_OPERATION_NON_TRANSPOSE; + + MKL_INT m_mkl_; + MKL_INT n_mkl_; + MKL_INT k_mkl_; + + T* A_vals_; + MKL_INT* A_cols_; + MKL_INT* A_rowsb_; + MKL_INT* A_rowse_; + + T* B_vals_; + MKL_INT* B_cols_; + MKL_INT* B_rowsb_; + MKL_INT* B_rowse_; + + MKL_INT* C_cols_; + MKL_INT* C_rowsb_; + MKL_INT* C_rowse_; + + sparse_matrix_t A_csr_; + sparse_matrix_t B_csr_; + sparse_matrix_t C_csr_; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + + +#endif \ No newline at end 
of file diff --git a/oneMKL/GPU/common.hh b/oneMKL/GPU/common.hh index 30fccfa..13f3715 100644 --- a/oneMKL/GPU/common.hh +++ b/oneMKL/GPU/common.hh @@ -3,8 +3,9 @@ #ifdef GPU_ONEMKL #include - +#include #include +#include #include // Create an exception handler for asynchronous SYCL exceptions @@ -14,8 +15,7 @@ static const std::function exception_handler = try { std::rethrow_exception(e); } catch (std::exception const& e) { - std::cout << "ERROR - Caught asynchronous SYCL exception : " - << e.what() << std::endl; + std::cerr << "ERROR - Caught asynchronous SYCL exception : " << e.what() << std::endl; } } }; diff --git a/oneMKL/GPU/gemm.hh b/oneMKL/GPU/gemm.hh index 44fa3b2..deb7723 100644 --- a/oneMKL/GPU/gemm.hh +++ b/oneMKL/GPU/gemm.hh @@ -111,10 +111,9 @@ class gemm_gpu : public gemm { (int64_t)std::max(1, m_), {}) .wait_and_throw(); } catch (sycl::exception const& e) { - std::cout << "ERROR - Caught synchronous SYCL exception during GEMM " - "(Always):\n" - << e.what() << std::endl - << "OpenCL status: " << e.code().value() << std::endl; + std::cerr << "ERROR - Caught synchronous SYCL exception during GEMM " + "(Always):\n" << e.what() << std::endl; + std::cerr << "OpenCL status: " << e.code().value() << std::endl; } // Offload output data from device to host gpuQueue_.memcpy(C_, C_device_, sizeof(T) * m_ * n_); @@ -131,10 +130,9 @@ class gemm_gpu : public gemm { (int64_t)std::max(1, m_), {}) .wait_and_throw(); } catch (sycl::exception const& e) { - std::cout << "ERROR - Caught synchronous SYCL exception during GEMM " - "(Once):\n" - << e.what() << std::endl - << "OpenCL status: " << e.code().value() << std::endl; + std::cerr << "ERROR - Caught synchronous SYCL exception during GEMM " + "(Once): " << e.what() << std::endl; + std::cerr << "OpenCL status: " << e.code().value() << std::endl; } break; } @@ -147,10 +145,9 @@ class gemm_gpu : public gemm { (int64_t)std::max(1, k_), beta, C_, (int64_t)std::max(1, m_), {}) .wait_and_throw(); } catch (sycl::exception const& e) { - std::cout << "ERROR - Caught synchronous SYCL exception during GEMM " - "(Unified):\n" - << e.what() << std::endl - << "OpenCL status: " << e.code().value() << std::endl; + std::cerr << "ERROR - Caught synchronous SYCL exception during GEMM " + "(Unified): " << e.what() << std::endl; + std::cerr << "OpenCL status: " << e.code().value() << std::endl; } break; } @@ -172,7 +169,7 @@ class gemm_gpu : public gemm { break; } case gpuOffloadType::unified: { - // TODO - Ensure all data resides on host once work has completed + // Ensure all data resides on host once work has completed gpuQueue_.wait_and_throw(); break; } diff --git a/oneMKL/GPU/gemv.hh b/oneMKL/GPU/gemv.hh index ffe9f6c..6c3264b 100644 --- a/oneMKL/GPU/gemv.hh +++ b/oneMKL/GPU/gemv.hh @@ -109,10 +109,9 @@ class gemv_gpu : public gemv { y_device_, vecIncrement_, {}) .wait_and_throw(); } catch (sycl::exception const& e) { - std::cout << "ERROR - Caught synchronous SYCL exception during GEMV " - "(Always):\n" - << e.what() << std::endl - << "OpenCL status: " << e.code().value() << std::endl; + std::cerr << "ERROR - Caught synchronous SYCL exception during GEMV " + "(Always):\n" << e.what() << std::endl; + std::cerr << "OpenCL status: " << e.code().value() << std::endl; } // Offload output data from device to host gpuQueue_.memcpy(y_, y_device_, sizeof(T) * m_); @@ -128,10 +127,9 @@ class gemv_gpu : public gemv { y_device_, vecIncrement_, {}) .wait_and_throw(); } catch (sycl::exception const& e) { - std::cout << "ERROR - Caught synchronous SYCL exception during 
GEMV " - "(Once):\n" - << e.what() << std::endl - << "OpenCL status: " << e.code().value() << std::endl; + std::cerr << "ERROR - Caught synchronous SYCL exception during GEMV " + "(Once):\n" << e.what() << std::endl; + std::cerr << "OpenCL status: " << e.code().value() << std::endl; } break; } @@ -144,10 +142,9 @@ class gemv_gpu : public gemv { vecIncrement_, {}) .wait_and_throw(); } catch (sycl::exception const& e) { - std::cout << "ERROR - Caught synchronous SYCL exception during GEMV " - "(Unified):\n" - << e.what() << std::endl - << "OpenCL status: " << e.code().value() << std::endl; + std::cerr << "ERROR - Caught synchronous SYCL exception during GEMV " + "(Unified):\n" << e.what() << std::endl; + std::cerr << "OpenCL status: " << e.code().value() << std::endl; } break; } @@ -169,7 +166,6 @@ class gemv_gpu : public gemv { break; } case gpuOffloadType::unified: { - // TODO - Ensure all data resides on host once work has completed gpuQueue_.wait_and_throw(); break; } diff --git a/oneMKL/GPU/spmdnm.hh b/oneMKL/GPU/spmdnm.hh new file mode 100644 index 0000000..b334fcd --- /dev/null +++ b/oneMKL/GPU/spmdnm.hh @@ -0,0 +1,340 @@ +#pragma once + +#ifdef GPU_ONEMKL + +#include "../../include/kernels/GPU/spmdnm.hh" +#include "../../include/utilities.hh" +#include "common.hh" + +#include + +namespace gpu { +template +class spmdnm_gpu : public spmdnm { +public: + using spmdnm::spmdnm; + using spmdnm::initInputMatrices; + using spmdnm::nnz_; + using spmdnm::m_; + using spmdnm::n_; + using spmdnm::k_; + using spmdnm::B_; + using spmdnm::C_; + using spmdnm::offload_; + using spmdnm::sparsity_; + using spmdnm::type_; + + void initialise(gpuOffloadType offload, int m, int n, int k, + double sparsity, matrixType type, + bool binary = false) override { + // Perform set-up which doesn't need to happen every problem size change. 
+ if (firstRun_) { + firstRun_ = false; + try { + myGpu_ = sycl::device(sycl::gpu_selector_v); + } catch (const std::exception& e) { + std::cerr << "ERROR - No GPU device found: " << e.what() << '\n'; + exit(1); + } + gpuQueue_ = sycl::queue(myGpu_, exception_handler); + } + + try { + // Initialize ALL pointers to nullptr FIRST + B_ = nullptr; + C_ = nullptr; + A_vals_ = nullptr; + A_cols_ = nullptr; + A_rows_ = nullptr; + A_vals_device_ = nullptr; + A_cols_device_ = nullptr; + A_rows_device_ = nullptr; + B_device_ = nullptr; + C_device_ = nullptr; + + offload_ = offload; + sparsity_ = sparsity; + type_ = type; + m_ = m; + n_ = n; + k_ = k; + + layout_ = oneapi::mkl::layout::row_major; + operationA_ = oneapi::mkl::transpose::nontrans; + operationB_ = oneapi::mkl::transpose::nontrans; + index_ = oneapi::mkl::index_base::zero; + + nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + + if (offload_ == gpuOffloadType::unified) { + B_ = (T*)sycl::malloc_shared(sizeof(T) * k_ * n_, gpuQueue_); + C_ = (T*)sycl::malloc_shared(sizeof(T) * m_ * n_, gpuQueue_); + } else { + // Host memory allocation + B_ = (T*)sycl::malloc_host(sizeof(T) * k_ * n_, gpuQueue_); + C_ = (T*)sycl::malloc_host(sizeof(T) * m_ * n_, gpuQueue_); + + // Device memory allocation + B_device_ = (T*)sycl::malloc_device(sizeof(T) * k_ * n_, gpuQueue_); + C_device_ = (T*)sycl::malloc_device(sizeof(T) * m_ * n_, gpuQueue_); + } + initInputMatrices(); + } catch (const std::exception& e) { + std::cerr << "ERROR in initialise(): " << e.what() << std::endl; + exit(1); + } + } + + +protected: + void toSparseFormat() override { + if (offload_ == gpuOffloadType::always) { + A_vals_store_ = (T*)malloc(nnz_ * sizeof(T)); + A_cols_store_ = (int64_t*)malloc(nnz_ * sizeof(int64_t)); + A_rows_store_ = (int64_t*)malloc((m_ + 1) * sizeof(int64_t)); + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, nnz_); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, nnz_); + } else { + std::cerr << "ERROR - Unknown matrix type" << std::endl; + exit(1); + } + } + + + if (offload_ == gpuOffloadType::unified) { + A_vals_ = (T*)sycl::malloc_shared(sizeof(T) * nnz_, gpuQueue_); + A_cols_ = (int64_t*)sycl::malloc_shared(sizeof(int64_t) * nnz_, gpuQueue_); + A_rows_ = (int64_t*)sycl::malloc_shared(sizeof(int64_t) * (m_ + 1), gpuQueue_); + } else { + A_vals_ = (T*)sycl::malloc_host(sizeof(T) * nnz_, gpuQueue_); + A_cols_ = (int64_t*)sycl::malloc_host(sizeof(int64_t) * nnz_, gpuQueue_); + A_rows_ = (int64_t*)sycl::malloc_host(sizeof(int64_t) * (m_ + 1), gpuQueue_); + + A_vals_device_ = (T*)sycl::malloc_device(sizeof(T) * nnz_, gpuQueue_); + A_cols_device_ = (int64_t*)sycl::malloc_device(sizeof(int64_t) * nnz_, gpuQueue_); + A_rows_device_ = (int64_t*)sycl::malloc_device(sizeof(int64_t) * (m_ + 1), gpuQueue_); + } + + memcpy(A_rows_, A_rows_store_, sizeof(int64_t) * (m_ + 1)); + memcpy(A_cols_, A_cols_store_, sizeof(int64_t) * nnz_); + memcpy(A_vals_, A_vals_store_, sizeof(T) * nnz_); + } + +private: + void preLoopRequirements() override { + switch(offload_) { + case gpuOffloadType::always: break; + case gpuOffloadType::once: { + // Moving memory over to device from host + gpuQueue_.memcpy(A_vals_device_, A_vals_, sizeof(T) * nnz_); + gpuQueue_.memcpy(A_cols_device_, A_cols_, sizeof(int64_t) * nnz_); + 
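+        // (queue::memcpy is asynchronous; the wait() below conservatively ensures
+        // these host-to-device copies have completed before the CSR handle is
+        // populated and used)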
gpuQueue_.memcpy(A_rows_device_, A_rows_, sizeof(int64_t) * (m_ + 1)); + gpuQueue_.memcpy(B_device_, B_, sizeof(T) * k_ * n_); + gpuQueue_.wait(); // Is this needed? + oneapi::mkl::sparse::init_matrix_handle(&A_device_); + oneapi::mkl::sparse::set_csr_data(gpuQueue_, + A_device_, + m_, + k_, + index_, + A_rows_device_, + A_cols_device_, + A_vals_device_); + gpuQueue_.wait_and_throw(); + break; + } + case gpuOffloadType::unified: { + // For unified memory, set up matrix handle once + oneapi::mkl::sparse::init_matrix_handle(&A_device_); + oneapi::mkl::sparse::set_csr_data(gpuQueue_, + A_device_, + m_, + k_, + index_, + A_rows_, + A_cols_, + A_vals_); + gpuQueue_.wait_and_throw(); + break; + } + } + } + + void callSpmdnm() override { + switch (offload_) { + case gpuOffloadType::always: { + // Copy data to device for this iteration + gpuQueue_.memcpy(A_vals_device_, A_vals_, sizeof(T) * nnz_); + gpuQueue_.memcpy(A_cols_device_, A_cols_, sizeof(int64_t) * nnz_); + gpuQueue_.memcpy(A_rows_device_, A_rows_, sizeof(int64_t) * (m_ + 1)); + gpuQueue_.memcpy(B_device_, B_, sizeof(T) * k_ * n_); + gpuQueue_.wait(); + + oneapi::mkl::sparse::init_matrix_handle(&A_device_); + oneapi::mkl::sparse::set_csr_data(gpuQueue_, + A_device_, + m_, + k_, + index_, + A_rows_device_, + A_cols_device_, + A_vals_device_); + gpuQueue_.wait(); + + // Do computation + try { + oneapi::mkl::sparse::gemm(gpuQueue_, + layout_, + operationA_, + operationB_, + alpha, + A_device_, + B_device_, + n_, + n_, + beta, + C_device_, + n_); + gpuQueue_.wait(); + } catch (sycl::exception const& e) { + std::cerr << "ERROR - Caught synchronous SYCL exception during " + "spmdnm (Always):\n" << e.what() << std::endl << + "OpenCL status: " << e.code().value() << std::endl; + exit(1); + } + + // Copy result back to host + gpuQueue_.memcpy(C_, C_device_, sizeof(T) * m_ * n_); + gpuQueue_.wait(); + + // Clean up matrix handle + oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &A_device_); + gpuQueue_.wait(); + break; + } + case gpuOffloadType::once: { + // Buffers already exist, just do computation + try { + oneapi::mkl::sparse::gemm(gpuQueue_, + layout_, + operationA_, + operationB_, + alpha, + A_device_, + B_device_, + n_, + n_, + beta, + C_device_, + n_); + gpuQueue_.wait(); + } catch (sycl::exception const& e) { + std::cerr << "ERROR - Caught synchronous SYCL exception during " + "spmdnm (Once):\n" << e.what() << std::endl << + "OpenCL status: " << e.code().value() << std::endl; + exit(1); + } + break; + } + case gpuOffloadType::unified: { + // Direct computation with unified memory + try { + oneapi::mkl::sparse::gemm(gpuQueue_, + layout_, + operationA_, + operationB_, + alpha, + A_device_, + B_, + n_, + n_, + beta, + C_, + n_); + gpuQueue_.wait_and_throw(); + } catch (sycl::exception const& e) { + std::cerr << "ERROR - Caught synchronous SYCL exception during spmdnm (Unified): " << e.what() << std::endl << "OpenCL status: " << e.code().value() << std::endl; + exit(1); + } + break; + } + } + } + + void postLoopRequirements() override { + // Clean up buffers that were created for the entire loop duration + if (offload_ == gpuOffloadType::once) { + gpuQueue_.memcpy(C_, C_device_, sizeof(T) * m_ * n_); + gpuQueue_.wait(); + } + + if (offload_ != gpuOffloadType::always) { + oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &A_device_); + } + } + + void postCallKernelCleanup() override { + if (offload_ == gpuOffloadType::unified) { + if (B_) { sycl::free(B_, gpuQueue_); B_ = nullptr; } + if (C_) { sycl::free(C_, gpuQueue_); C_ = 
nullptr; } + if (A_vals_) { sycl::free(A_vals_, gpuQueue_); A_vals_ = nullptr; } + if (A_cols_) { sycl::free(A_cols_, gpuQueue_); A_cols_ = nullptr; } + if (A_rows_) { sycl::free(A_rows_, gpuQueue_); A_rows_ = nullptr; } + + free(A_vals_store_); + free(A_cols_store_); + free(A_rows_store_); + } else { + if (B_) { sycl::free(B_, gpuQueue_); B_ = nullptr; } + if (C_) { sycl::free(C_, gpuQueue_); C_ = nullptr; } + if (A_vals_) { sycl::free(A_vals_, gpuQueue_); A_vals_ = nullptr; } + if (A_cols_) { sycl::free(A_cols_, gpuQueue_); A_cols_ = nullptr; } + if (A_rows_) { sycl::free(A_rows_, gpuQueue_); A_rows_ = nullptr; } + + if (A_vals_device_) { sycl::free(A_vals_device_, gpuQueue_); A_vals_device_ = nullptr; } + if (A_cols_device_) { sycl::free(A_cols_device_, gpuQueue_); A_cols_device_ = nullptr; } + if (A_rows_device_) { sycl::free(A_rows_device_, gpuQueue_); A_rows_device_ = nullptr; } + if (B_device_) { sycl::free(B_device_, gpuQueue_); B_device_ = nullptr; } + if (C_device_) { sycl::free(C_device_, gpuQueue_); C_device_ = nullptr; } + } + } + + bool firstRun_ = true; + + /** The GPU Device. */ + sycl::device myGpu_; + + /** The SYCL execution queue*/ + sycl::queue gpuQueue_; + + oneapi::mkl::layout layout_; + oneapi::mkl::transpose operationA_; + oneapi::mkl::transpose operationB_; + oneapi::mkl::index_base index_; + + T* A_vals_store_; + int64_t* A_cols_store_; + int64_t* A_rows_store_; + + T* A_vals_; + int64_t* A_cols_; + int64_t* A_rows_; + + oneapi::mkl::sparse::matrix_handle_t A_device_; + + T* A_vals_device_; + int64_t* A_cols_device_; + int64_t* A_rows_device_; + T* B_device_; + T* C_device_; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + +#endif diff --git a/oneMKL/GPU/spmdnv.hh b/oneMKL/GPU/spmdnv.hh new file mode 100644 index 0000000..3457aa8 --- /dev/null +++ b/oneMKL/GPU/spmdnv.hh @@ -0,0 +1,409 @@ +#pragma once + +#ifdef GPU_ONEMKL + +#include "../../include/kernels/GPU/spmdnv.hh" +#include "../../include/utilities.hh" +#include "common.hh" + +namespace gpu { +template +class spmdnv_gpu : public spmdnv { +public: + using spmdnv::spmdnv; + using spmdnv::initInputMatrixVector; + using spmdnv::nnz_; + using spmdnv::m_; + using spmdnv::n_; + using spmdnv::x_; + using spmdnv::y_; + using spmdnv::offload_; + using spmdnv::sparsity_; + using spmdnv::type_; + + + void initialise(gpuOffloadType offload, int m, int n, double sparsity, + matrixType type) + override { + try { + myGpu_ = sycl::device(sycl::gpu_selector_v); + } catch (const std::exception& e) { + std::cerr << "ERROR - No GPU device found: " << e.what() << '\n'; + std::terminate(); + } + auto exception_handler = [](sycl::exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } catch (sycl::exception const &e) { + std::cerr << "Caught asynchronous SYCL exception during sparse::gemv:\n" << e.what() << std::endl; + } + } + }; + + gpuQueue_ = sycl::queue(myGpu_, exception_handler); + context_ = gpuQueue_.get_context(); + + x_ = nullptr; + y_ = nullptr; + + offload_ = offload; + sparsity_ = sparsity; + type_ = type; + m_ = m; + n_ = n; + + index_ = oneapi::mkl::index_base::zero; + operation_ = oneapi::mkl::transpose::nontrans; + + nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_)); + + if (offload_ == gpuOffloadType::unified) { + x_ = sycl::malloc_shared(n_, gpuQueue_); + y_ = sycl::malloc_shared(m_, gpuQueue_); + if (!x_ || !y_) { + std::cerr << "ERROR - Failed to allocate memory for GPU SpMDnV" << std::endl; + exit(1); + } + } else { + 
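+      // Explicit-offload paths ("always"/"once"): x_ and y_ live in host USM and
+      // are mirrored by separate device buffers that are filled via queue::memcpy
+      // before the kernel runs.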
x_ = sycl::malloc_host(n_, gpuQueue_); + y_ = sycl::malloc_host(m_, gpuQueue_); + x_device_ = sycl::malloc_device(n_, gpuQueue_); + y_device_ = sycl::malloc_device(m_, gpuQueue_); + if (!x_ || !y_) { + std::cerr << "ERROR - Failed to allocate host memory" << std::endl; + exit(1); + } + } + gpuQueue_.wait_and_throw(); + + initInputMatrixVector(); + gpuQueue_.wait_and_throw(); + } + + +protected: + void toSparseFormat() override { + if (offload_ == gpuOffloadType::always) { + A_vals_store_ = (T*)malloc(nnz_ * sizeof(T)); + A_cols_store_ = (int64_t*)malloc(nnz_ * sizeof(int64_t)); + A_rows_store_ = (int64_t*)malloc((m_ + 1) * sizeof(int64_t)); + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, n_, nnz_); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, n_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, n_, nnz_); + } else { + std::cerr << "Matrix type not supported" << std::endl; + exit(1); + } + } + + if (offload_ == gpuOffloadType::unified) { + A_vals_ = sycl::malloc_shared(nnz_, gpuQueue_); + A_cols_ = sycl::malloc_shared(nnz_, gpuQueue_); + A_rows_ = sycl::malloc_shared(m_ + 1, gpuQueue_); + } else { + A_vals_ = sycl::malloc_host(nnz_, gpuQueue_); + A_cols_ = sycl::malloc_host(nnz_, gpuQueue_); + A_rows_ = sycl::malloc_host(m_ + 1, gpuQueue_); + A_vals_device_ = (T*)sycl::malloc_device(nnz_ * sizeof(T), gpuQueue_); + A_cols_device_ = (int64_t*)sycl::malloc_device(nnz_ * sizeof(int64_t), gpuQueue_); + A_rows_device_ = (int64_t*)sycl::malloc_device((m_ + 1) * sizeof(int64_t), gpuQueue_); + } + + memcpy(A_rows_, A_rows_store_, static_cast(m_ + 1) * sizeof(int64_t)); + memcpy(A_cols_, A_cols_store_, static_cast(nnz_) * sizeof(int64_t)); + memcpy(A_vals_, A_vals_store_, static_cast(nnz_) * sizeof(T)); + } + +private: + void preLoopRequirements() override { + if (offload_ == gpuOffloadType::once) { + gpuQueue_.memcpy(A_vals_device_, A_vals_, sizeof(T) * nnz_); + gpuQueue_.memcpy(A_cols_device_, A_cols_, sizeof(int64_t) * nnz_); + gpuQueue_.memcpy(A_rows_device_, A_rows_, sizeof(int64_t) * (m_ + 1)); + gpuQueue_.memcpy(x_device_, x_, sizeof(T) * n_); + gpuQueue_.wait_and_throw(); + } + } + + void callSpMDnV() override { + switch (offload_) { + case gpuOffloadType::always: { + gpuQueue_.memcpy(A_vals_device_, A_vals_, sizeof(T) * nnz_); + gpuQueue_.memcpy(A_cols_device_, A_cols_, sizeof(int64_t) * nnz_); + gpuQueue_.memcpy(A_rows_device_, A_rows_, sizeof(int64_t) * (m_ + 1)); + gpuQueue_.memcpy(x_device_, x_, sizeof(T) * n_); + gpuQueue_.wait_and_throw(); + // Do computation + try { + oneapi::mkl::sparse::init_matrix_handle(&handle_); + auto set = oneapi::mkl::sparse::set_csr_data(gpuQueue_, + handle_, + m_, + n_, + index_, + A_rows_device_, + A_cols_device_, + A_vals_device_); + + auto optimise = oneapi::mkl::sparse::optimize_gemv(gpuQueue_, + operation_, + handle_, + {set}); + + auto gemv = oneapi::mkl::sparse::gemv(gpuQueue_, + operation_, + alpha, + handle_, + x_device_, + beta, + y_device_, + {optimise}); + + auto release = oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &handle_, {gemv}); + release.wait_and_throw(); + } catch (sycl::exception const& e) { + gpuQueue_.wait(); + oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &handle_); + std::cerr << "ERROR - Caught synchronous SYCL exception during SPGEMV (Once):\n" << e.what() << std::endl << "OpenCL status: " << e.code().value() << std::endl; + } catch 
(std::exception const &e) { + std::cerr << "\t\tCaught std exception:\n" << e.what() << std::endl; + gpuQueue_.wait(); + oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &handle_); + exit(1); + } + gpuQueue_.memcpy(y_, y_device_, sizeof(T) * m_); + break; + } + case gpuOffloadType::once: { + try { + oneapi::mkl::sparse::init_matrix_handle(&handle_); + if (handle_ == nullptr) { + std::cerr << "ERROR - Failed to initialise matrix handle" << std::endl; + exit(1); + } + gpuQueue_.wait_and_throw(); + auto set = oneapi::mkl::sparse::set_csr_data(gpuQueue_, + handle_, + m_, + n_, + index_, + A_rows_device_, + A_cols_device_, + A_vals_device_, + {}); + + auto optimise = oneapi::mkl::sparse::optimize_gemv(gpuQueue_, + operation_, + handle_, + {set}); + + auto gemv = oneapi::mkl::sparse::gemv(gpuQueue_, + operation_, + alpha, + handle_, + x_device_, + beta, + y_device_, + {optimise}); + auto release = oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &handle_, {gemv}); + release.wait_and_throw(); + + handle_ = nullptr; // Reset handle to avoid double free + } catch (sycl::exception const& e) { + gpuQueue_.wait(); + oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &handle_); + std::cerr << "ERROR - Caught synchronous SYCL exception during SpMDnV (Once):\n" << e.what() << std::endl << "OpenCL status: " << e.code().value() << std::endl; + } catch (std::exception const &e) { + std::cerr << "\t\tCaught std exception:\n" << e.what() << std::endl; + gpuQueue_.wait(); + oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &handle_); + exit(1); + } + break; + } + case gpuOffloadType::unified: { + try { + std::vector int_ptr_vec; + int_ptr_vec.push_back(A_cols_); + int_ptr_vec.push_back(A_rows_); + std::vector float_ptr_vec; + float_ptr_vec.push_back(A_vals_); + float_ptr_vec.push_back(x_); + float_ptr_vec.push_back(y_); + + + handle_ = nullptr; + oneapi::mkl::sparse::init_matrix_handle(&handle_); + if (handle_ == nullptr) { + std::cerr << "ERROR - Failed to initialise matrix handle" << std::endl; + exit(1); + } + gpuQueue_.wait_and_throw(); + + auto set = oneapi::mkl::sparse::set_csr_data(gpuQueue_, + handle_, + m_, + n_, + index_, + A_rows_, + A_cols_, + A_vals_, + {}); + + auto optimise = oneapi::mkl::sparse::optimize_gemv(gpuQueue_, + operation_, + handle_, + {set}); + + auto gemv = oneapi::mkl::sparse::gemv(gpuQueue_, + operation_, + alpha, + handle_, + x_, + beta, + y_, + {optimise}); + + auto release = oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &handle_, {gemv}); + release.wait_and_throw(); + + handle_ = nullptr; // Reset handle to avoid double free + } catch (sycl::exception const& e) { + gpuQueue_.wait(); + oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &handle_); + std::cerr << "ERROR - Caught synchronous SYCL exception during SpMDnV (Once):\n" << e.what() << std::endl << "OpenCL status: " << e.code().value() << std::endl; + } catch (std::exception const &e) { + std::cerr << "\t\tCaught std exception:\n" << e.what() << std::endl; + gpuQueue_.wait(); + oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &handle_).wait(); + exit(1); + } + break; + } + } + } + + void postLoopRequirements() override { + if (offload_ == gpuOffloadType::once) { + gpuQueue_.memcpy(y_, y_device_, sizeof(T) * m_); + gpuQueue_.wait_and_throw(); + } + } + + void postCallKernelCleanup() override { + switch (offload_) { + case gpuOffloadType::always: + case gpuOffloadType::once: { + if (A_vals_ != nullptr) { + sycl::free(A_vals_, context_); + A_vals_ = nullptr; + } + if (A_cols_ != nullptr) { + 
sycl::free(A_cols_, context_); + A_cols_ = nullptr; + } + if (A_rows_ != nullptr) { + sycl::free(A_rows_, context_); + A_rows_ = nullptr; + } + if (A_vals_device_ != nullptr) { + sycl::free(A_vals_device_, context_); + A_vals_device_ = nullptr; + } + if (A_cols_device_ != nullptr) { + sycl::free(A_cols_device_, context_); + A_cols_device_ = nullptr; + } + if (A_rows_device_ != nullptr) { + sycl::free(A_rows_device_, context_); + A_rows_device_ = nullptr; + } + if (x_ != nullptr) { + sycl::free(x_, context_); + x_ = nullptr; + } + if (y_ != nullptr) { + sycl::free(y_, context_); + y_ = nullptr; + } + if (x_device_ != nullptr) { + sycl::free(x_device_, context_); + x_device_ = nullptr; + } + if (y_device_ != nullptr) { + sycl::free(y_device_, context_); + y_device_ = nullptr; + } + } + case gpuOffloadType::unified: { + if (A_vals_ != nullptr) { + sycl::free(A_vals_, context_); + A_vals_ = nullptr; + } + if (A_cols_ != nullptr) { + sycl::free(A_cols_, context_); + A_cols_ = nullptr; + } + if (A_rows_ != nullptr) { + sycl::free(A_rows_, context_); + A_rows_ = nullptr; + } + if (x_ != nullptr) { + sycl::free(x_, context_); + x_ = nullptr; + } + if (y_ != nullptr) { + sycl::free(y_, context_); + y_ = nullptr; + } + break; + } + } + if (offload_ == gpuOffloadType::unified) { + free(A_vals_store_); + free(A_cols_store_); + free(A_rows_store_); + } + gpuQueue_.wait_and_throw(); + } + + /** Whether the initialise function has been called before. */ + bool alreadyInitialised_ = false; + + /** The GPU Device. */ + sycl::device myGpu_; + + /** The SYCL execution queue*/ + sycl::queue gpuQueue_; + + sycl::context context_; + + oneapi::mkl::index_base index_; + oneapi::mkl::transpose operation_; + + T* A_vals_store_ = nullptr; + int64_t* A_cols_store_ = nullptr; + int64_t* A_rows_store_ = nullptr; + + T* A_vals_ = nullptr; + int64_t* A_cols_ = nullptr; + int64_t* A_rows_ = nullptr; + + oneapi::mkl::sparse::matrix_handle_t handle_ = nullptr; + + T* A_vals_device_ = nullptr; + int64_t* A_cols_device_ = nullptr; + int64_t* A_rows_device_ = nullptr; + T* x_device_ = nullptr; + T* y_device_ = nullptr; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + +#endif diff --git a/oneMKL/GPU/spmspm.hh b/oneMKL/GPU/spmspm.hh new file mode 100644 index 0000000..1bdec86 --- /dev/null +++ b/oneMKL/GPU/spmspm.hh @@ -0,0 +1,870 @@ +#pragma once + +#ifdef GPU_ONEMKL + +#include +#include "../../include/kernels/GPU/spmspm.hh" +#include "../../include/utilities.hh" +#include "common.hh" + +namespace gpu { +template +class spmspm_gpu : public spmspm { +public: + using spmspm::spmspm; + using spmspm::initInputMatrices; + using spmspm::A_nnz_; + using spmspm::B_nnz_; + using spmspm::C_nnz_; + using spmspm::m_; + using spmspm::n_; + using spmspm::k_; + using spmspm::C_rows_; + using spmspm::C_cols_; + using spmspm::C_vals_; + using spmspm::offload_; + using spmspm::sparsity_; + using spmspm::type_; + + void initialise(gpuOffloadType offload, int m, int n, int k, + double sparsity, matrixType type, + bool binary = false) override { + firstRun_ = true; + if (!initialised_) { + // Set up the sycl parameters + device_ = sycl::device(sycl::gpu_selector_v); + queue_ = sycl::queue(device_, exception_handler); + context_ = queue_.get_context(); + auto dev = queue_.get_device(); + initialised_ = true; + } + + // Storing initialise parameters into global variables + m_ = m; + n_ = n; + k_ = k; + sparsity_ = sparsity; + type_ = type; + offload_ = offload; + + // Calculating starting matrix NNZ values + A_nnz_ = 1 + 
(uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + B_nnz_ = 1 + (uint64_t)((double)k_ * (double)n_ * (1.0 - sparsity_)); + + switch (offload_) { + case gpuOffloadType::always: { + A_rows_store_ = (int64_t*)malloc(static_cast(m_ + 1) * sizeof(int64_t)); + A_cols_store_ = (int64_t*)malloc(static_cast(A_nnz_) * sizeof(int64_t)); + A_vals_store_ = (T*)malloc(static_cast(A_nnz_) * sizeof(T)); + B_rows_store_ = (int64_t*)malloc(static_cast(k_ + 1) * sizeof(int64_t)); + B_cols_store_ = (int64_t*)malloc(static_cast(B_nnz_) * sizeof(int64_t)); + B_vals_store_ = (T*)malloc(static_cast(B_nnz_) * sizeof(T)); + + A_rows_ = sycl::malloc_host(static_cast(m_ + 1), queue_); + A_cols_ = sycl::malloc_host(static_cast(A_nnz_), queue_); + A_vals_ = sycl::malloc_host(static_cast(A_nnz_), queue_); + A_rows_device_ = sycl::malloc_device(static_cast(m_ + 1), queue_); + A_cols_device_ = sycl::malloc_device(static_cast(A_nnz_), queue_); + A_vals_device_ = sycl::malloc_device(static_cast(A_nnz_), queue_); + + B_rows_ = sycl::malloc_host(static_cast(k_ + 1), queue_); + B_cols_ = sycl::malloc_host(static_cast(B_nnz_), queue_); + B_vals_ = sycl::malloc_host(static_cast(B_nnz_), queue_); + B_rows_device_ = sycl::malloc_device(static_cast(k_ + 1), queue_); + B_cols_device_ = sycl::malloc_device(static_cast(B_nnz_), queue_); + B_vals_device_ = sycl::malloc_device(static_cast(B_nnz_), queue_); + + C_rows_ = nullptr; + C_cols_ = nullptr; + C_vals_ = nullptr; + C_rows_device_ = nullptr; + C_cols_device_ = nullptr; + C_vals_device_ = nullptr; + break; + } + case gpuOffloadType::once: { + A_rows_ = sycl::malloc_host(static_cast(m_ + 1), queue_); + A_cols_ = sycl::malloc_host(static_cast(A_nnz_), queue_); + A_vals_ = sycl::malloc_host(static_cast(A_nnz_), queue_); + A_rows_device_ = sycl::malloc_device(static_cast(m_ + 1), queue_); + A_cols_device_ = sycl::malloc_device(static_cast(A_nnz_), queue_); + A_vals_device_ = sycl::malloc_device(static_cast(A_nnz_), queue_); + + B_rows_ = sycl::malloc_host(static_cast(k_ + 1), queue_); + B_cols_ = sycl::malloc_host(static_cast(B_nnz_), queue_); + B_vals_ = sycl::malloc_host(static_cast(B_nnz_), queue_); + B_rows_device_ = sycl::malloc_device(static_cast(k_ + 1), queue_); + B_cols_device_ = sycl::malloc_device(static_cast(B_nnz_), queue_); + B_vals_device_ = sycl::malloc_device(static_cast(B_nnz_), queue_); + + C_rows_ = nullptr; + C_cols_ = nullptr; + C_vals_ = nullptr; + C_rows_device_ = nullptr; + C_cols_device_ = nullptr; + C_vals_device_ = nullptr; + break; + } + case gpuOffloadType::unified: { + A_rows_ = sycl::malloc_shared(static_cast(m_ + 1), queue_); + A_cols_ = sycl::malloc_shared(static_cast(A_nnz_), queue_); + A_vals_ = sycl::malloc_shared(static_cast(A_nnz_), queue_); + + B_rows_ = sycl::malloc_shared(static_cast(k_ + 1), queue_); + B_cols_ = sycl::malloc_shared(static_cast(B_nnz_), queue_); + B_vals_ = sycl::malloc_shared(static_cast(B_nnz_), queue_); + + C_rows_ = nullptr; + C_cols_ = nullptr; + C_vals_ = nullptr; + break; + } + } + queue_.wait_and_throw(); + initInputMatrices(); + } + +protected: + void toSparseFormat() override { + if (offload_ == gpuOffloadType::always) { + int seedOffset = 0; + do { + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, A_nnz_, SEED + seedOffset++); + rMatCSR(B_vals_store_, B_cols_store_, B_rows_store_, k_, n_, B_nnz_, SEED + seedOffset++); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, A_nnz_, SEED + seedOffset++); + 
randomCSR(B_vals_store_, B_cols_store_, B_rows_store_, k_, n_, B_nnz_, SEED + seedOffset++); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, A_nnz_, SEED + seedOffset++); + finiteElementCSR(B_vals_store_, B_cols_store_, B_rows_store_, k_, n_, B_nnz_, SEED + seedOffset++); + } else { + std::cerr << "Unknown matrix type" << std::endl; + exit(1); + } + } while (calcCNNZ(m_, A_nnz_, A_rows_store_, A_cols_store_, k_, B_nnz_, B_rows_store_, B_cols_store_) == 0); + } + + memcpy(A_rows_, A_rows_store_, static_cast(m_ + 1) * sizeof(int64_t)); + memcpy(A_cols_, A_cols_store_, static_cast(A_nnz_) * sizeof(int64_t)); + memcpy(A_vals_, A_vals_store_, static_cast(A_nnz_) * sizeof(T)); + memcpy(B_rows_, B_rows_store_, static_cast(k_ + 1) * sizeof(int64_t)); + memcpy(B_cols_, B_cols_store_, static_cast(B_nnz_) * sizeof(int64_t)); + memcpy(B_vals_, B_vals_store_, static_cast(B_nnz_) * sizeof(T)); + } + +private: + void preLoopRequirements() override { + switch (offload_) { + case gpuOffloadType::always: { + // Nothing to do, does it all in the callSpmspm loop + break; + } + case gpuOffloadType::once: { + auto ARows = queue_.copy(A_rows_, A_rows_device_, static_cast(m_ + 1)); + auto ACols = queue_.copy(A_cols_, A_cols_device_, static_cast(A_nnz_)); + auto AVals = queue_.copy(A_vals_, A_vals_device_, static_cast(A_nnz_)); + + auto BRows = queue_.copy(B_rows_, B_rows_device_, static_cast(k_ + 1)); + auto BCols = queue_.copy(B_cols_, B_cols_device_, static_cast(B_nnz_)); + auto BVals = queue_.copy(B_vals_, B_vals_device_, static_cast(B_nnz_)); + + ARows.wait(); + ACols.wait(); + AVals.wait(); + BRows.wait(); + BCols.wait(); + BVals.wait(); + break; + } + case gpuOffloadType::unified: { + // Nothing to do here as shared memory + break; + } + } + } + + void callSpmspm() override { + switch (offload_) { + case gpuOffloadType::always: { + if (!firstRun_) { + sycl::free(C_rows_, queue_); + sycl::free(C_cols_, queue_); + sycl::free(C_vals_, queue_); + } + + auto ARows = queue_.copy(A_rows_, A_rows_device_, static_cast(m_ + 1)); + auto ACols = queue_.copy(A_cols_, A_cols_device_, static_cast(A_nnz_)); + auto AVals = queue_.copy(A_vals_, A_vals_device_, static_cast(A_nnz_)); + + auto BRows = queue_.copy(B_rows_, B_rows_device_, static_cast(k_ + 1)); + auto BCols = queue_.copy(B_cols_, B_cols_device_, static_cast(B_nnz_)); + auto BVals = queue_.copy(B_vals_, B_vals_device_, static_cast(B_nnz_)); + + C_rows_device_ = sycl::malloc_device(static_cast(m_ + 1), queue_); + + try { + oneapi::mkl::sparse::init_matrix_handle(&A_handle_); + oneapi::mkl::sparse::init_matrix_handle(&B_handle_); + oneapi::mkl::sparse::init_matrix_handle(&C_handle_); + + auto setA = oneapi::mkl::sparse::set_csr_data(queue_, + A_handle_, + m_, + k_, + AIndex_, + A_rows_device_, + A_cols_device_, + A_vals_device_, + {ARows, ACols, AVals}); + auto setB = oneapi::mkl::sparse::set_csr_data(queue_, + B_handle_, + k_, + n_, + BIndex_, + B_rows_device_, + B_cols_device_, + B_vals_device_, + {BRows, BCols, BVals}); + auto setC = oneapi::mkl::sparse::set_csr_data(queue_, + C_handle_, + m_, + n_, + CIndex_, + C_rows_device_, + (int64_t*)nullptr, + (T*)nullptr, + {}); + + oneapi::mkl::sparse::init_matmat_descr(&description_); + + oneapi::mkl::sparse::set_matmat_data(description_, + viewA_, + opA_, + viewB_, + opB_, + viewC_); + + request_ = oneapi::mkl::sparse::matmat_request::get_work_estimation_buf_size; + sizeTempBuffer = sycl::malloc_host(1, queue_); + + auto ev1_1 = 
oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer, + nullptr, + {setA, setB, setC}); + ev1_1.wait(); + + tempBuffer = sycl::malloc_device(sizeTempBuffer[0], queue_); + + request_ = oneapi::mkl::sparse::matmat_request::work_estimation; + auto ev1_3 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer, + tempBuffer, + {ev1_1}); + + request_ = oneapi::mkl::sparse::matmat_request::get_compute_buf_size; + + sizeTempBuffer2 = sycl::malloc_host(1, queue_); + + auto ev2_1 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer2, + nullptr, + {ev1_3}); + ev2_1.wait(); + + tempBuffer2 = sycl::malloc_device(sizeTempBuffer2[0], queue_); + + request_ = oneapi::mkl::sparse::matmat_request::compute; + auto ev2_3 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer2, + tempBuffer2, + {ev2_1}); + + request_ = oneapi::mkl::sparse::matmat_request::get_nnz; + + cNnzBuffer = sycl::malloc_host(1, queue_); + + auto ev3_1 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + cNnzBuffer, + nullptr, + {ev2_3}); + ev3_1.wait_and_throw(); + + C_nnz_ = cNnzBuffer[0]; + C_cols_device_ = sycl::malloc_device(static_cast(C_nnz_), queue_); + C_vals_device_ = sycl::malloc_device(static_cast(C_nnz_), queue_); + + setC = oneapi::mkl::sparse::set_csr_data(queue_, + C_handle_, + m_, + n_, + CIndex_, + C_rows_device_, + C_cols_device_, + C_vals_device_, + {ev3_1}); + + request_ = oneapi::mkl::sparse::matmat_request::finalize; + auto ev3_3 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + nullptr, + nullptr, + {setC}); + + auto ev_sort = oneapi::mkl::sparse::sort_matrix(queue_, C_handle_, {ev3_3}); + + C_rows_ = sycl::malloc_host(static_cast(m_ + 1), queue_); + C_cols_ = sycl::malloc_host(static_cast(C_nnz_), queue_); + C_vals_ = sycl::malloc_host(static_cast(C_nnz_), queue_); + + auto CRows = queue_.copy(C_rows_device_, C_rows_, static_cast(m_ + 1)); + auto CCols = queue_.copy(C_cols_device_, C_cols_, static_cast(C_nnz_)); + auto CVals = queue_.copy(C_vals_device_, C_vals_, static_cast(C_nnz_)); + CRows.wait(); + CCols.wait(); + CVals.wait(); + + oneapi::mkl::sparse::release_matmat_descr(&description_); + oneapi::mkl::sparse::release_matrix_handle(queue_, &A_handle_).wait(); + oneapi::mkl::sparse::release_matrix_handle(queue_, &B_handle_).wait(); + oneapi::mkl::sparse::release_matrix_handle(queue_, &C_handle_).wait(); + } catch (sycl::exception const &e) { + std::cerr << "\t\tCaught synchronous SYCL exception:\n" << e.what() << std::endl; + queue_.wait(); + oneapi::mkl::sparse::release_matmat_descr(&description_); + oneapi::mkl::sparse::release_matrix_handle(queue_, &A_handle_).wait(); + oneapi::mkl::sparse::release_matrix_handle(queue_, &B_handle_).wait(); + oneapi::mkl::sparse::release_matrix_handle(queue_, &C_handle_).wait(); + } + sycl::free(sizeTempBuffer, queue_); + sycl::free(sizeTempBuffer2, queue_); + sycl::free(tempBuffer, queue_); + sycl::free(tempBuffer2, queue_); + sycl::free(cNnzBuffer, queue_); + sycl::free(C_rows_device_, queue_); + sycl::free(C_cols_device_, queue_); + sycl::free(C_vals_device_, queue_); + break; + } + case gpuOffloadType::once: { + // If already allocated, free the device C arrays + if (!firstRun_) { + 
sycl::free(C_rows_device_, queue_); + sycl::free(C_cols_device_, queue_); + sycl::free(C_vals_device_, queue_); + } + + C_rows_device_ = sycl::malloc_device(static_cast(m_ + 1), queue_); + + oneapi::mkl::sparse::init_matrix_handle(&A_handle_); + oneapi::mkl::sparse::init_matrix_handle(&B_handle_); + oneapi::mkl::sparse::init_matrix_handle(&C_handle_); + + auto setA = oneapi::mkl::sparse::set_csr_data(queue_, + A_handle_, + m_, + k_, + AIndex_, + A_rows_device_, + A_cols_device_, + A_vals_device_, + {}); + auto setB = oneapi::mkl::sparse::set_csr_data(queue_, + B_handle_, + k_, + n_, + BIndex_, + B_rows_device_, + B_cols_device_, + B_vals_device_, + {}); + auto setC = oneapi::mkl::sparse::set_csr_data(queue_, + C_handle_, + m_, + n_, + CIndex_, + C_rows_device_, + (int64_t*)nullptr, + (T*)nullptr, + {}); + + oneapi::mkl::sparse::init_matmat_descr(&description_); + + oneapi::mkl::sparse::set_matmat_data(description_, + viewA_, + opA_, + viewB_, + opB_, + viewC_); + + request_ = oneapi::mkl::sparse::matmat_request::get_work_estimation_buf_size; + sizeTempBuffer = sycl::malloc_host(1, queue_); + auto ev1_1 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer, + nullptr, + {setA, setB, setC}); + ev1_1.wait(); + + tempBuffer = sycl::malloc_device(sizeTempBuffer[0], queue_); + if (!tempBuffer) throw std::runtime_error("Could not allocate memory"); + + request_ = oneapi::mkl::sparse::matmat_request::work_estimation; + auto ev1_3 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer, + tempBuffer, + {ev1_1}); + + request_ = oneapi::mkl::sparse::matmat_request::get_compute_buf_size; + sizeTempBuffer2 = sycl::malloc_host(1, queue_); + if (!sizeTempBuffer2) throw std::runtime_error("Could not allocate memory"); + auto ev2_1 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer2, + nullptr, + {ev1_3}); + ev2_1.wait(); + + tempBuffer2 = sycl::malloc_device(sizeTempBuffer2[0], queue_); + if (!tempBuffer2) throw std::runtime_error("Could not allocate memory"); + + request_ = oneapi::mkl::sparse::matmat_request::compute; + auto ev2_3 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer2, + tempBuffer2, + {ev2_1}); + + request_ = oneapi::mkl::sparse::matmat_request::get_nnz; + cNnzBuffer = sycl::malloc_host(1, queue_); + if (!cNnzBuffer) throw std::runtime_error("Could not allocate memory"); + auto ev3_1 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + cNnzBuffer, + nullptr, + {ev2_3}); + ev3_1.wait(); + + C_nnz_ = cNnzBuffer[0]; + C_cols_device_ = sycl::malloc_device(static_cast(C_nnz_), queue_); + C_vals_device_ = sycl::malloc_device(static_cast(C_nnz_), queue_); + + setC = oneapi::mkl::sparse::set_csr_data(queue_, + C_handle_, + m_, + n_, + CIndex_, + C_rows_device_, + C_cols_device_, + C_vals_device_, + {ev3_1}); + + request_ = oneapi::mkl::sparse::matmat_request::finalize; + auto ev3_3 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + nullptr, + nullptr, + {setC}); + + auto ev_sort = oneapi::mkl::sparse::sort_matrix(queue_, C_handle_, {ev3_3}); + + oneapi::mkl::sparse::release_matmat_descr(&description_); + oneapi::mkl::sparse::release_matrix_handle(queue_, &A_handle_).wait(); + 
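// --- Illustrative sketch (hypothetical helper, not part of the diff above) --
// The always/once/unified cases in callSpmspm() all drive
// oneapi::mkl::sparse::matmat through the same six-stage request sequence;
// only where the CSR arrays live (host copies, explicit device buffers, or
// shared USM) differs. The sketch below condenses that sequence for matrix
// handles and a matmat descriptor that have already been populated via
// set_csr_data / set_matmat_data, mirroring the call signatures used in this
// file. spgemmStagesSketch and its parameter names are invented for
// illustration; C_rows must already point at m + 1 device entries.
template <typename T>
void spgemmStagesSketch(sycl::queue& q,
                        oneapi::mkl::sparse::matrix_handle_t A,
                        oneapi::mkl::sparse::matrix_handle_t B,
                        oneapi::mkl::sparse::matrix_handle_t C,
                        oneapi::mkl::sparse::matmat_descr_t descr,
                        int64_t m, int64_t n, oneapi::mkl::index_base base,
                        int64_t* C_rows, int64_t*& C_cols, T*& C_vals,
                        int64_t& C_nnz) {
  using oneapi::mkl::sparse::matmat;
  using oneapi::mkl::sparse::matmat_request;

  // 1) Query the scratch size for work estimation, then run that stage.
  int64_t* estSize = sycl::malloc_host<int64_t>(1, q);
  auto e1 = matmat(q, A, B, C, matmat_request::get_work_estimation_buf_size,
                   descr, estSize, nullptr, {});
  e1.wait();
  void* estBuf = sycl::malloc_device(static_cast<size_t>(estSize[0]), q);
  auto e2 = matmat(q, A, B, C, matmat_request::work_estimation,
                   descr, estSize, estBuf, {e1});

  // 2) Same pattern for the compute stage.
  int64_t* cmpSize = sycl::malloc_host<int64_t>(1, q);
  auto e3 = matmat(q, A, B, C, matmat_request::get_compute_buf_size,
                   descr, cmpSize, nullptr, {e2});
  e3.wait();
  void* cmpBuf = sycl::malloc_device(static_cast<size_t>(cmpSize[0]), q);
  auto e4 = matmat(q, A, B, C, matmat_request::compute,
                   descr, cmpSize, cmpBuf, {e3});

  // 3) Read back nnz(C), size C's column/value arrays, finalize and sort.
  int64_t* nnzBuf = sycl::malloc_host<int64_t>(1, q);
  auto e5 = matmat(q, A, B, C, matmat_request::get_nnz,
                   descr, nnzBuf, nullptr, {e4});
  e5.wait();
  C_nnz = nnzBuf[0];
  C_cols = sycl::malloc_device<int64_t>(static_cast<size_t>(C_nnz), q);
  C_vals = sycl::malloc_device<T>(static_cast<size_t>(C_nnz), q);
  auto e6 = oneapi::mkl::sparse::set_csr_data(q, C, m, n, base,
                                              C_rows, C_cols, C_vals, {e5});
  auto e7 = matmat(q, A, B, C, matmat_request::finalize,
                   descr, nullptr, nullptr, {e6});
  oneapi::mkl::sparse::sort_matrix(q, C, {e7}).wait();

  sycl::free(estSize, q);
  sycl::free(estBuf, q);
  sycl::free(cmpSize, q);
  sycl::free(cmpBuf, q);
  sycl::free(nnzBuf, q);
}
// -----------------------------------------------------------------------------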
oneapi::mkl::sparse::release_matrix_handle(queue_, &B_handle_).wait(); + oneapi::mkl::sparse::release_matrix_handle(queue_, &C_handle_).wait(); + sycl::free(sizeTempBuffer, queue_); + sycl::free(sizeTempBuffer2, queue_); + sycl::free(tempBuffer, queue_); + sycl::free(tempBuffer2, queue_); + sycl::free(cNnzBuffer, queue_); + break; + } + case gpuOffloadType::unified: { + // If already allocated, free the device C arrays + if (!firstRun_) { + sycl::free(C_rows_, queue_); + sycl::free(C_cols_, queue_); + sycl::free(C_vals_, queue_); + } + + C_rows_ = sycl::malloc_shared(static_cast(m_ + 1), queue_); + + oneapi::mkl::sparse::init_matrix_handle(&A_handle_); + oneapi::mkl::sparse::init_matrix_handle(&B_handle_); + oneapi::mkl::sparse::init_matrix_handle(&C_handle_); + + auto setA = oneapi::mkl::sparse::set_csr_data(queue_, + A_handle_, + m_, + k_, + AIndex_, + A_rows_, + A_cols_, + A_vals_, + {}); + auto setB = oneapi::mkl::sparse::set_csr_data(queue_, + B_handle_, + k_, + n_, + BIndex_, + B_rows_, + B_cols_, + B_vals_, + {}); + auto setC = oneapi::mkl::sparse::set_csr_data(queue_, + C_handle_, + m_, + n_, + CIndex_, + C_rows_, + (int64_t*)nullptr, + (T*)nullptr, + {}); + + oneapi::mkl::sparse::init_matmat_descr(&description_); + + oneapi::mkl::sparse::set_matmat_data(description_, + viewA_, + opA_, + viewB_, + opB_, + viewC_); + + request_ = oneapi::mkl::sparse::matmat_request::get_work_estimation_buf_size; + sizeTempBuffer = sycl::malloc_host(1, queue_); + if (!sizeTempBuffer) throw std::runtime_error("Could not allocate memory"); + auto ev1_1 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer, + nullptr, + {setA, setB, setC}); + ev1_1.wait(); + + tempBuffer = sycl::malloc_device(sizeTempBuffer[0], queue_); + if (!tempBuffer) throw std::runtime_error("Could not allocate memory"); + + request_ = oneapi::mkl::sparse::matmat_request::work_estimation; + auto ev1_3 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer, + tempBuffer, + {ev1_1}); + + request_ = oneapi::mkl::sparse::matmat_request::get_compute_buf_size; + sizeTempBuffer2 = sycl::malloc_host(1, queue_); + if (!sizeTempBuffer2) throw std::runtime_error("Could not allocate memory"); + auto ev2_1 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer2, + nullptr, + {ev1_3}); + ev2_1.wait(); + + tempBuffer2 = sycl::malloc_device(sizeTempBuffer2[0], queue_); + if (!tempBuffer2) throw std::runtime_error("Could not allocate memory"); + + request_ = oneapi::mkl::sparse::matmat_request::compute; + auto ev2_3 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer2, + tempBuffer2, + {ev2_1}); + + request_ = oneapi::mkl::sparse::matmat_request::get_nnz; + cNnzBuffer = sycl::malloc_shared(1, queue_); + if (!cNnzBuffer) throw std::runtime_error("Could not allocate memory"); + auto ev3_1 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + cNnzBuffer, + nullptr, + {ev2_3}); + ev3_1.wait(); + + C_nnz_ = cNnzBuffer[0]; + C_cols_ = sycl::malloc_shared(static_cast(C_nnz_), queue_); + C_vals_ = sycl::malloc_shared(static_cast(C_nnz_), queue_); + if (!C_vals_) throw std::runtime_error("Could not allocate memory"); + + setC = oneapi::mkl::sparse::set_csr_data(queue_, + C_handle_, + m_, + n_, + CIndex_, + C_rows_, + C_cols_, + C_vals_, + 
{ev3_1}); + + request_ = oneapi::mkl::sparse::matmat_request::finalize; + auto ev3_3 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + nullptr, + nullptr, + {setC}); + + auto ev_sort = oneapi::mkl::sparse::sort_matrix(queue_, C_handle_, {ev3_3}); + + oneapi::mkl::sparse::release_matmat_descr(&description_); + oneapi::mkl::sparse::release_matrix_handle(queue_, &A_handle_).wait(); + oneapi::mkl::sparse::release_matrix_handle(queue_, &B_handle_).wait(); + oneapi::mkl::sparse::release_matrix_handle(queue_, &C_handle_).wait(); + sycl::free(sizeTempBuffer, queue_); + sycl::free(sizeTempBuffer2, queue_); + sycl::free(tempBuffer, queue_); + sycl::free(tempBuffer2, queue_); + sycl::free(cNnzBuffer, queue_); + break; + } + } + firstRun_ = false; + } + + void postLoopRequirements() override { + switch (offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + C_rows_ = sycl::malloc_host(static_cast(m_ + 1), queue_); + + C_cols_ = sycl::malloc_host(static_cast(C_nnz_), queue_); + + C_vals_ = sycl::malloc_host(static_cast(C_nnz_), queue_); + + auto CRows = queue_.copy(C_rows_device_, C_rows_, static_cast(m_ + 1)); + auto CCols = queue_.copy(C_cols_device_, C_cols_, static_cast(C_nnz_)); + auto CVals = queue_.copy(C_vals_device_, C_vals_, static_cast(C_nnz_)); + CRows.wait(); + CCols.wait(); + CVals.wait(); + + sycl::free(C_rows_device_, queue_); + sycl::free(C_cols_device_, queue_); + sycl::free(C_vals_device_, queue_); + break; + } + case gpuOffloadType::unified: { + break; + } + } + } + + void postCallKernelCleanup() override { + switch (offload_) { + case gpuOffloadType::always: { + sycl::free(A_rows_, queue_); + sycl::free(A_cols_, queue_); + sycl::free(A_vals_, queue_); + sycl::free(A_rows_device_, queue_); + sycl::free(A_cols_device_, queue_); + sycl::free(A_vals_device_, queue_); + + sycl::free(B_rows_, queue_); + sycl::free(B_cols_, queue_); + sycl::free(B_vals_, queue_); + sycl::free(B_rows_device_, queue_); + sycl::free(B_cols_device_, queue_); + sycl::free(B_vals_device_, queue_); + + sycl::free(C_rows_, queue_); + sycl::free(C_cols_, queue_); + sycl::free(C_vals_, queue_); + break; + } + case gpuOffloadType::once: { + sycl::free(A_rows_, queue_); + sycl::free(A_cols_, queue_); + sycl::free(A_vals_, queue_); + sycl::free(A_rows_device_, queue_); + sycl::free(A_cols_device_, queue_); + sycl::free(A_vals_device_, queue_); + + sycl::free(B_rows_, queue_); + sycl::free(B_cols_, queue_); + sycl::free(B_vals_, queue_); + sycl::free(B_rows_device_, queue_); + sycl::free(B_cols_device_, queue_); + sycl::free(B_vals_device_, queue_); + + sycl::free(C_rows_, queue_); + sycl::free(C_cols_, queue_); + sycl::free(C_vals_, queue_); + break; + } + case gpuOffloadType::unified: { + sycl::free(A_rows_, queue_); + sycl::free(A_cols_, queue_); + sycl::free(A_vals_, queue_); + sycl::free(B_rows_, queue_); + sycl::free(B_cols_, queue_); + sycl::free(B_vals_, queue_); + sycl::free(C_rows_, queue_); + sycl::free(C_cols_, queue_); + sycl::free(C_vals_, queue_); + + free(A_rows_store_); + free(A_cols_store_); + free(A_vals_store_); + free(B_rows_store_); + free(B_cols_store_); + free(B_vals_store_); + break; + } + } + } + + // First-run check to confirm whether to clean up old arrays or not + bool firstRun_ = true; + + bool initialised_ = false; + + // Sycl parameters + sycl::queue queue_; + sycl::device device_; + sycl::context context_; + + // oneMKL parameters + oneapi::mkl::transpose opA_ = oneapi::mkl::transpose::nontrans; 
+ oneapi::mkl::transpose opB_ = oneapi::mkl::transpose::nontrans; + + oneapi::mkl::sparse::matrix_view_descr viewA_ = oneapi::mkl::sparse::matrix_view_descr::general; + oneapi::mkl::sparse::matrix_view_descr viewB_ = oneapi::mkl::sparse::matrix_view_descr::general; + oneapi::mkl::sparse::matrix_view_descr viewC_ = oneapi::mkl::sparse::matrix_view_descr::general; + + oneapi::mkl::index_base AIndex_ = oneapi::mkl::index_base::zero; + oneapi::mkl::index_base BIndex_ = oneapi::mkl::index_base::zero; + oneapi::mkl::index_base CIndex_ = oneapi::mkl::index_base::zero; + + oneapi::mkl::sparse::matrix_handle_t A_handle_ = nullptr; + oneapi::mkl::sparse::matrix_handle_t B_handle_ = nullptr; + oneapi::mkl::sparse::matrix_handle_t C_handle_ = nullptr; + + oneapi::mkl::sparse::matmat_descr_t description_ = nullptr; + oneapi::mkl::sparse::matmat_request request_; + + size_t alloc_sz = 0; + + // A CSR arrays + int64_t* A_rows_store_ = nullptr; + int64_t* A_cols_store_ = nullptr; + T* A_vals_store_ = nullptr; + // LOCAL + int64_t* A_rows_ = nullptr; + int64_t* A_cols_ = nullptr; + T* A_vals_ = nullptr; + // DEVICE + int64_t* A_rows_device_ = nullptr; + int64_t* A_cols_device_ = nullptr; + T* A_vals_device_ = nullptr; + + // B CSR arrays + int64_t* B_rows_store_ = nullptr; + int64_t* B_cols_store_ = nullptr; + T* B_vals_store_ = nullptr; + // LOCAL + int64_t* B_rows_ = nullptr; + int64_t* B_cols_ = nullptr; + T* B_vals_ = nullptr; + // DEVICE + int64_t* B_rows_device_ = nullptr; + int64_t* B_cols_device_ = nullptr; + T* B_vals_device_ = nullptr; + + // C CSR arrays + // LOCAL -- carried through from parent class -- needed externally for checksum + // DEVICE + int64_t* C_rows_device_ = nullptr; + int64_t* C_cols_device_ = nullptr; + T* C_vals_device_ = nullptr; + + // Temporary buffers + int64_t* sizeTempBuffer = nullptr; + int64_t* sizeTempBuffer2 = nullptr; + int64_t* cNnzBuffer = nullptr; + void* tempBuffer = nullptr; + void* tempBuffer2 = nullptr; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + +#endif diff --git a/rocBLAS/common.hh b/rocBLAS/common.hh index 01ea03a..78ef4a7 100644 --- a/rocBLAS/common.hh +++ b/rocBLAS/common.hh @@ -8,8 +8,10 @@ if (hipError_t e = (f); e != hipSuccess) { \ std::cout << "HIP error: " << __FILE__ << ":" << __LINE__ << ": " \ << hipGetErrorString(e) << std::endl; \ + std::cout << "[DEBUG] -- " << #f << std::endl; \ exit(1); \ } \ } while (false) -#endif \ No newline at end of file + +#endif \ No newline at end of file diff --git a/rocBLAS/gemv.hh b/rocBLAS/gemv.hh index e1e7a02..1a79b8e 100644 --- a/rocBLAS/gemv.hh +++ b/rocBLAS/gemv.hh @@ -43,10 +43,28 @@ class gemv_gpu : public gemv { * - Unified: Initialise data as unified memory; no data movement semantics * required */ void initialise(gpuOffloadType offload, int m, int n) override { + if (print_) { + switch (offload) { + case gpuOffloadType::always: { + std::cout << "=========== ALWAYS ===========" << std::endl; + break; + } + case gpuOffloadType::once: { + std::cout << "=========== ONCE ===========" << std::endl; + break; + } + case gpuOffloadType::unified: { + std::cout << "=========== UNIFIED ===========" << std::endl; + break; + } + } + } + if (!alreadyInitialised_) { alreadyInitialised_ = true; // Perform set-up which doesn't need to happen every problem size change. 
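// --- Illustrative sketch (hypothetical macro, not part of the diff above) ---
// The rocBLAS/common.hh hunk above extends hipCheckError so that, on failure,
// it also prints the stringified call (#f) that produced the error. A minimal,
// self-contained version of that pattern, under the invented name
// HIP_CHECK_SKETCH, looks like this:
#include <cstdlib>
#include <iostream>
#include <hip/hip_runtime.h>

#define HIP_CHECK_SKETCH(call)                                           \
  do {                                                                   \
    hipError_t err_ = (call);                                            \
    if (err_ != hipSuccess) {                                            \
      std::cerr << __FILE__ << ":" << __LINE__ << " HIP error: "         \
                << hipGetErrorString(err_) << " in `" << #call << "`"    \
                << std::endl;                                            \
      std::exit(1);                                                      \
    }                                                                    \
  } while (false)

// Usage: wrap every HIP runtime call so a failing allocation or copy reports
// exactly which call failed before the benchmark aborts, e.g.
//   HIP_CHECK_SKETCH(hipMalloc(&ptr, bytes));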
// Create a handle for rocBLAS + if (print_) std::cout << "Creating handle" << std::endl; rocblas_status status = rocblas_create_handle(&handle_); if (status != rocblas_status_success) { std::cout << "Failed to make rocBLAS handle: " << status << std::endl; @@ -54,7 +72,12 @@ class gemv_gpu : public gemv { } // Get device identifier + int count; + hipCheckError(hipGetDeviceCount(&count)); + if (print_) std::cout << "Number of devices: " << count << std::endl; + if (print_) std::cout << "Getting device ID" << std::endl; hipCheckError(hipGetDevice(&gpuDevice_)); + if (print_) std::cout << "Device ID: " << gpuDevice_ << std::endl; // Initialise 3 streams to asynchronously move data between host and // device @@ -63,6 +86,7 @@ class gemv_gpu : public gemv { hipCheckError(hipStreamCreate(&s3_)); // Enable passing alpha parameter from pointer to host memory + if (print_) std::cout << "Setting pointer mode to host" << std::endl; status = rocblas_set_pointer_mode(handle_, rocblas_pointer_mode_host); if (status != rocblas_status_success) { std::cout << "Failed to set rocBLAS pointer mode: " << status @@ -76,15 +100,18 @@ class gemv_gpu : public gemv { n_ = n; if (offload_ == gpuOffloadType::unified) { + if (print_) std::cout << "\tAllocating unified memory" << std::endl; hipCheckError(hipMallocManaged(&A_, sizeof(T) * m_ * n_)); hipCheckError(hipMallocManaged(&x_, sizeof(T) * n_)); hipCheckError(hipMallocManaged(&y_, sizeof(T) * m_)); } else { // Allocate matrices on host + if (print_) std::cout << "\tAllocating host memory" << std::endl; hipCheckError(hipHostMalloc((void**)&A_, sizeof(T) * m_ * n_)); hipCheckError(hipHostMalloc((void**)&x_, sizeof(T) * n_)); hipCheckError(hipHostMalloc((void**)&y_, sizeof(T) * m_)); // Allocate matrices on device + if (print_) std::cout << "\tAllocating device memory" << std::endl; hipCheckError(hipMalloc((void**)&A_device_, sizeof(T) * m_ * n_)); hipCheckError(hipMalloc((void**)&x_device_, sizeof(T) * n_)); hipCheckError(hipMalloc((void**)&y_device_, sizeof(T) * m_)); @@ -104,6 +131,7 @@ class gemv_gpu : public gemv { break; } case gpuOffloadType::once: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; // Offload input data from host to the device. hipCheckError(hipMemcpyAsync(A_device_, A_, sizeof(T) * m_ * n_, hipMemcpyHostToDevice, s1_)); @@ -114,9 +142,9 @@ class gemv_gpu : public gemv { break; } case gpuOffloadType::unified: { + if (print_) std::cout << "\tPrefetching data to GPU" << std::endl; // Prefetch input data to device - hipCheckError( - hipMemPrefetchAsync(A_, sizeof(T) * m_ * n_, gpuDevice_, s1_)); + hipCheckError(hipMemPrefetchAsync(A_, sizeof(T) * m_ * n_, gpuDevice_, s1_)); hipCheckError(hipMemPrefetchAsync(x_, sizeof(T) * n_, gpuDevice_, s2_)); hipCheckError(hipMemPrefetchAsync(y_, sizeof(T) * m_, gpuDevice_, s3_)); break; @@ -128,6 +156,7 @@ class gemv_gpu : public gemv { void callGemv() override { switch (offload_) { case gpuOffloadType::always: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; // Offload input data from host to the device. 
hipCheckError(hipMemcpyAsync(A_device_, A_, sizeof(T) * m_ * n_, hipMemcpyHostToDevice, s1_)); @@ -136,6 +165,7 @@ class gemv_gpu : public gemv { hipCheckError(hipMemcpyAsync(y_device_, y_, sizeof(T) * m_, hipMemcpyHostToDevice, s3_)); // Call rocBLAS GEMV kernel + if (print_) std::cout << "\tCalling rocBLAS GEMV kernel" << std::endl; if constexpr (std::is_same_v) { rocblas_status stat = rocblas_sgemv( handle_, transA_, m_, n_, &alpha, A_device_, std::max(1, m_), @@ -155,6 +185,7 @@ class gemv_gpu : public gemv { exit(1); } } + if (print_) std::cout << "\tMoving data to CPU" << std::endl; // Offload output data from device to host hipCheckError(hipMemcpyAsync(y_, y_device_, sizeof(T) * m_, hipMemcpyDeviceToHost, s3_)); @@ -164,6 +195,7 @@ class gemv_gpu : public gemv { } case gpuOffloadType::once: { // Call rocBLAS GEMV kernel + if (print_) std::cout << "\tCalling rocBLAS GEMV kernel" << std::endl; if constexpr (std::is_same_v) { rocblas_status stat = rocblas_sgemv( handle_, transA_, m_, n_, &alpha, A_device_, std::max(1, m_), @@ -187,6 +219,7 @@ class gemv_gpu : public gemv { } case gpuOffloadType::unified: { // Call rocBLAS GEMV kernel + if (print_) std::cout << "\tCalling rocBLAS GEMV kernel" << std::endl; if constexpr (std::is_same_v) { rocblas_status stat = rocblas_sgemv( handle_, transA_, m_, n_, &alpha, A_, std::max(1, m_), x_, @@ -220,6 +253,7 @@ class gemv_gpu : public gemv { break; } case gpuOffloadType::once: { + if (print_) std::cout << "\tMoving data to CPU" << std::endl; // Offload output data from device to host hipCheckError(hipMemcpyAsync(y_, y_device_, sizeof(T) * m_, hipMemcpyDeviceToHost, s3_)); @@ -228,6 +262,7 @@ class gemv_gpu : public gemv { break; } case gpuOffloadType::unified: { + if (print_) std::cout << "\tMoving data to CPU" << std::endl; // Ensure all output data resides on host once work has completed hipCheckError( hipMemPrefetchAsync(y_, sizeof(T) * m_, hipCpuDeviceId, s3_)); @@ -242,20 +277,25 @@ class gemv_gpu : public gemv { * after Kernel has been called. */ void postCallKernelCleanup() override { if (offload_ == gpuOffloadType::unified) { + if (print_) std::cout << "\tFreeing unified memory arrays" << std::endl; hipCheckError(hipFree(A_)); hipCheckError(hipFree(x_)); hipCheckError(hipFree(y_)); } else { // Free the memory held on host and device + if (print_) std::cout << "\tFreeing host memory arrays" << std::endl; hipCheckError(hipHostFree((void*)A_)); hipCheckError(hipHostFree((void*)x_)); hipCheckError(hipHostFree((void*)y_)); + if (print_) std::cout << "\tFreeing device memory arrays" << std::endl; hipCheckError(hipFree(A_device_)); hipCheckError(hipFree(x_device_)); hipCheckError(hipFree(y_device_)); } } + bool print_ = true; + /** Whether the initialise function has been called before. 
*/ bool alreadyInitialised_ = false; diff --git a/rocBLAS/spmdnm.hh b/rocBLAS/spmdnm.hh new file mode 100644 index 0000000..c41828f --- /dev/null +++ b/rocBLAS/spmdnm.hh @@ -0,0 +1,690 @@ +#pragma once + +#ifdef GPU_ROCBLAS +#include +#include + +#include "../include/kernels/GPU/spmdnm.hh" +#include "../include/utilities.hh" +#include "common.hh" + +#include + +namespace gpu { +template +class spmdnm_gpu : public spmdnm { +public: + using spmdnm::spmdnm; + using spmdnm::initInputMatrices; + using spmdnm::nnz_; + using spmdnm::m_; + using spmdnm::n_; + using spmdnm::k_; + using spmdnm::A_; + using spmdnm::B_; + using spmdnm::C_; + using spmdnm::offload_; + using spmdnm::sparsity_; + + ~spmdnm_gpu() { + if (initialised_) { + rocsparse_destroy_handle(handle_); + hipCheckError(hipStreamDestroy(s1_)); + hipCheckError(hipStreamDestroy(s2_)); + hipCheckError(hipStreamDestroy(s3_)); + } + } + + void initialise(gpuOffloadType offload, int m, int n, int k, + double sparsity, bool binary = false) override { + // Set up problem parameters + if (print_) { + switch (offload) { + case gpuOffloadType::always: { + std::cout << "=========== ALWAYS ===========" << std::endl; + break; + } + case gpuOffloadType::once: { + std::cout << "=========== ONCE ===========" << std::endl; + break; + } + case gpuOffloadType::unified: { + std::cout << "=========== UNIFIED ===========" << std::endl; + break; + } + } + } + if (print_) std::cout << "Initialising " << m << "x" << k << " . " << k << "x" << n << std::endl; + m_ = m; + n_ = n; + k_ = k; + sparsity_ = sparsity; + offload_ = offload; + nnz_ = 1 + (int64_t)((double)m_ * (double)n_ * (1.0 - sparsity_)); + + // Set up rocSPARSE metadata + base_ = rocsparse_index_base_zero; + type_ = rocsparse_matrix_type_general; + operation_ = rocsparse_operation_none; + index_ = rocsparse_indextype_i64; + order_ = rocsparse_order_column; + algorithm_ = rocsparse_spmm_alg_csr_nnz_split; // This is the only algo for this one + + if constexpr (std::is_same_v) { + dataType_ = rocsparse_datatype_f32_r; + } else if constexpr (std::is_same_v) { + dataType_ = rocsparse_datatype_f64_r; + } else { + throw std::runtime_error("Unsupported data type for spmdnm_gpu"); + } + + if (print_) std::cout << "\tAbout to set up handle and hip streams" << std::endl; + if (!initialised_) { + status_ = rocsparse_create_handle(&handle_); + checkStatus("Failed rocsparse_create_handle"); + + // Get the GPU + hipCheckError(hipGetDevice(&gpuDevice_)); + // Make streams for asynchronous GPU comunication + hipCheckError(hipStreamCreate(&s1_)); + hipCheckError(hipStreamCreate(&s2_)); + hipCheckError(hipStreamCreate(&s3_)); + } + + if (print_) std::cout << "\tAbout to malloc arrays" << std::endl; + if (offload_ == gpuOffloadType::unified) { + hipCheckError(hipMallocManaged(&A_, sizeof(T) * m_ * k_)); + hipCheckError(hipMallocManaged(&A_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipMallocManaged(&A_cols_, sizeof(int64_t) * nnz_)); + hipCheckError(hipMallocManaged(&A_vals_, sizeof(T) * nnz_)); + hipCheckError(hipMallocManaged(&B_, sizeof(T) * k_ * n_)); + hipCheckError(hipMallocManaged(&C_, sizeof(T) * m_ * n_)); + } else { + // Host data structures + hipCheckError(hipHostMalloc((void**)&A_, sizeof(T) * m_ * k_)); + hipCheckError(hipHostMalloc((void**)&A_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipHostMalloc((void**)&A_cols_, sizeof(int64_t) * nnz_)); + hipCheckError(hipHostMalloc((void**)&A_vals_, sizeof(T) * nnz_)); + hipCheckError(hipHostMalloc((void**)&B_, sizeof(T) * k_ * n_)); + 
hipCheckError(hipHostMalloc((void**)&C_, sizeof(T) * m_ * n_)); + // GPU data structures + hipCheckError(hipMalloc((void**)&A_rows_device_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipMalloc((void**)&A_cols_device_, sizeof(int64_t) * nnz_)); + hipCheckError(hipMalloc((void**)&A_vals_device_, sizeof(T) * nnz_)); + hipCheckError(hipMalloc((void**)&B_device_, sizeof(T) * k_ * n_)); + hipCheckError(hipMalloc((void**)&C_device_, sizeof(T) * m_ * n_)); + } + + if (print_) std::cout << "\tInitialising matrices" << std::endl; + initInputMatrices(); + } + + +protected: + void toSparseFormat() override { + int64_t nnz_encountered = 0; + + A_rows_[0] = 0; + + for (int64_t row = 0; row < m_; row++) { + for (int64_t col = 0; col < k_; col++) { + if (A_[(row * k_) + col] != 0.0) { + A_cols_[nnz_encountered] = col; + A_vals_[nnz_encountered] = static_cast(A_[(row * k_) + col]); + nnz_encountered++; + } + } + A_rows_[row + 1] = nnz_encountered; + } + } + +private: + void preLoopRequirements() override { + if (print_) std::cout << "pre-loop stuff" << std::endl; + switch (offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; + hipCheckError(hipMemcpyAsync(A_rows_device_, + A_rows_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(A_cols_device_, + A_cols_, + sizeof(int64_t) * nnz_, + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(A_vals_device_, + A_vals_, + sizeof(T) * nnz_, + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(B_device_, + B_, + sizeof(T) * k_ * n_, + hipMemcpyHostToDevice, + s2_)); + hipCheckError(hipMemcpyAsync(C_device_, + C_, + sizeof(T) * m_ * n_, + hipMemcpyHostToDevice, + s3_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; + hipCheckError(hipMemPrefetchAsync(A_rows_, + sizeof(int64_t) * (m_ + 1), + gpuDevice_, + s1_)); + hipCheckError(hipMemPrefetchAsync(A_cols_, + sizeof(int64_t) * nnz_, + gpuDevice_, + s1_)); + hipCheckError(hipMemPrefetchAsync(A_vals_, + sizeof(T) * nnz_, + gpuDevice_, + s1_)); + hipCheckError(hipMemPrefetchAsync(B_, + sizeof(T) * k_ * n_, + gpuDevice_, + s2_)); + hipCheckError(hipMemPrefetchAsync(C_, + sizeof(T) * m_ * n_, + gpuDevice_, + s3_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + } + } + + void callSpmdnm() override { + if (print_) std::cout << "callSpmdnm" << std::endl; + switch (offload_) { + case gpuOffloadType::always: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; + hipCheckError(hipMemcpyAsync(A_rows_device_, + A_rows_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(A_cols_device_, + A_cols_, + sizeof(int64_t) * nnz_, + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(A_vals_device_, + A_vals_, + sizeof(T) * nnz_, + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(B_device_, + B_, + sizeof(T) * k_ * n_, + hipMemcpyHostToDevice, + s2_)); + hipCheckError(hipMemcpyAsync(C_device_, + C_, + sizeof(T) * m_ * n_, + hipMemcpyHostToDevice, + s3_)); + hipCheckError(hipDeviceSynchronize()); + + + if (print_) std::cout << "\tCreating rocSPARSE structures" << std::endl; + // Set up the rocSPARSE structures for the GEMV + status_ = rocsparse_create_csr_descr(&A_description_, m_, k_, nnz_, A_rows_device_, + A_cols_device_, A_vals_device_, index_, index_, + base_, dataType_); + 
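// --- Illustrative sketch (hypothetical helper, not part of the diff above) --
// toSparseFormat() above walks the dense m x k matrix row by row and emits the
// three CSR arrays that rocsparse_create_csr_descr consumes: row offsets
// (m + 1 entries) plus one column index and one value per non-zero. The free
// function below is a self-contained version of that conversion; the name
// denseToCsrSketch is invented for illustration.
#include <cstdint>
#include <vector>

template <typename T>
void denseToCsrSketch(const T* dense, int64_t rows, int64_t cols,
                      std::vector<int64_t>& rowPtr,
                      std::vector<int64_t>& colInd,
                      std::vector<T>& vals) {
  rowPtr.assign(static_cast<size_t>(rows) + 1, 0);
  colInd.clear();
  vals.clear();
  for (int64_t r = 0; r < rows; ++r) {
    for (int64_t c = 0; c < cols; ++c) {
      T v = dense[r * cols + c];
      if (v != T(0)) {  // keep only the non-zero entries
        colInd.push_back(c);
        vals.push_back(v);
      }
    }
    // Running prefix sum of non-zeros seen so far closes row r.
    rowPtr[static_cast<size_t>(r) + 1] = static_cast<int64_t>(vals.size());
  }
}
// Example: the 2x3 matrix [[1,0,2],[0,0,3]] yields rowPtr = {0,2,3},
// colInd = {0,2,2} and vals = {1,2,3}.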
checkStatus("Failed rocsparse_create_csr_descr for A"); + + status_ = rocsparse_create_dnmat_descr(&B_description_, k_, n_, k_, B_device_, + dataType_, order_); + checkStatus("Failed rocsparse_create_dnmat_descr for B"); + + status_ = rocsparse_create_dnmat_descr(&C_description_, m_, n_, m_, C_device_, + dataType_, order_); + checkStatus("Failed rocsparse_create_dnmat_descr for C"); + hipCheckError(hipDeviceSynchronize()); + + size_t buffer_size = 0; + status_ = rocsparse_spmm(handle_, + operation_, + operation_, + &alpha, + A_description_, + B_description_, + &beta, + C_description_, + dataType_, + algorithm_, + rocsparse_spmm_stage_buffer_size, + &buffer_size, + nullptr); + checkStatus("Failed rocsparse_spmm with stage=rocsparse_spmm_stage_buffer_size"); + + void* buffer = nullptr; + if (print_) std::cout << "\tAllocating buffer with buffer_size = " << buffer_size << std::endl; + if (buffer_size > 0) hipCheckError(hipMalloc(&buffer, buffer_size)); + + status_ = rocsparse_spmm(handle_, + operation_, + operation_, + &alpha, + A_description_, + B_description_, + &beta, + C_description_, + dataType_, + algorithm_, + rocsparse_spmm_stage_preprocess, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spmm with stage=rocsparse_spmm_stage_preprocess"); + + hipCheckError(hipDeviceSynchronize()); + status_ = rocsparse_spmm(handle_, + operation_, + operation_, + &alpha, + A_description_, + B_description_, + &beta, + C_description_, + dataType_, + algorithm_, + rocsparse_spmm_stage_compute, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spmm with stage=rocsparse_spmm_stage_compute"); + + hipCheckError(hipDeviceSynchronize()); + if (print_) std::cout << "\tdestroying rocSPARSE structures" << std::endl; + // Now clean up + status_ = rocsparse_destroy_spmat_descr(A_description_); + checkStatus("Failed rocsparse_destroy_spmat_descr for A"); + status_ = rocsparse_destroy_dnmat_descr(B_description_); + checkStatus("Failed rocsparse_destroy_dnmat_descr for B"); + status_ = rocsparse_destroy_dnmat_descr(C_description_); + checkStatus("Failed rocsparse_destroy_dnmat_descr for C"); + if (buffer != nullptr) hipCheckError(hipFree(buffer)); + hipCheckError(hipDeviceSynchronize()); + + // Move result back to the CPU + if (print_) std::cout << "\tMoving data to CPU" << std::endl; + hipCheckError(hipMemcpyAsync(C_, + C_device_, + sizeof(T) * m_ * n_, + hipMemcpyDeviceToHost, + s3_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::once: { + if (print_) std::cout << "\tCreating rocSPARSE structures" << std::endl; + // Set up the rocSPARSE structures for the GEMV + status_ = rocsparse_create_csr_descr(&A_description_, + m_, + k_, + nnz_, + A_rows_device_, + A_cols_device_, + A_vals_device_, + index_, + index_, + base_, + dataType_); + checkStatus("Failed rocsparse_create_csr_descr for A"); + + status_ = rocsparse_create_dnmat_descr(&B_description_, + k_, + n_, + k_, + B_device_, + dataType_, + order_); + checkStatus("Failed rocsparse_create_dnmat_descr for B"); + + status_ = rocsparse_create_dnmat_descr(&C_description_, + m_, + n_, + m_, + C_device_, + dataType_, + order_); + checkStatus("Failed rocsparse_create_dnmat_descr for C"); + hipCheckError(hipDeviceSynchronize()); + + size_t buffer_size = 0; + status_ = rocsparse_spmm(handle_, + operation_, + operation_, + &alpha, + A_description_, + B_description_, + &beta, + C_description_, + dataType_, + algorithm_, + rocsparse_spmm_stage_buffer_size, + &buffer_size, + nullptr); + checkStatus("Failed rocsparse_spmm with 
stage=rocsparse_spmm_stage_buffer_size"); + + void* buffer = nullptr; + if (print_) std::cout << "\tAllocating buffer with buffer_size = " << buffer_size << std::endl; + if (buffer_size > 0) hipCheckError(hipMalloc(&buffer, buffer_size)); + + status_ = rocsparse_spmm(handle_, + operation_, + operation_, + &alpha, + A_description_, + B_description_, + &beta, + C_description_, + dataType_, + algorithm_, + rocsparse_spmm_stage_preprocess, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spmm with stage=rocsparse_spmm_stage_preprocess"); + + hipCheckError(hipDeviceSynchronize()); + status_ = rocsparse_spmm(handle_, + operation_, + operation_, + &alpha, + A_description_, + B_description_, + &beta, + C_description_, + dataType_, + algorithm_, + rocsparse_spmm_stage_compute, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spmm with stage=rocsparse_spmm_stage_compute"); + + hipCheckError(hipDeviceSynchronize()); + if (print_) std::cout << "\tdestroying rocSPARSE structures" << std::endl; + // Now clean up + status_ = rocsparse_destroy_spmat_descr(A_description_); + checkStatus("Failed rocsparse_destroy_spmat_descr for A"); + status_ = rocsparse_destroy_dnmat_descr(B_description_); + checkStatus("Failed rocsparse_destroy_dnmat_descr for B"); + status_ = rocsparse_destroy_dnmat_descr(C_description_); + checkStatus("Failed rocsparse_destroy_dnmat_descr for C"); + if (buffer) hipCheckError(hipFree(buffer)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + if (print_) std::cout << "\tCreating rocSPARSE structures" << std::endl; + // Set up the rocSPARSE structures for the GEMV + status_ = rocsparse_create_csr_descr(&A_description_, + m_, + k_, + nnz_, + A_rows_, + A_cols_, + A_vals_, + index_, + index_, + base_, + dataType_); + checkStatus("Failed rocsparse_create_csr_descr for A"); + + status_ = rocsparse_create_dnmat_descr(&B_description_, + k_, + n_, + k_, + B_, + dataType_, + order_); + checkStatus("Failed rocsparse_create_dnmat_descr for B"); + + status_ = rocsparse_create_dnmat_descr(&C_description_, + m_, + n_, + m_, + C_, + dataType_, + order_); + checkStatus("Failed rocsparse_create_dnmat_descr for C"); + hipCheckError(hipDeviceSynchronize()); + + size_t buffer_size; + status_ = rocsparse_spmm(handle_, + operation_, + operation_, + &alpha, + A_description_, + B_description_, + &beta, + C_description_, + dataType_, + algorithm_, + rocsparse_spmm_stage_buffer_size, + &buffer_size, + nullptr); + checkStatus("Failed rocsparse_spmm with stage=rocsparse_spmm_stage_buffer_size"); + + void* buffer = nullptr; + if (print_) std::cout << "\tAllocating buffer with buffer_size = " << buffer_size << std::endl; + if (buffer_size > 0) hipCheckError(hipMallocManaged(&buffer, buffer_size)); + + status_ = rocsparse_spmm(handle_, + operation_, + operation_, + &alpha, + A_description_, + B_description_, + &beta, + C_description_, + dataType_, + algorithm_, + rocsparse_spmm_stage_preprocess, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spmm with stage=rocsparse_spmm_stage_preprocess"); + + hipCheckError(hipDeviceSynchronize()); + status_ = rocsparse_spmm(handle_, + operation_, + operation_, + &alpha, + A_description_, + B_description_, + &beta, + C_description_, + dataType_, + algorithm_, + rocsparse_spmm_stage_compute, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spmm with stage=rocsparse_spmm_stage_compute"); + + hipCheckError(hipDeviceSynchronize()); + if (print_) std::cout << "\tdestroying rocSPARSE structures" << 
std::endl; + // Now clean up + status_ = rocsparse_destroy_spmat_descr(A_description_); + checkStatus("Failed rocsparse_destroy_spmat_descr for A"); + status_ = rocsparse_destroy_dnmat_descr(B_description_); + checkStatus("Failed rocsparse_destroy_dnmat_descr for B"); + status_ = rocsparse_destroy_dnmat_descr(C_description_); + checkStatus("Failed rocsparse_destroy_dnmat_descr for C"); + if (buffer) hipCheckError(hipFree(buffer)); + hipCheckError(hipDeviceSynchronize()); + break; + } + } + } + + void postLoopRequirements() override { + switch (offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + // Move result back to the CPU + if (print_) std::cout << "\tMoving data to CPU" << std::endl; + hipCheckError(hipMemcpyAsync(C_, + C_device_, + sizeof(T) * m_ * n_, + hipMemcpyDeviceToHost, + s3_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + // Ensure all output data resides on host once work has completed + if (print_) std::cout << "\tMoving data to CPU" << std::endl; + hipCheckError(hipMemPrefetchAsync(C_, + sizeof(T) * m_ * n_, + hipCpuDeviceId, + s3_)); + // Ensure device has finished all work. + hipCheckError(hipDeviceSynchronize()); + break; + } + } + } + + void postCallKernelCleanup() override { + if (print_) std::cout << "Post-kernel cleanup" << std::endl; + if (offload_ == gpuOffloadType::unified) { + if (print_) std::cout << "\tFreeing unified arrays" << std::endl; + hipCheckError(hipFree(A_)); + hipCheckError(hipFree(A_rows_)); + hipCheckError(hipFree(A_cols_)); + hipCheckError(hipFree(A_vals_)); + hipCheckError(hipFree(B_)); + hipCheckError(hipFree(C_)); + } else { + if (print_) std::cout << "\tFreeing CPU arrays" << std::endl; + hipCheckError(hipHostFree((void*)A_)); + hipCheckError(hipHostFree((void*)A_rows_)); + hipCheckError(hipHostFree((void*)A_cols_)); + hipCheckError(hipHostFree((void*)A_vals_)); + hipCheckError(hipHostFree((void*)B_)); + hipCheckError(hipHostFree((void*)C_)); + + if (print_) std::cout << "\tFreeing GPU arrays" << std::endl; + hipCheckError(hipFree(A_rows_device_)); + hipCheckError(hipFree(A_cols_device_)); + hipCheckError(hipFree(A_vals_device_)); + hipCheckError(hipFree(B_device_)); + hipCheckError(hipFree(C_device_)); + } + } + + void checkStatus(std::string message) { + if (status_ != rocsparse_status_success) { + std::cerr << message << std::endl; + switch (status_) { + case rocsparse_status_success: { + std::cerr << "rocsparse_status_success" << std::endl; + break; + } + case rocsparse_status_invalid_handle: { + std::cerr << "rocsparse_status_invalid_handle" << std::endl; + break; + } + case rocsparse_status_not_implemented: { + std::cerr << "rocsparse_status_not_implemented" << std::endl; + break; + } + case rocsparse_status_invalid_pointer: { + std::cerr << "rocsparse_status_invalid_pointer" << std::endl; + break; + } + case rocsparse_status_invalid_size: { + std::cerr << "rocsparse_status_invalid_size" << std::endl; + break; + } + case rocsparse_status_memory_error: { + std::cerr << "rocsparse_status_memory_error" << std::endl; + break; + } + case rocsparse_status_internal_error: { + std::cerr << "rocsparse_status_internal_error" << std::endl; + break; + } + case rocsparse_status_invalid_value: { + std::cerr << "rocsparse_status_invalid_value" << std::endl; + break; + } + case rocsparse_status_arch_mismatch: { + std::cerr << "rocsparse_status_arch_mismatch" << std::endl; + break; + } + case rocsparse_status_zero_pivot: { + std::cerr << "rocsparse_status_zero_pivot" << 
std::endl; + break; + } + case rocsparse_status_not_initialized: { + std::cerr << "rocsparse_status_not_initialized" << std::endl; + break; + } + case rocsparse_status_type_mismatch: { + std::cerr << "rocsparse_status_type_mismatch" << std::endl; + break; + } + case rocsparse_status_requires_sorted_storage: { + std::cerr << "rocsparse_status_requires_sorted_storage" << std::endl; + break; + } + case rocsparse_status_thrown_exception: { + std::cerr << "rocsparse_status_thrown_exception" << std::endl; + break; + } + default: { + std::cerr << "Unknown status code: " << status_ << std::endl; + } + } + exit(1); + } + } + + bool initialised_ = false; + bool print_ = false; + + rocsparse_status status_; + rocsparse_operation operation_; + rocsparse_handle handle_; + rocsparse_index_base base_; + rocsparse_datatype dataType_; + rocsparse_matrix_type type_; + rocsparse_indextype index_; + rocsparse_spmm_alg algorithm_; + rocsparse_order order_; + + rocsparse_spmat_descr A_description_; + rocsparse_dnmat_descr B_description_; + rocsparse_dnmat_descr C_description_; + + int64_t* A_rows_; + int64_t* A_cols_; + T* A_vals_; + + int64_t* A_rows_device_; + int64_t* A_cols_device_; + T* A_vals_device_; + T* B_device_; + T* C_device_; + + int gpuDevice_; + hipStream_t s1_, s2_, s3_; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + +#endif diff --git a/rocBLAS/spmdnv.hh b/rocBLAS/spmdnv.hh new file mode 100644 index 0000000..a89932c --- /dev/null +++ b/rocBLAS/spmdnv.hh @@ -0,0 +1,600 @@ +#pragma once + +#ifdef GPU_ROCBLAS +#include +#include + +#include "../include/kernels/GPU/spmdnv.hh" +#include "../include/utilities.hh" +#include "common.hh" + +namespace gpu { +template +class spmdnv_gpu : public spmdnv { +public: + using spmdnv::spmdnv; + using spmdnv::initInputMatrixVector; + using spmdnv::nnz_; + using spmdnv::m_; + using spmdnv::n_; + using spmdnv::A_; + using spmdnv::x_; + using spmdnv::y_; + using spmdnv::offload_; + using spmdnv::sparsity_; + + ~spmdnv_gpu() { + if (initialised_) { + rocsparse_destroy_handle(handle_); + hipCheckError(hipStreamDestroy(s1_)); + hipCheckError(hipStreamDestroy(s2_)); + hipCheckError(hipStreamDestroy(s3_)); + } + } + + void initialise(gpuOffloadType offload, int m, int n, double sparsity) + override { + // Set up problem parameters + if (print_) { + switch (offload) { + case gpuOffloadType::always: { + std::cout << "=========== ALWAYS ===========" << std::endl; + break; + } + case gpuOffloadType::once: { + std::cout << "=========== ONCE ===========" << std::endl; + break; + } + case gpuOffloadType::unified: { + std::cout << "=========== UNIFIED ===========" << std::endl; + break; + } + } + } + if (print_) std::cout << "Initialising with matrix of " << m << "x" << n << std::endl; + m_ = m; + n_ = n; + sparsity_ = sparsity; + offload_ = offload; + + nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_)); + + + // Set up rocSPARSE metadata + index_ = rocsparse_indextype_i64; + type_ = rocsparse_matrix_type_general; + operation_ = rocsparse_operation_none; + base_ = rocsparse_index_base_zero; + algorithm_ = rocsparse_spmv_alg_default; // There are a couple of CSR algorithms -- investigate which is best! 
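// --- Illustrative sketch (hypothetical helper, not part of the diff above) --
// The SpMDnV kernel below computes y = alpha * A * x + beta * y with A held in
// CSR. When experimenting with the different rocsparse_spmv_alg_* choices
// noted above, a small host reference such as this is useful for confirming
// the result is unchanged; csrSpmvReference is invented for illustration and
// is not benchmark code.
#include <cstdint>

template <typename T>
void csrSpmvReference(int64_t m, const int64_t* rowPtr, const int64_t* colInd,
                      const T* vals, const T* x, T* y, T alpha, T beta) {
  for (int64_t row = 0; row < m; ++row) {
    T acc = T(0);
    // Dot product of the sparse row with the dense vector x.
    for (int64_t idx = rowPtr[row]; idx < rowPtr[row + 1]; ++idx) {
      acc += vals[idx] * x[colInd[idx]];
    }
    y[row] = alpha * acc + beta * y[row];
  }
}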
+ if constexpr (std::is_same_v) { + dataType_ = rocsparse_datatype_f32_r; + } else if constexpr (std::is_same_v) { + dataType_ = rocsparse_datatype_f64_r; + } else { + throw std::runtime_error("Unsupported data type for spmdnv_gpu"); + } + + + if (print_) std::cout << "\tAbout to set up handle and hip streams" << std::endl; + if (!initialised_) { + // Get the GPU + int count; + hipCheckError(hipGetDeviceCount(&count)); + if (print_) std::cout << "Number of devices: " << count << std::endl; + if (print_) std::cout << "Getting device ID" << std::endl; + if (print_) std::cout << "\t\tGetting GPU device" << std::endl; + hipCheckError(hipGetDevice(&gpuDevice_)); + if (print_) std::cout << "Device ID: " << gpuDevice_ << std::endl; + + // Make streams for asynchronous GPU comunication + if (print_) std::cout << "\t\tCreating GPU streams" << std::endl; + hipCheckError(hipStreamCreate(&s1_)); + hipCheckError(hipStreamCreate(&s2_)); + hipCheckError(hipStreamCreate(&s3_)); + + if (print_) std::cout << "\t\tSetting up GPU handle" << std::endl; + status_ = rocsparse_create_handle(&handle_); + checkStatus("Failed rocsparse_create_handle"); + } + + if (print_) std::cout << "\tAbout to malloc arrays" << std::endl; + if (offload_ == gpuOffloadType::unified) { + hipCheckError(hipMallocManaged(&A_, sizeof(T) * m_ * n_)); + hipCheckError(hipMallocManaged(&A_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipMallocManaged(&A_cols_, sizeof(int64_t) * nnz_)); + hipCheckError(hipMallocManaged(&A_vals_, sizeof(T) * nnz_)); + hipCheckError(hipMallocManaged(&x_, sizeof(T) * n_)); + hipCheckError(hipMallocManaged(&y_, sizeof(T) * m_)); + } else { + // Host data structures + hipCheckError(hipHostMalloc((void**)&A_, sizeof(T) * m_ * n_)); + hipCheckError(hipHostMalloc((void**)&A_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipHostMalloc((void**)&A_cols_, sizeof(int64_t) * nnz_)); + hipCheckError(hipHostMalloc((void**)&A_vals_, sizeof(T) * nnz_)); + hipCheckError(hipHostMalloc((void**)&x_, sizeof(T) * n_)); + hipCheckError(hipHostMalloc((void**)&y_, sizeof(T) * m_)); + // GPU data structures + hipCheckError(hipMalloc((void**)&A_rows_device_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipMalloc((void**)&A_cols_device_, sizeof(int64_t) * nnz_)); + hipCheckError(hipMalloc((void**)&A_vals_device_, sizeof(T) * nnz_)); + hipCheckError(hipMalloc((void**)&x_device_, sizeof(T) * n_)); + hipCheckError(hipMalloc((void**)&y_device_, sizeof(T) * m_)); + } + + if (print_) std::cout << "\tInitialising matrix and vector" << std::endl; + initInputMatrixVector(); + } + + +protected: + void toSparseFormat() override { + if (print_) std::cout << "\tTo Sparse" << std::endl; + int64_t nnz_encountered = 0; + + A_rows_[0] = 0; + + for (int64_t row = 0; row < m_; row++) { + for (int64_t col = 0; col < n_; col++) { + if (A_[(row * n_) + col] != 0.0) { + A_cols_[nnz_encountered] = col; + A_vals_[nnz_encountered] = static_cast(A_[(row * n_) + col]); + nnz_encountered++; + } + } + A_rows_[row + 1] = nnz_encountered; + } + } + +private: + /** + * Before we enter the loop of calling the kernel, + * we need to move any data we may need. 
+ */ + void preLoopRequirements() override { + if (print_) std::cout << "pre-loop stuff" << std::endl; + switch (offload_) { + case gpuOffloadType::always: { + // For Always there is nothing to do here, + // as all memory is moved each time the + // kernel is called + break; + } + case gpuOffloadType::once: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; + hipCheckError(hipMemcpyAsync(A_rows_device_, + A_rows_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(A_cols_device_, + A_cols_, + sizeof(int64_t) * nnz_, + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(A_vals_device_, + A_vals_, + sizeof(T) * nnz_, + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(x_device_, + x_, + sizeof(T) * n_, + hipMemcpyHostToDevice, + s2_)); + hipCheckError(hipMemcpyAsync(y_device_, + y_, + sizeof(T) * m_, + hipMemcpyHostToDevice, + s3_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; + hipCheckError(hipMemPrefetchAsync(A_rows_, sizeof(int64_t) * (m_ + 1), gpuDevice_, s1_)); + hipCheckError(hipMemPrefetchAsync(A_cols_, sizeof(int64_t) * nnz_, gpuDevice_, s1_)); + hipCheckError(hipMemPrefetchAsync(A_vals_, sizeof(T) * nnz_, gpuDevice_, s1_)); + hipCheckError(hipMemPrefetchAsync(x_, sizeof(T) * n_, gpuDevice_, s2_)); + hipCheckError(hipMemPrefetchAsync(y_, sizeof(T) * m_, gpuDevice_, s3_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + } + } + + void callSpMDnV() override { + if (print_) std::cout << "callSpMDnV" << std::endl; + switch (offload_) { + case gpuOffloadType::always: { + // Start by moving all the data over to the GPU + if (print_) std::cout << "\tMoving data to GPU" << std::endl; + hipCheckError(hipMemcpyAsync(A_rows_device_, + A_rows_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(A_cols_device_, + A_cols_, + sizeof(int64_t) * nnz_, + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(A_vals_device_, + A_vals_, + sizeof(T) * nnz_, + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(x_device_, + x_, + sizeof(T) * n_, + hipMemcpyHostToDevice, + s2_)); + hipCheckError(hipMemcpyAsync(y_device_, + y_, + sizeof(T) * m_, + hipMemcpyHostToDevice, + s3_)); + hipCheckError(hipDeviceSynchronize()); + + if (print_) std::cout << "\tCreating rocSPARSE structures" << std::endl; + // Set up the rocSPARSE structures for the SpMDnV + status_ = rocsparse_create_csr_descr(&description_, + m_, + n_, + nnz_, + A_rows_device_, + A_cols_device_, + A_vals_device_, + index_, + index_, + base_, + dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + + status_ = rocsparse_create_dnvec_descr(&x_description_, + n_, + x_device_, + dataType_); + checkStatus("Failed rocsparse_create_dnvec_descr for x"); + + status_ = rocsparse_create_dnvec_descr(&y_description_, + m_, + y_device_, + dataType_); + checkStatus("Failed rocsparse_create_dnvec_descr for y"); + hipCheckError(hipDeviceSynchronize()); + + size_t buffer_size = 0; + status_ = rocsparse_spmv(handle_, + operation_, + &alpha, + description_, + x_description_, + &beta, + y_description_, + dataType_, + algorithm_, + rocsparse_spmv_stage_buffer_size, + &buffer_size, + nullptr); + checkStatus("Failed rocsparse_spmv_ex with rocsparse_spmv_stage_buffer_size"); + hipCheckError(hipDeviceSynchronize()); + + void* temp_buffer; + hipCheckError(hipMalloc(&temp_buffer, buffer_size)); + 
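+          // rocsparse_spmv is invoked in three stages: buffer_size (queries the
+          // required scratch space), preprocess (analyses the CSR structure into
+          // temp_buffer), then compute. The same sequence is repeated in the
+          // 'once' and 'unified' cases below.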
hipCheckError(hipDeviceSynchronize()); + + status_ = rocsparse_spmv(handle_, + operation_, + &alpha, + description_, + x_description_, + &beta, + y_description_, + dataType_, + algorithm_, + rocsparse_spmv_stage_preprocess, + &buffer_size, + temp_buffer); + checkStatus("Failed rocsparse_spmv_ex with rocsparse_spmv_stage_preprocess"); + hipCheckError(hipDeviceSynchronize()); + + status_ = rocsparse_spmv(handle_, + operation_, + &alpha, + description_, + x_description_, + &beta, + y_description_, + dataType_, + algorithm_, + rocsparse_spmv_stage_compute, + &buffer_size, + temp_buffer); + checkStatus("Failed rocsparse_spmv_ex with rocsparse_spmv_stage_compute"); + hipCheckError(hipDeviceSynchronize()); + + if (print_) std::cout << "\tdestroying rocSPARSE structures" << std::endl; + // Now clean up + status_ = rocsparse_destroy_spmat_descr(description_); + checkStatus("Failed rocsparse_destroy_spmat_descr"); + hipCheckError(hipFree(temp_buffer)); + + // Move result back to the CPU + if (print_) std::cout << "\tMoving data to CPU" << std::endl; + hipCheckError(hipMemcpyAsync(y_, y_device_, sizeof(T) * m_, hipMemcpyDeviceToHost, s3_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::once: { + // Set up the rocSPARSE structures for the SpMDnV + if (print_) std::cout << "\tCreating rocSPARSE structures" << std::endl; + // Set up the rocSPARSE structures for the SpMDnV + status_ = rocsparse_create_csr_descr(&description_, + m_, + n_, + nnz_, + A_rows_device_, + A_cols_device_, + A_vals_device_, + index_, + index_, + base_, + dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + + status_ = rocsparse_create_dnvec_descr(&x_description_, + n_, + x_device_, + dataType_); + checkStatus("Failed rocsparse_create_dnvec_descr for x"); + + status_ = rocsparse_create_dnvec_descr(&y_description_, + m_, + y_device_, + dataType_); + checkStatus("Failed rocsparse_create_dnvec_descr for y"); + hipCheckError(hipDeviceSynchronize()); + + size_t buffer_size = 0; + status_ = rocsparse_spmv(handle_, + operation_, + &alpha, + description_, + x_description_, + &beta, + y_description_, + dataType_, + algorithm_, + rocsparse_spmv_stage_buffer_size, + &buffer_size, + nullptr); + checkStatus("Failed rocsparse_spmv_ex with rocsparse_spmv_stage_buffer_size"); + hipCheckError(hipDeviceSynchronize()); + + void* temp_buffer; + hipCheckError(hipMalloc(&temp_buffer, buffer_size)); + hipCheckError(hipDeviceSynchronize()); + + status_ = rocsparse_spmv(handle_, + operation_, + &alpha, + description_, + x_description_, + &beta, + y_description_, + dataType_, + algorithm_, + rocsparse_spmv_stage_preprocess, + &buffer_size, + temp_buffer); + checkStatus("Failed rocsparse_spmv_ex with rocsparse_spmv_stage_preprocess"); + hipCheckError(hipDeviceSynchronize()); + + status_ = rocsparse_spmv(handle_, + operation_, + &alpha, + description_, + x_description_, + &beta, + y_description_, + dataType_, + algorithm_, + rocsparse_spmv_stage_compute, + &buffer_size, + temp_buffer); + checkStatus("Failed rocsparse_spmv_ex with rocsparse_spmv_stage_compute"); + hipCheckError(hipDeviceSynchronize()); + + if (print_) std::cout << "\tdestroying rocSPARSE structures" << std::endl; + // Now clean up + status_ = rocsparse_destroy_spmat_descr(description_); + checkStatus("Failed rocsparse_destroy_spmat_descr"); + hipCheckError(hipFree(temp_buffer)); + break; + } + case gpuOffloadType::unified: { + // Set up the rocSPARSE structures for the SpMDnV + if (print_) std::cout << "\tCreating rocSPARSE structures" << std::endl; 
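+          // In the unified-memory path the managed pointers (A_rows_, A_cols_,
+          // A_vals_, x_, y_) are handed straight to rocSPARSE; no explicit device
+          // copies exist, the HIP runtime migrates pages on demand.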
+ // Set up the rocSPARSE structures for the SpMDnV + status_ = rocsparse_create_csr_descr(&description_, + m_, + n_, + nnz_, + A_rows_, + A_cols_, + A_vals_, + index_, + index_, + base_, + dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + + status_ = rocsparse_create_dnvec_descr(&x_description_, + n_, + x_, + dataType_); + checkStatus("Failed rocsparse_create_dnvec_descr for x"); + + status_ = rocsparse_create_dnvec_descr(&y_description_, + m_, + y_, + dataType_); + checkStatus("Failed rocsparse_create_dnvec_descr for y"); + hipCheckError(hipDeviceSynchronize()); + + size_t buffer_size = 0; + status_ = rocsparse_spmv(handle_, + operation_, + &alpha, + description_, + x_description_, + &beta, + y_description_, + dataType_, + algorithm_, + rocsparse_spmv_stage_buffer_size, + &buffer_size, + nullptr); + checkStatus("Failed rocsparse_spmv_ex with rocsparse_spmv_stage_buffer_size"); + hipCheckError(hipDeviceSynchronize()); + + void* temp_buffer; + hipCheckError(hipMalloc(&temp_buffer, buffer_size)); + hipCheckError(hipDeviceSynchronize()); + + status_ = rocsparse_spmv(handle_, + operation_, + &alpha, + description_, + x_description_, + &beta, + y_description_, + dataType_, + algorithm_, + rocsparse_spmv_stage_preprocess, + &buffer_size, + temp_buffer); + checkStatus("Failed rocsparse_spmv_ex with rocsparse_spmv_stage_preprocess"); + hipCheckError(hipDeviceSynchronize()); + + status_ = rocsparse_spmv(handle_, + operation_, + &alpha, + description_, + x_description_, + &beta, + y_description_, + dataType_, + algorithm_, + rocsparse_spmv_stage_compute, + &buffer_size, + temp_buffer); + checkStatus("Failed rocsparse_spmv_ex with rocsparse_spmv_stage_compute"); + hipCheckError(hipDeviceSynchronize()); + + if (print_) std::cout << "\tdestroying rocSPARSE structures" << std::endl; + // Now clean up + status_ = rocsparse_destroy_spmat_descr(description_); + checkStatus("Failed rocsparse_destroy_spmat_descr"); + hipCheckError(hipFree(temp_buffer)); + break; + } + } + } + + void postLoopRequirements() override { + if (print_) std::cout << "Post loop " << std::endl; + switch (offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + // Move result back to the CPU + if (print_) std::cout << "\tMovin data to CPU" << std::endl; + hipCheckError(hipMemcpyAsync(y_, y_device_, sizeof(T) * m_, hipMemcpyDeviceToHost, s3_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + // Ensure all output data resides on host once work has completed + if (print_) std::cout << "\tMovin data to CPU" << std::endl; + hipCheckError(hipMemPrefetchAsync(y_, sizeof(T) * m_, hipCpuDeviceId, s3_)); + // Ensure device has finished all work. 
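+        // (Prefetching to hipCpuDeviceId migrates the managed pages back to host
+        // memory; the synchronize below ensures y_ is resident before the result
+        // is consumed on the CPU.)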
+ hipCheckError(hipDeviceSynchronize()); + break; + } + } + } + + void postCallKernelCleanup() override { + if (print_) std::cout << "Post-kernel cleanup" << std::endl; + if (offload_ == gpuOffloadType::unified) { + if (print_) std::cout << "\tFreeing unified arrays" << std::endl; + hipCheckError(hipFree(A_)); + hipCheckError(hipFree(A_rows_)); + hipCheckError(hipFree(A_cols_)); + hipCheckError(hipFree(A_vals_)); + hipCheckError(hipFree(x_)); + hipCheckError(hipFree(y_)); + } else { + if (print_) std::cout << "\tFreeing CPU arrays" << std::endl; + hipCheckError(hipHostFree((void*)A_)); + hipCheckError(hipHostFree((void*)A_rows_)); + hipCheckError(hipHostFree((void*)A_cols_)); + hipCheckError(hipHostFree((void*)A_vals_)); + hipCheckError(hipHostFree((void*)x_)); + hipCheckError(hipHostFree((void*)y_)); + + if (print_) std::cout << "\tFreeing GPU arrays" << std::endl; + hipCheckError(hipFree(A_rows_device_)); + hipCheckError(hipFree(A_cols_device_)); + hipCheckError(hipFree(A_vals_device_)); + hipCheckError(hipFree(x_device_)); + hipCheckError(hipFree(y_device_)); + } + } + + void checkStatus(std::string message) { + if (status_ != rocsparse_status_success) { + std::cerr << message << std::endl; + exit(1); + } + } + + bool initialised_ = false; + + bool print_ = false; + + rocsparse_status status_; + rocsparse_operation operation_; + rocsparse_handle handle_; + rocsparse_indextype index_; + rocsparse_matrix_type type_; + rocsparse_index_base base_; + rocsparse_datatype dataType_; + rocsparse_spmv_alg algorithm_; + + rocsparse_spmat_descr description_; + rocsparse_dnvec_descr x_description_; + rocsparse_dnvec_descr y_description_; + + + int64_t* A_rows_; + int64_t* A_cols_; + T* A_vals_; + + int64_t* A_rows_device_; + int64_t* A_cols_device_; + T* A_vals_device_; + T* x_device_; + T* y_device_; + + int gpuDevice_; + hipStream_t s1_, s2_, s3_; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + +#endif diff --git a/rocBLAS/spmspm.hh b/rocBLAS/spmspm.hh new file mode 100644 index 0000000..30c2703 --- /dev/null +++ b/rocBLAS/spmspm.hh @@ -0,0 +1,1107 @@ +#pragma once + +#ifdef GPU_ROCBLAS +#include +#include + +#include +#include "../include/kernels/GPU/spmspm.hh" +#include "../include/utilities.hh" +#include "common.hh" + +namespace gpu { +template +class spmspm_gpu : public spmspm { +public: + using spmspm::spmspm; + using spmspm::initInputMatrices; + using spmspm::A_nnz_; + using spmspm::B_nnz_; + using spmspm::m_; + using spmspm::n_; + using spmspm::k_; + using spmspm::A_; + using spmspm::B_; + using spmspm::C_; + using spmspm::offload_; + using spmspm::sparsity_; + using spmspm::C_nnz_; + using spmspm::C_vals_; + using spmspm::C_rows_; + using spmspm::C_cols_; + + ~spmspm_gpu() { + if (initialised_) { + status_ = rocsparse_destroy_handle(handle_); + checkStatus("Failed rocsparse_destroy_handle"); + hipCheckError(hipStreamDestroy(stream_)); + initialised_ = false; + } + } + + void initialise(gpuOffloadType offload, int m, int n, int k, + double sparsity, bool binary = false) override { + if (print_) { + switch (offload) { + case gpuOffloadType::always: { + std::cout << "=========== ALWAYS ===========" << std::endl; + break; + } + case gpuOffloadType::once: { + std::cout << "=========== ONCE ===========" << std::endl; + break; + } + case gpuOffloadType::unified: { + std::cout << "=========== UNIFIED ===========" << std::endl; + break; + } + } + } + + if (print_) std::cout << "Initialising " << m << "x" << k << " . 
" << k << "x" << n << std::endl; + firstRun_ = true; + + m_ = m; + n_ = n; + k_ = k; + sparsity_ = sparsity; + offload_ = offload; + A_nnz_ = 1 + (int64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + B_nnz_ = 1 + (int64_t)((double)k_ * (double)n_ * (1.0 - sparsity_)); + + // Set up rocSPARSE metadata + index_ = rocsparse_indextype_i64; + base_ = rocsparse_index_base_zero; + type_ = rocsparse_matrix_type_general; + operation_ = rocsparse_operation_none; + algorithm_ = rocsparse_spgemm_alg_default; + + if constexpr (std::is_same_v) { + dataType_ = rocsparse_datatype_f32_r; + } else if constexpr (std::is_same_v) { + dataType_ = rocsparse_datatype_f64_r; + } else { + static_assert("Unsupported data type for rocSPARSE"); + } + + if (print_) std::cout << "\tAbout to set up handle and hip streams" << std::endl; + if (!initialised_) { + status_ = rocsparse_create_handle(&handle_); + checkStatus("Failed rocsparse_create_handle"); + + // Get the GPU + hipCheckError(hipGetDevice(&gpuDevice_)); + // Make streams for asynchronous GPU comunication + hipCheckError(hipStreamCreate(&stream_)); + + status_ = rocsparse_set_stream(handle_, stream_); + checkStatus("Failed rocsparse_get_stream"); + } + + if (print_) std::cout << "\tAbout to malloc arrays" << std::endl; + if (offload_ == gpuOffloadType::unified) { + hipCheckError(hipMallocManaged(&A_, sizeof(T) * m_ * k_)); + hipCheckError(hipMallocManaged(&A_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipMallocManaged(&A_cols_, sizeof(int64_t) * A_nnz_)); + hipCheckError(hipMallocManaged(&A_vals_, sizeof(T) * A_nnz_)); + hipCheckError(hipMallocManaged(&B_, sizeof(T) * k_ * n_)); + hipCheckError(hipMallocManaged(&B_rows_, sizeof(int64_t) * (k_ + 1))); + hipCheckError(hipMallocManaged(&B_cols_, sizeof(int64_t) * B_nnz_)); + hipCheckError(hipMallocManaged(&B_vals_, sizeof(T) * B_nnz_)); + hipCheckError(hipMallocManaged(&D_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipMallocManaged(&D_cols_, sizeof(int64_t) * D_nnz_)); + hipCheckError(hipMallocManaged(&D_vals_, sizeof(T) * D_nnz_)); + + hipCheckError(hipDeviceSynchronize()); + } else { + // Host data structures + hipCheckError(hipHostMalloc(&A_, sizeof(T) * m_ * k_)); + hipCheckError(hipHostMalloc(&A_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipHostMalloc(&A_cols_, sizeof(int64_t) * A_nnz_)); + hipCheckError(hipHostMalloc(&A_vals_, sizeof(T) * A_nnz_)); + hipCheckError(hipHostMalloc(&B_, sizeof(T) * k_ * n_)); + hipCheckError(hipHostMalloc(&B_rows_, sizeof(int64_t) * (k_ + 1))); + hipCheckError(hipHostMalloc(&B_cols_, sizeof(int64_t) * B_nnz_)); + hipCheckError(hipHostMalloc(&B_vals_, sizeof(T) * B_nnz_)); + hipCheckError(hipHostMalloc(&D_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipHostMalloc(&D_cols_, sizeof(int64_t) * D_nnz_)); + hipCheckError(hipHostMalloc(&D_vals_, sizeof(T) * D_nnz_)); + hipCheckError(hipDeviceSynchronize()); + + // GPU data structures + hipCheckError(hipMalloc(&A_rows_device_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipMalloc(&A_cols_device_, sizeof(int64_t) * A_nnz_)); + hipCheckError(hipMalloc(&A_vals_device_, sizeof(T) * A_nnz_)); + hipCheckError(hipMalloc(&B_rows_device_, sizeof(int64_t) * (k_ + 1))); + hipCheckError(hipMalloc(&B_cols_device_, sizeof(int64_t) * B_nnz_)); + hipCheckError(hipMalloc(&B_vals_device_, sizeof(T) * B_nnz_)); + hipCheckError(hipMalloc(&D_rows_device_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipMalloc(&D_cols_device_, sizeof(int64_t) * D_nnz_)); + hipCheckError(hipMalloc(&D_vals_device_, sizeof(T) * 
D_nnz_)); + hipCheckError(hipDeviceSynchronize()); + } + + + if (print_) std::cout << "\tInitialising matrices" << std::endl; + uint64_t outputNNZ = 0; + while (outputNNZ == 0) { + initInputMatrices(); + outputNNZ = calcNNZC(); + } + } + +protected: + void toSparseFormat() override { + if (print_) std::cout << "Making sparse now" << std::endl; + int64_t nnz_encountered = 0; + + if (print_) std::cout << "\tA into CSR" << std::endl; + // Convert A to CSR format + A_rows_[0] = 0; + + for (int64_t row = 0; row < m_; row++) { + for (int64_t col = 0; col < k_; col++) { + if (A_[(row * k_) + col] != 0.0) { + A_cols_[nnz_encountered] = col; + A_vals_[nnz_encountered] = static_cast(A_[(row * k_) + col]); + nnz_encountered++; + } + } + A_rows_[row + 1] = nnz_encountered; + } + + // Verify A conversion + if (nnz_encountered != A_nnz_) { + std::cerr << "Warning: A matrix has " << nnz_encountered << " non-zeros, expected " << A_nnz_ << std::endl; + A_nnz_ = nnz_encountered; // Update to actual count + } + + if (print_) std::cout << "\tB into CSR" << std::endl; + // Convert B to CSR format + nnz_encountered = 0; + + B_rows_[0] = 0; + + for (int64_t row = 0; row < k_; row++) { + for (int64_t col = 0; col < n_; col++) { + if (B_[(row * n_) + col] != 0.0) { + B_cols_[nnz_encountered] = col; + B_vals_[nnz_encountered] = static_cast(B_[(row * n_) + col]); + nnz_encountered++; + } + } + B_rows_[row + 1] = nnz_encountered; + } + + // Verify B conversion + if (nnz_encountered != B_nnz_) { + std::cerr << "Warning: B matrix has " << nnz_encountered << " non-zeros, expected " << B_nnz_ << std::endl; + B_nnz_ = nnz_encountered; // Update to actual count + } + + // Make D a possible matrix + D_cols_[0] = 0; + D_vals_[0] = 1.0; + D_rows_[0] = 0; + for (size_t i = 1; i < (m_ + 1); i++) { + D_rows_[i] = 1; + } + + // Ensure synchronization for unified memory + hipCheckError(hipDeviceSynchronize()); + } + +private: + void preLoopRequirements() override { + if (print_) std::cout << "pre-loop stuff" << std::endl; + switch (offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; + hipCheckError(hipMemcpyAsync(A_rows_device_, + A_rows_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(A_cols_device_, + A_cols_, + sizeof(int64_t) * A_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(A_vals_device_, + A_vals_, + sizeof(T) * A_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(B_rows_device_, + B_rows_, + sizeof(int64_t) * (k_ + 1), + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(B_cols_device_, + B_cols_, + sizeof(int64_t) * B_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(B_vals_device_, + B_vals_, + sizeof(T) * B_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(D_rows_device_, + D_rows_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(D_cols_device_, + D_cols_, + sizeof(int64_t) * D_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(D_vals_device_, + D_vals_, + sizeof(T) * D_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; + hipCheckError(hipMemPrefetchAsync(A_rows_, + sizeof(int64_t) * (m_ + 1), + gpuDevice_, + stream_)); + 
hipCheckError(hipMemPrefetchAsync(A_cols_, + sizeof(int64_t) * A_nnz_, + gpuDevice_, + stream_)); + hipCheckError(hipMemPrefetchAsync(A_vals_, + sizeof(T) * A_nnz_, + gpuDevice_, + stream_)); + hipCheckError(hipMemPrefetchAsync(B_rows_, + sizeof(int64_t) * (k_ + 1), + gpuDevice_, + stream_)); + hipCheckError(hipMemPrefetchAsync(B_cols_, + sizeof(int64_t) * B_nnz_, + gpuDevice_, + stream_)); + hipCheckError(hipMemPrefetchAsync(B_vals_, + sizeof(T) * B_nnz_, + gpuDevice_, + stream_)); + hipCheckError(hipMemPrefetchAsync(D_rows_, + sizeof(int64_t) * (m_ + 1), + gpuDevice_, + stream_)); + hipCheckError(hipMemPrefetchAsync(D_cols_, + sizeof(int64_t) * D_nnz_, + gpuDevice_, + stream_)); + hipCheckError(hipMemPrefetchAsync(D_vals_, + sizeof(T) * D_nnz_, + gpuDevice_, + stream_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + } + } + + void callSpmspm() override { + if (print_) std::cout << "Calling spmspm kernel" << std::endl; + switch (offload_) { + case gpuOffloadType::unified: { + size_t buffer_size = 0; + // Check if there are old arrays to get rid of + if (!firstRun_) { + hipCheckError(hipFree(C_rows_)); + hipCheckError(hipFree(C_cols_)); + hipCheckError(hipFree(C_vals_)); + hipCheckError(hipDeviceSynchronize()); + } + if (print_) std::cout << "\tAllocating C rows" << std::endl; + hipCheckError(hipMallocManaged(&C_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipDeviceSynchronize()); + + // Set up the rocSPARSE structures for the MM + if (print_) std::cout << "\tCreating csr descriptions" << std::endl; + status_ = rocsparse_create_csr_descr(&description_A_, m_, k_, A_nnz_, A_rows_, A_cols_, + A_vals_, index_, index_, base_, dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + status_ = rocsparse_create_csr_descr(&description_B_, k_, n_, B_nnz_, B_rows_, B_cols_, + B_vals_, index_, index_, base_, dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + status_ = rocsparse_create_csr_descr(&description_C_, m_, n_, 0, C_rows_, nullptr, + nullptr, index_, index_, base_, dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + status_ = rocsparse_create_csr_descr(&description_D_, m_, n_, D_nnz_, D_rows_, D_cols_, + D_vals_, index_, index_, base_, dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + hipCheckError(hipDeviceSynchronize()); + + if (print_) std::cout << "\tDetermining buffer size" << std::endl; + stage_ = rocsparse_spgemm_stage_buffer_size; + status_ = rocsparse_spgemm(handle_, + operation_, + operation_, + &alpha, + description_A_, + description_B_, + &beta, + description_D_, + description_C_, + dataType_, + algorithm_, + stage_, + &buffer_size, + nullptr); + checkStatus("Failed rocsparse_spgemm with stage=rocsparse_spgemm_stage_buffer_size"); + hipCheckError(hipDeviceSynchronize()); + + if (print_) std::cout << "\tAllocating buffer and C_rows" << std::endl; + void* buffer; + hipCheckError(hipMallocManaged(&buffer, buffer_size)); + hipCheckError(hipDeviceSynchronize()); + + if (print_) std::cout << "\tDetermining nnz" << std::endl; + stage_ = rocsparse_spgemm_stage_nnz; + status_ = rocsparse_spgemm(handle_, + operation_, + operation_, + &alpha, + description_A_, + description_B_, + &beta, + description_D_, + description_C_, + dataType_, + algorithm_, + stage_, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spgemm with stage=rocsparse_spgemm_stage_nnz"); + hipCheckError(hipDeviceSynchronize()); + + if (print_) std::cout << "\tAllocating rows and vals" << std::endl; + int64_t rowC, colC; + status_ = 
rocsparse_spmat_get_size(description_C_, &rowC, &colC, &C_nnz_); + checkStatus("Failed rocsparse_spmat_get_size"); + + hipCheckError(hipMallocManaged(&C_cols_, sizeof(int64_t) * C_nnz_)); + hipCheckError(hipMallocManaged(&C_vals_, sizeof(T) * C_nnz_)); + hipCheckError(hipDeviceSynchronize()); + + status_ = rocsparse_csr_set_pointers(description_C_, C_rows_, C_cols_, C_vals_); + checkStatus("Failed rocsparse_csr_set_pointers"); + hipCheckError(hipDeviceSynchronize()); + + if (print_) std::cout << "\tDoing calculation" << std::endl; + stage_ = rocsparse_spgemm_stage_compute; + status_ = rocsparse_spgemm(handle_, + operation_, + operation_, + &alpha, + description_A_, + description_B_, + &beta, + description_D_, + description_C_, + dataType_, + algorithm_, + stage_, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spgemm with stage=rocsparse_spgemm_stage_compute"); + hipCheckError(hipDeviceSynchronize()); + + + if (print_) std::cout << "\tFreeing buffer and descriptions etc." << std::endl; + // Freeing up buffer + hipCheckError(hipFree(buffer)); + status_ = rocsparse_destroy_spmat_descr(description_A_); + checkStatus("Failing rocsparse_destroy_mat_descr for A"); + status_ = rocsparse_destroy_spmat_descr(description_B_); + checkStatus("Failing rocsparse_destroy_mat_descr for B"); + status_ = rocsparse_destroy_spmat_descr(description_C_); + checkStatus("Failing rocsparse_destroy_mat_descr for C"); + status_ = rocsparse_destroy_spmat_descr(description_D_); + checkStatus("Failing rocsparse_destroy_mat_descr for D"); + hipCheckError(hipDeviceSynchronize()); + firstRun_ = false; + break; + } + case gpuOffloadType::always: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; + hipCheckError(hipMemcpyAsync(A_rows_device_, + A_rows_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(A_cols_device_, + A_cols_, + sizeof(int64_t) * A_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(A_vals_device_, + A_vals_, + sizeof(T) * A_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(B_rows_device_, + B_rows_, + sizeof(int64_t) * (k_ + 1), + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(B_cols_device_, + B_cols_, + sizeof(int64_t) * B_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(B_vals_device_, + B_vals_, + sizeof(T) * B_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(D_rows_device_, + D_rows_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(D_cols_device_, + D_cols_, + sizeof(int64_t) * D_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(D_vals_device_, + D_vals_, + sizeof(T) * D_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipDeviceSynchronize()); + size_t buffer_size = 0; + + if (print_) std::cout << "\tAllocating C rows" << std::endl; + hipCheckError(hipMalloc((void**)&C_rows_device_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipDeviceSynchronize()); + + // Set up the rocSPARSE structures for the MM + if (print_) std::cout << "\tCreating csr descriptions" << std::endl; + status_ = rocsparse_create_csr_descr(&description_A_, m_, k_, A_nnz_, A_rows_device_, A_cols_device_, + A_vals_device_, index_, index_, base_, dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + status_ = rocsparse_create_csr_descr(&description_B_, k_, n_, B_nnz_, B_rows_device_, B_cols_device_, + B_vals_device_, index_, index_, base_, 
dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + status_ = rocsparse_create_csr_descr(&description_C_, m_, n_, 0, C_rows_device_, nullptr, + nullptr, index_, index_, base_, dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + status_ = rocsparse_create_csr_descr(&description_D_, m_, n_, D_nnz_, D_rows_device_, D_cols_device_, + D_vals_device_, index_, index_, base_, dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + + + if (print_) std::cout << "\tDetermining buffer size" << std::endl; + stage_ = rocsparse_spgemm_stage_buffer_size; + status_ = rocsparse_spgemm(handle_, + operation_, + operation_, + &alpha, + description_A_, + description_B_, + &beta, + description_D_, + description_C_, + dataType_, + algorithm_, + stage_, + &buffer_size, + nullptr); + checkStatus("Failed rocsparse_spgemm with stage=rocsparse_spgemm_stage_buffer_size"); + + if (print_) std::cout << "\tAllocating buffer and C_rows" << std::endl; + void* buffer; + hipCheckError(hipMalloc(&buffer, buffer_size)); + + if (print_) std::cout << "\tDetermining nnz" << std::endl; + stage_ = rocsparse_spgemm_stage_nnz; + status_ = rocsparse_spgemm(handle_, + operation_, + operation_, + &alpha, + description_A_, + description_B_, + &beta, + description_D_, + description_C_, + dataType_, + algorithm_, + stage_, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spgemm with stage=rocsparse_spgemm_stage_nnz"); + + if (print_) std::cout << "\tAllocating rows and vals" << std::endl; + int64_t rowC, colC; + status_ = rocsparse_spmat_get_size(description_C_, &rowC, &colC, &C_nnz_); + checkStatus("Failed rocsparse_spmat_get_size"); + + hipCheckError(hipMalloc((void**)&C_cols_device_, sizeof(int64_t) * C_nnz_)); + hipCheckError(hipMalloc((void**)&C_vals_device_, sizeof(T) * C_nnz_)); + + status_ = rocsparse_csr_set_pointers(description_C_, + C_rows_device_, + C_cols_device_, + C_vals_device_); + checkStatus("Failed rocsparse_csr_set_pointers"); + + if (print_) std::cout << "\tDoing calculation" << std::endl; + stage_ = rocsparse_spgemm_stage_compute; + status_ = rocsparse_spgemm(handle_, + operation_, + operation_, + &alpha, + description_A_, + description_B_, + &beta, + description_D_, + description_C_, + dataType_, + algorithm_, + stage_, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spgemm with stage=rocsparse_spgemm_stage_compute"); + + + if (print_) std::cout << "\tFreeing buffer and descriptions etc." 
<< std::endl; + // Freeing up buffer + hipCheckError(hipFree(buffer)); + status_ = rocsparse_destroy_spmat_descr(description_A_); + checkStatus("Failing rocsparse_destroy_mat_descr for A"); + status_ = rocsparse_destroy_spmat_descr(description_B_); + checkStatus("Failing rocsparse_destroy_mat_descr for B"); + status_ = rocsparse_destroy_spmat_descr(description_C_); + checkStatus("Failing rocsparse_destroy_mat_descr for C"); + status_ = rocsparse_destroy_spmat_descr(description_D_); + checkStatus("Failing rocsparse_destroy_mat_descr for D"); + + if (print_) std::cout << "\tAllocating host C arrays" << std::endl; + // Allocate host arrays for C + hipCheckError(hipHostMalloc((void**)&C_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipHostMalloc((void**)&C_cols_, sizeof(int64_t) * C_nnz_)); + hipCheckError(hipHostMalloc((void**)&C_vals_, sizeof(T) * C_nnz_)); + hipCheckError(hipDeviceSynchronize()); + + // Moving data to CPU + if (print_) std::cout << "\tTransfering data back to CPU" << std::endl; + hipCheckError(hipMemcpyAsync(C_rows_, + C_rows_device_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyDeviceToHost, + stream_)); + hipCheckError(hipMemcpyAsync(C_cols_, + C_cols_device_, + sizeof(int64_t) * C_nnz_, + hipMemcpyDeviceToHost, + stream_)); + hipCheckError(hipMemcpyAsync(C_vals_, + C_vals_device_, + sizeof(T) * C_nnz_, + hipMemcpyDeviceToHost, + stream_)); + hipCheckError(hipDeviceSynchronize()); + + // Freeing stuff up + if (print_) std::cout << "\tFreeing C arrays (host and device)" << std::endl; + hipCheckError(hipFree(C_rows_device_)); + hipCheckError(hipFree(C_cols_device_)); + hipCheckError(hipFree(C_vals_device_)); + hipCheckError(hipFree(C_rows_)); + hipCheckError(hipFree(C_cols_)); + hipCheckError(hipFree(C_vals_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::once: { + size_t buffer_size; + // Check if there are old arrays to get rid of + if (!firstRun_) { + hipCheckError(hipFree(C_rows_device_)); + hipCheckError(hipFree(C_cols_device_)); + hipCheckError(hipFree(C_vals_device_)); + hipCheckError(hipDeviceSynchronize()); + } + + if (print_) std::cout << "\tAllocating C rows" << std::endl; + hipCheckError(hipMalloc((void**)&C_rows_device_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipDeviceSynchronize()); + + // Set up the rocSPARSE structures for the MM + if (print_) std::cout << "\tCreating csr descriptions" << std::endl; + status_ = rocsparse_create_csr_descr(&description_A_, + m_, + k_, + A_nnz_, + A_rows_device_, + A_cols_device_, + A_vals_device_, + index_, + index_, + base_, + dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + status_ = rocsparse_create_csr_descr(&description_B_, + k_, + n_, + B_nnz_, + B_rows_device_, + B_cols_device_, + B_vals_device_, + index_, + index_, + base_, + dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + status_ = rocsparse_create_csr_descr(&description_C_, + m_, + n_, + 0, + C_rows_device_, + nullptr, + nullptr, + index_, + index_, + base_, + dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + status_ = rocsparse_create_csr_descr(&description_D_, + m_, + n_, + D_nnz_, + D_rows_device_, + D_cols_device_, + D_vals_device_, + index_, + index_, + base_, + dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + + if (print_) std::cout << "\tDetermining buffer size" << std::endl; + stage_ = rocsparse_spgemm_stage_buffer_size; + status_ = rocsparse_spgemm(handle_, + operation_, + operation_, + &alpha, + description_A_, + description_B_, + &beta, + 
description_D_, + description_C_, + dataType_, + algorithm_, + stage_, + &buffer_size, + nullptr); + checkStatus("Failed rocsparse_spgemm with stage=rocsparse_spgemm_stage_buffer_size"); + + if (print_) std::cout << "\tAllocating buffer and C_rows" << std::endl; + void* buffer; + hipCheckError(hipMalloc(&buffer, buffer_size)); + + if (print_) std::cout << "\tDetermining nnz" << std::endl; + stage_ = rocsparse_spgemm_stage_nnz; + status_ = rocsparse_spgemm(handle_, + operation_, + operation_, + &alpha, + description_A_, + description_B_, + &beta, + description_D_, + description_C_, + dataType_, + algorithm_, + stage_, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spgemm with stage=rocsparse_spgemm_stage_nnz"); + + if (print_) std::cout << "\tAllocating rows and vals" << std::endl; + int64_t rowC, colC; + status_ = rocsparse_spmat_get_size(description_C_, &rowC, &colC, &C_nnz_); + checkStatus("Failed rocsparse_spmat_get_size"); + + hipCheckError(hipMalloc((void**)&C_cols_device_, sizeof(int64_t) * C_nnz_)); + hipCheckError(hipMalloc((void**)&C_vals_device_, sizeof(T) * C_nnz_)); + + status_ = rocsparse_csr_set_pointers(description_C_, + C_rows_device_, + C_cols_device_, + C_vals_device_); + checkStatus("Failed rocsparse_csr_set_pointers"); + + if (print_) std::cout << "\tDoing calculation" << std::endl; + stage_ = rocsparse_spgemm_stage_compute; + status_ = rocsparse_spgemm(handle_, + operation_, + operation_, + &alpha, + description_A_, + description_B_, + &beta, + description_D_, + description_C_, + dataType_, + algorithm_, + stage_, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spgemm with stage=rocsparse_spgemm_stage_compute"); + + + if (print_) std::cout << "\tFreeing buffer and descriptions etc." << std::endl; + // Freeing up buffer + hipCheckError(hipFree(buffer)); + status_ = rocsparse_destroy_spmat_descr(description_A_); + checkStatus("Failing rocsparse_destroy_mat_descr for A"); + status_ = rocsparse_destroy_spmat_descr(description_B_); + checkStatus("Failing rocsparse_destroy_mat_descr for B"); + status_ = rocsparse_destroy_spmat_descr(description_C_); + checkStatus("Failing rocsparse_destroy_mat_descr for C"); + status_ = rocsparse_destroy_spmat_descr(description_D_); + checkStatus("Failing rocsparse_destroy_mat_descr for D"); + firstRun_ = false; + break; + } + } + } + + void postLoopRequirements() override { + if (print_) std::cout << "Post-Loop stuff" << std::endl; + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + if (print_) std::cout << "\tAllocating host arrays for C" << std::endl; + // Allocate host arrays for C + hipCheckError(hipHostMalloc((void**)&C_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipHostMalloc((void**)&C_cols_, sizeof(int64_t) * C_nnz_)); + hipCheckError(hipHostMalloc((void**)&C_vals_, sizeof(T) * C_nnz_)); + hipCheckError(hipDeviceSynchronize()); + + + if (print_) std::cout << "\tMoving C data to host" << std::endl; + // Moving data to CPU + hipCheckError(hipMemcpyAsync(C_rows_, + C_rows_device_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyDeviceToHost, + stream_)); + hipCheckError(hipMemcpyAsync(C_cols_, + C_cols_device_, + sizeof(int64_t) * C_nnz_, + hipMemcpyDeviceToHost, + stream_)); + hipCheckError(hipMemcpyAsync(C_vals_, + C_vals_device_, + sizeof(T) * C_nnz_, + hipMemcpyDeviceToHost, + stream_)); + hipCheckError(hipDeviceSynchronize()); + + // Freeing stuff up + if (print_) std::cout << "\tFreeing C arrays (host and device)" << std::endl; + 
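+        // C's sparsity pattern (and therefore C_nnz_) is only known after the
+        // SpGEMM nnz stage, so the C buffers are re-created on every run; they are
+        // released here once the result has been copied back to the host.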
hipCheckError(hipFree(C_rows_device_)); + hipCheckError(hipFree(C_cols_device_)); + hipCheckError(hipFree(C_vals_device_)); + hipCheckError(hipFree(C_rows_)); + hipCheckError(hipFree(C_cols_)); + hipCheckError(hipFree(C_vals_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + if (print_) std::cout << "\tMoving data to CPU" << std::endl; + hipCheckError(hipMemPrefetchAsync(C_rows_, + sizeof(int64_t) * (m_ + 1), + hipCpuDeviceId, + stream_)); + hipCheckError(hipMemPrefetchAsync(C_cols_, + sizeof(int64_t) * C_nnz_, + hipCpuDeviceId, + stream_)); + hipCheckError(hipMemPrefetchAsync(C_vals_, + sizeof(T) * C_nnz_, + hipCpuDeviceId, + stream_)); + hipCheckError(hipDeviceSynchronize()); + if (print_) std::cout << "\tFreeing C arrays" << std::endl; + hipCheckError(hipFree(C_rows_)); + hipCheckError(hipFree(C_cols_)); + hipCheckError(hipFree(C_vals_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + } + } + + void postCallKernelCleanup() override { + if (print_) std::cout << "Post-kernel clean up" << std::endl; + if (offload_ == gpuOffloadType::unified) { + if (print_) std::cout << "Freeing unified memory arrays for A and B" << std::endl; + hipCheckError(hipFree(A_)); + hipCheckError(hipFree(A_rows_)); + hipCheckError(hipFree(A_cols_)); + hipCheckError(hipFree(A_vals_)); + hipCheckError(hipFree(B_)); + hipCheckError(hipFree(B_rows_)); + hipCheckError(hipFree(B_cols_)); + hipCheckError(hipFree(B_vals_)); + hipCheckError(hipFree(D_rows_)); + hipCheckError(hipFree(D_cols_)); + hipCheckError(hipFree(D_vals_)); + hipCheckError(hipDeviceSynchronize()); + } else { + if (print_) std::cout << "Freeing host arrays for A and B" << std::endl; + hipCheckError(hipHostFree((void*)A_)); + hipCheckError(hipHostFree((void*)A_rows_)); + hipCheckError(hipHostFree((void*)A_cols_)); + hipCheckError(hipHostFree((void*)A_vals_)); + hipCheckError(hipHostFree((void*)B_)); + hipCheckError(hipHostFree((void*)B_rows_)); + hipCheckError(hipHostFree((void*)B_cols_)); + hipCheckError(hipHostFree((void*)B_vals_)); + hipCheckError(hipHostFree((void*)D_rows_)); + hipCheckError(hipHostFree((void*)D_cols_)); + hipCheckError(hipHostFree((void*)D_vals_)); + hipCheckError(hipDeviceSynchronize()); + if (print_) std::cout << "Freeing GPU arrays for A and B" << std::endl; + hipCheckError(hipFree(A_rows_device_)); + hipCheckError(hipFree(A_cols_device_)); + hipCheckError(hipFree(A_vals_device_)); + hipCheckError(hipFree(B_rows_device_)); + hipCheckError(hipFree(B_cols_device_)); + hipCheckError(hipFree(B_vals_device_)); + hipCheckError(hipFree(D_rows_device_)); + hipCheckError(hipFree(D_cols_device_)); + hipCheckError(hipFree(D_vals_device_)); + hipCheckError(hipDeviceSynchronize()); + } + } + + void checkStatus(std::string message) { + if (status_ != rocsparse_status_success) { + std::cerr << message << " error = "; + switch (status_) { + case rocsparse_status_success: { + std::cerr << "Success" << std::endl; + break; + } + case rocsparse_status_invalid_handle: { + std::cerr << "invalid handle (handle not initialized, invalid or null.)" << std::endl; + break; + } + case rocsparse_status_not_implemented: { + std::cerr << "not imlpemented (function is not implemented.)" << std::endl; + break; + } + case rocsparse_status_invalid_pointer: { + std::cerr << "invalid pointer (invalid pointer parameter.)" << std::endl; + break; + } + case rocsparse_status_invalid_size: { + std::cerr << "invalid size (invalid size parameter.)" << std::endl; + break; + } + case rocsparse_status_memory_error: { + 
          std::cerr << "memory error (failed memory allocation, copy, dealloc.)" << std::endl;
+          break;
+        }
+        case rocsparse_status_internal_error: {
+          std::cerr << "internal error (other internal library failure.)" << std::endl;
+          break;
+        }
+        case rocsparse_status_invalid_value: {
+          std::cerr << "invalid value (invalid value parameter.)" << std::endl;
+          break;
+        }
+        case rocsparse_status_arch_mismatch: {
+          std::cerr << "arch mismatch (device arch is not supported.)" << std::endl;
+          break;
+        }
+        case rocsparse_status_zero_pivot: {
+          std::cerr << "zero pivot (encountered zero pivot.)" << std::endl;
+          break;
+        }
+        case rocsparse_status_not_initialized: {
+          std::cerr << "not initialized (descriptor has not been initialized.)" << std::endl;
+          break;
+        }
+        case rocsparse_status_type_mismatch: {
+          std::cerr << "type mismatch (index types do not match.)" << std::endl;
+          break;
+        }
+        case rocsparse_status_requires_sorted_storage: {
+          std::cerr << "requires sorted storage (sorted storage required.)" << std::endl;
+          break;
+        }
+        case rocsparse_status_thrown_exception: {
+          std::cerr << "thrown exception (exception being thrown.)" << std::endl;
+          break;
+        }
+        default: {
+          std::cerr << "Not a known status enum" << std::endl;
+          break;
+        }
+      }
+      exit(1);
+    }
+  }
+
+  // Count the structural non-zeros of C = A * B: entry (row, col) is counted if
+  // any inner index has both A(row, entry) and B(entry, col) non-zero
+  // (numerical cancellation is ignored).
+  uint64_t calcNNZC() {
+    uint64_t nnzSoFar = 0;
+    for (size_t row = 0; row < m_; row++) {
+      for (size_t col = 0; col < n_; col++) {
+        for (size_t entry = 0; entry < k_; entry++) {
+          if (A_[row * k_ + entry] != 0 && B_[entry * n_ + col] != 0) {
+            nnzSoFar++;
+            break;
+          }
+        }
+      }
+    }
+    if (print_) std::cout << "Calculated nnzC = " << nnzSoFar << std::endl;
+    return nnzSoFar;
+  }
+
+  void printMatrices() {
+    std::cout << "================ Printing matrices ================" << std::endl;
+    std::cout << "A matrix dense:" << std::endl;
+    for (size_t i = 0; i < m_; i++) {
+      for (size_t j = 0; j < k_; j++) {
+        std::cout << A_[i * k_ + j] << " ";
+      }
+      std::cout << std::endl;
+    }
+    std::cout << "A matrix CSR:" << std::endl;
+    std::cout << "\tRows: ";
+    for (size_t i = 0; i < m_ + 1; i++) {
+      std::cout << A_rows_[i] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "\tCols: ";
+    for (size_t i = 0; i < A_nnz_; i++) {
+      std::cout << A_cols_[i] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "\tVals: ";
+    for (size_t i = 0; i < A_nnz_; i++) {
+      std::cout << A_vals_[i] << " ";
+    }
+    std::cout << std::endl;
+
+    std::cout << "---------------------------------------------------" << std::endl;
+
+    std::cout << "B matrix dense:" << std::endl;
+    for (size_t i = 0; i < k_; i++) {
+      for (size_t j = 0; j < n_; j++) {
+        std::cout << B_[i * n_ + j] << " ";
+      }
+      std::cout << std::endl;
+    }
+    std::cout << "B matrix CSR:" << std::endl;
+    std::cout << "\tRows: ";
+    for (size_t i = 0; i < k_ + 1; i++) {
+      std::cout << B_rows_[i] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "\tCols: ";
+    for (size_t i = 0; i < B_nnz_; i++) {
+      std::cout << B_cols_[i] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "\tVals: ";
+    for (size_t i = 0; i < B_nnz_; i++) {
+      std::cout << B_vals_[i] << " ";
+    }
+    std::cout << std::endl;
+
+    std::cout << "---------------------------------------------------" << std::endl;
+
+    std::cout << "D matrix CSR:" << std::endl;
+    std::cout << "\tRows: ";
+    for (size_t i = 0; i < m_ + 1; i++) {
+      std::cout << D_rows_[i] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "\tCols: ";
+    for (size_t i = 0; i < D_nnz_; i++) {
+      std::cout << D_cols_[i] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "\tVals: ";
+    for (size_t i = 0; i < D_nnz_; i++) {
+      std::cout << D_vals_[i] << " ";
+    }
+    std::cout << std::endl;
+
+    std::cout << "================ Matrices printed! ================" << std::endl;
+  }
+
+  bool print_ = true;
+  bool initialised_ = false;
+  bool firstRun_ = false;
+
+  rocsparse_handle handle_;
+  rocsparse_operation operation_;
+  rocsparse_status status_;
+  rocsparse_indextype index_;
+  rocsparse_index_base base_;
+  rocsparse_matrix_type type_;
+  rocsparse_datatype dataType_;
+  rocsparse_spgemm_stage stage_;
+  rocsparse_spgemm_alg algorithm_;
+
+  rocsparse_spmat_descr description_A_, description_B_, description_C_, description_D_;
+
+  int64_t* A_rows_;
+  int64_t* A_cols_;
+  T* A_vals_;
+  int64_t* B_rows_;
+  int64_t* B_cols_;
+  T* B_vals_;
+
+  int64_t* A_rows_device_;
+  int64_t* A_cols_device_;
+  T* A_vals_device_;
+  int64_t* B_rows_device_;
+  int64_t* B_cols_device_;
+  T* B_vals_device_;
+  int64_t* C_rows_device_;
+  int64_t* C_cols_device_;
+  T* C_vals_device_;
+
+  int64_t* D_rows_;
+  int64_t* D_cols_;
+  T* D_vals_;
+  int64_t* D_rows_device_;
+  int64_t* D_cols_device_;
+  T* D_vals_device_;
+  int64_t D_nnz_ = 1;
+
+  int gpuDevice_;
+  hipStream_t stream_;
+
+  const T alpha = ALPHA;
+  const T beta = BETA;
+};
+}  // namespace gpu
+
+#endif
diff --git a/src/main.cc b/src/main.cc
index 2d046e3..b98350c 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -3,16 +3,56 @@
 int iters = 10;
 int startDim = 1;
 int upperLimit = 128;
+int step = 1;
+double sparsity = 0.99;
+// GEMV kernels
+bool doSgemv = true;
+bool doDgemv = true;
+// Sparse GEMV kernels
+bool doSspmdnv = true;
+bool doDspmdnv = true;
+// GEMM kernels
+bool doSgemm = true;
+bool doDgemm = true;
+// Sparse GEMM kernels
+bool doSspmdnm = true;
+bool doDspmdnm = true;
+// Sparse-sparse matrix multiplication kernels
+bool doSspmspm = true;
+bool doDspmspm = true;
 bool doCpu = CPU_ENABLED;
 bool doGpu = GPU_ENABLED;
+matrixType type = matrixType::random;
+
 std::string CSV_DIR = "CSV_Results";
 int main(int argc, char** argv) {
   getParameters(argc, argv);
   printBenchmarkConfig(iters, upperLimit);
+#ifdef CPU_ARMPL
+  if (doSspmdnm || doDspmdnm) {
+    std::cout << "WARNING - ArmPL does not currently provide a Sparse Matrix-Dense Matrix kernel. Disabling Sparse Matrix-Dense Matrix tests." << std::endl;
+    doSspmdnm = false;
+    doDspmdnm = false;
+  }
+#endif
+
+#ifdef CPU_NVPL
+  if (doSspmdnm || doDspmdnm) {
+    std::cout << "WARNING - NVPL does not currently provide a Sparse Matrix-Dense Matrix kernel. Disabling Sparse Matrix-Dense Matrix tests." << std::endl;
+    doSspmdnm = false;
+    doDspmdnm = false;
+  }
+  if (doSspmspm || doDspmspm) {
+    std::cout << "WARNING - NVPL does not currently provide a Sparse Matrix-Sparse Matrix kernel. Disabling Sparse Matrix-Sparse Matrix tests." << std::endl;
+    doSspmspm = false;
+    doDspmspm = false;
+  }
+#endif
+
 if (!doCpu && !doGpu) {
   std::cout << "Finished!" << std::endl;
   exit(0);
@@ -28,41 +68,106 @@ int main(int argc, char** argv) {
 std::cout << "All results will be saved in CSV files at '" << absPath << "'" << std::endl << std::endl;
+// -------- GEMV --------
+  // Single-Precision GEMV
+  if (doSgemv) {
+    std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl;
+    doGemv sgemv(std::string(absPath), iters, startDim, upperLimit,
+                 step, doCpu, doGpu);
+    sgemv.collectData();
+    std::cout << "Finished!"
<< std::endl; + } + + // Double-Precision GEMV + if (doDgemv) { + std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl; + doGemv dgemv(std::string(absPath), iters, startDim, upperLimit, + step, doCpu, doGpu); + dgemv.collectData(); + std::cout << "Finished!" << std::endl; + } + +// // -------- GEMM -------- +// // Single-Precision GEMM + if (doSgemm) { + std::cout << std::endl << "Comparing SGEMM Kernels:" << std::endl; + doGemm sgemm(std::string(absPath), iters, startDim, upperLimit, + step, doCpu, doGpu); + sgemm.collectData(); + std::cout << "Finished!" << std::endl; + } + + // Double-Precision GEMM + if (doDgemm) { + std::cout << std::endl << "Comparing DGEMM Kernels:" << std::endl; + doGemm dgemm(std::string(absPath), iters, startDim, upperLimit, + step, doCpu, doGpu); + dgemm.collectData(); + std::cout << "Finished!" << std::endl; + } - // -------- GEMM -------- - // SGEMM Comparison - std::cout << std::endl << "Comparing SGEMM Kernels:" << std::endl; - doGemm sgemm(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu); - sgemm.collectData(); - std::cout << "Finished!" << std::endl; - - // DGEMM Comparison - std::cout << std::endl << "Comparing DGEMM Kernels:" << std::endl; - doGemm dgemm(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu); - dgemm.collectData(); - std::cout << "Finished!" << std::endl; - - // -------- GEMV -------- - // SGEMV Comparison - std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl; - doGemv sgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu); - sgemv.collectData(); - std::cout << "Finished!" << std::endl; - - // DGEMV Comparison - std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl; - doGemv dgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu); - dgemv.collectData(); - std::cout << "Finished!" << std::endl; + // -------- SPMDNV -------- + // Single-Precision Sparse Matrix-Dense Vector + if (doSspmdnv) { + std::cout << std::endl << "Comparing SSPMDNV Kernels:" << std::endl; + doSpmdnv sspmdnv(std::string(absPath), iters, startDim, upperLimit, + step, sparsity, type, doCpu, doGpu); + sspmdnv.collectData(); + std::cout << "Finished!" << std::endl; + } + + // Double-Precision Sparse Matrix-Dense Vector + if (doDspmdnv) { + std::cout << std::endl << "Comparing DSPMDNV Kernels:" << std::endl; + doSpmdnv dspmdnv(std::string(absPath), iters, startDim, upperLimit, + step, sparsity, type, doCpu, doGpu); + dspmdnv.collectData(); + std::cout << "Finished!" << std::endl; + } + + // // -------- SPMDNM -------- + // // Single-Precision Sparse Matrix-Dense Matrix + if (doSspmdnm) { + std::cout << std::endl << "Comparing SSpMDnM Kernels:" << std::endl; + doSpmdnm sspmdnm(std::string(absPath), iters, startDim, upperLimit, + step, sparsity, type, doCpu, doGpu); + sspmdnm.collectData(); + std::cout << "Finished!" << std::endl; + } + + // Double-Precision Sparse Matrix-Dense Matrix + if (doDspmdnm) { + std::cout << std::endl << "Comparing DSpMDnM Kernels:" << std::endl; + doSpmdnm dspmdnm(std::string(absPath), iters, startDim, upperLimit, + step, sparsity, type, doCpu, doGpu); + dspmdnm.collectData(); + std::cout << "Finished!" 
<< std::endl; + } + + // -------- SPMSPM -------- + // Single-Precision Sparse Matrix-Sparse Matrix + if (doSspmspm) { + std::cout << std::endl << "Comparing SSpMSpM Kernels:" << std::endl; + doSpmspm sspmspm(std::string(absPath), iters, startDim, upperLimit, + step, sparsity, type, doCpu, doGpu); + sspmspm.collectData(); + std::cout << "Finished!" << std::endl; + } + + // Double-Precision Sparse Matrix-Sparse Matrix + if (doDspmspm) { + std::cout << std::endl << "Comparing DSpMSpM Kernels:" << std::endl; + doSpmspm dspmspm(std::string(absPath), iters, startDim, upperLimit, + step, sparsity, type, doCpu, doGpu); + dspmspm.collectData(); + std::cout << "Finished!" << std::endl; + } free(absPath); return 0; } + void printBenchmarkConfig(const int iters, const int upperLimit) { std::string cpuEnabledStr = (doCpu) ? "True" : "False"; std::string gpuEnabledStr = (doGpu) ? "True" : "False"; @@ -71,16 +176,33 @@ void printBenchmarkConfig(const int iters, const int upperLimit) { (getenv("BLIS_NUM_THREADS") != NULL) ? atoi(getenv("BLIS_NUM_THREADS")) : 1; #else - (getenv("OMP_NUM_THREADS") != NULL) ? atoi(getenv("OMP_NUM_THREADS")) : 1; + (getenv("OMP_NUM_THREADS") != nullptr) ? atoi(getenv("OMP_NUM_THREADS")) : 1; #endif const char* ompProcBind = - (getenv("OMP_PROC_BIND") != NULL) ? getenv("OMP_PROC_BIND") : "Not Set"; + (getenv("OMP_PROC_BIND") != nullptr) ? getenv("OMP_PROC_BIND") : "Not " + "Set"; const char* ompPlaces = - (getenv("OMP_PLACES") != NULL) ? getenv("OMP_PLACES") : "Not Set"; + (getenv("OMP_PLACES") != nullptr) ? getenv("OMP_PLACES") : "Not Set"; + const char* matrixType; + switch (type) { + case matrixType::rmat: + matrixType = "rMAT"; + break; + case matrixType::random: + matrixType = "random"; + break; + case matrixType::finiteElements: + matrixType = "finiteElements"; + break; + default: + matrixType = "Unknown"; + break; + } std::cout << "GPU BLAS Offload Benchmark:" << std::endl; std::cout << "\tIterations per Kernel: " << iters << std::endl; std::cout << "\tStarting Problem Dimension: " << startDim << std::endl; std::cout << "\tMaximum Problem Dimension: " << upperLimit << std::endl; + std::cout << "\tSparse Matrix Type: " << matrixType << std::endl; std::cout << "\tCPU Kernels Enabled: " << cpuEnabledStr << std::endl; std::cout << "\tCPU Library: " << CPU_LIB_NAME << std::endl; std::cout << "\tGPU Kernels Enabled: " << gpuEnabledStr << std::endl; @@ -112,7 +234,7 @@ int parseInt(const char* str) { return strlen(next) ? 
-1 : value; } -void getParameters(int argc, char* argv[]) { +void getParameters(int argc, char** argv) { for (int i = 1; i < argc; i++) { if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i")) { if (++i >= argc || (iters = parseInt(argv[i])) < 0) { @@ -137,17 +259,62 @@ void getParameters(int argc, char* argv[]) { << std::endl; exit(1); } + } else if (!strcmp(argv[i], "--step")) { + if (++i >= argc || (step = parseInt(argv[i])) < 0) { + std::cout << "ERROR - Invalid dimension step size" << std::endl; + exit(1); + } } else if (!strcmp(argv[i], "--no_cpu")) { doCpu = false; } else if (!strcmp(argv[i], "--no_gpu")) { doGpu = false; - } else if (!strcmp(argv[i], "--output_dir") || !strcmp(argv[i], "-o")) { - if (++i >= argc) { - std::cout << "ERROR - Invalid output directory" << std::endl; + } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) { + std::string kernelList = argv[++i]; + doSgemm = (kernelList.find("sgemm") != std::string::npos); + doDgemm = (kernelList.find("dgemm") != std::string::npos); + doSspmdnm = (kernelList.find("sspmdnm") != std::string::npos); + doDspmdnm = (kernelList.find("dspmdnm") != std::string::npos); + doSspmspm = (kernelList.find("sspmspm") != std::string::npos); + doDspmspm = (kernelList.find("dspmspm") != std::string::npos); + doSgemv = (kernelList.find("sgemv") != std::string::npos); + doDgemv = (kernelList.find("dgemv") != std::string::npos); + doSspmdnv = (kernelList.find("sspmdnv") != std::string::npos); + doDspmdnv = (kernelList.find("dspmdnv") != std::string::npos); + + if (!doSgemv && !doSspmdnv && !doSgemm && !doSspmdnm && !doSspmspm && + !doDgemv && !doDspmdnv && !doDgemm && !doDspmdnm && !doDspmspm) { + std::cout << "ERROR - no implemented kernels in list" << std::endl; exit(1); } else { CSV_DIR = argv[i]; } + } else if (!strcmp(argv[i], "--sparsity")) { + if (++i >= argc || (sparsity = std::stod(argv[i])) < 0 || + sparsity >= 1.00) { + std::cout << "ERROR - Invalid sparsity value" << std::endl; + exit(1); + } + } else if (!strcmp(argv[i], "--matrix_type") || !strcmp(argv[i], "-t")) { + if (++i >= argc) { + std::cout << "ERROR - No matrix type specified" << std::endl; + exit(1); + } else if (!strcmp(argv[i], "rmat")) { + type = matrixType::rmat; + } else if (!strcmp(argv[i], "random")) { + type = matrixType::random; + } else if (!strcmp(argv[i], "finiteElements")) { + type = matrixType::finiteElements; + } else { + std::cout << "ERROR - Unrecognized matrix type '" << argv[i] + << "'" << std::endl; + exit(1); + } + } else if (!strcmp(argv[i], "--output_dir") || !strcmp(argv[i], "-o")) { + if (++i >= argc) { + std::cout << "ERROR - No output directory specified" << std::endl; + exit(1); + } + CSV_DIR = argv[i]; } else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) { std::cout << std::endl; std::cout << "Usage: ./gpu-blob [OPTIONS]" << std::endl << std::endl; @@ -158,19 +325,35 @@ void getParameters(int argc, char* argv[]) { << std::endl; std::cout << " --no_gpu Disable all GPU kernel Runs" << std::endl; - std::cout - << " -o --output_dir The CSV file output directory" - << std::endl; + std::cout << " -o --output_dir The CSV file output directory" + << std::endl; std::cout << " -i --iterations I Repeat each kernel I times " - "(default: " - << iters << ")" << std::endl; + "(default: " << iters << ")" + << std::endl; std::cout << " -s --start_dimension S First value of M, N, K is S " - "(default: " - << startDim << ")" << std::endl; + "(default: " << startDim << ")" + << std::endl; + std::cout << " --step St Step size 
between values of M, N, K" + "(default: " << step << ")" + << std::endl; std::cout << " -d --dimension_limit D Max value of M, N, K is D " - "(default: " - << upperLimit << ")" << std::endl; - std::cout << std::endl; + "(default: " << upperLimit << ")" + << std::endl; + std::cout << " -k --kernels Comma-separated list of " + "kernels to be run. Options are sgemm, dgemm, sspmdnm, " + "dspmdnm, sspmspm, dspmspm, sgemv, dgemv, sspmdnv, dspmdnv " + "(default: `-k sgemm,dgemm,sspmdnm,dspmdnm,sspmspm,dspmspm," + "sgemv,dgemv,sspmdnv,dspmdnv`)" + << std::endl; + std::cout << " --sparsity Sp Sparsity value, between 0 " + "and 1 (double), to be used by the sparse BLAS kernels. " + "Matrices with be generated with this sparsity value. " + "Defaults to 0.99" + << std::endl; + std::cout << " -t --matrix_type M Type of sparse matrix to use." + ". Only applies to sparse kernels. Options are rmat, random" + ", finiteElements (default -t random)" + << std::endl; exit(0); } else { std::cout << "Unrecognized argument '" << argv[i] << "' (try '--help')" @@ -178,4 +361,4 @@ void getParameters(int argc, char* argv[]) { exit(1); } } -} \ No newline at end of file +}
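A typical invocation exercising the new options might look like the following (hypothetical argument values; flag names as defined in getParameters above): ./gpu-blob --iterations 10 --start_dimension 32 --dimension_limit 2048 --step 32 --sparsity 0.995 --matrix_type rmat --kernels sgemv,sspmdnv,sspmspm --output_dir CSV_Results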