diff --git a/blackbox.simx.cache b/blackbox.simx.cache
new file mode 100644
index 000000000..32e551ea1
--- /dev/null
+++ b/blackbox.simx.cache
@@ -0,0 +1 @@
+-DNUM_CLUSTERS=1 -DNUM_CORES=16 -DNUM_WARPS=16 -DNUM_THREADS=16    +0+0
diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile
index db8366795..a2c479784 100644
--- a/tests/opencl/Makefile
+++ b/tests/opencl/Makefile
@@ -22,6 +22,10 @@ all:
 	$(MAKE) -C sgemm2
 	$(MAKE) -C sgemm3
 	$(MAKE) -C psum
+	$(MAKE) -C kernel1
+	$(MAKE) -C kernel2
+	$(MAKE) -C kernel3
+	$(MAKE) -C kernel4
 
 run-simx:
 	$(MAKE) -C vecadd run-simx
@@ -43,6 +47,10 @@ run-simx:
 	$(MAKE) -C sgemm2 run-simx
 	$(MAKE) -C sgemm3 run-simx
 	$(MAKE) -C psum run-simx
+	$(MAKE) -C kernel1 run-simx
+	$(MAKE) -C kernel2 run-simx
+	$(MAKE) -C kernel3 run-simx
+	$(MAKE) -C kernel4 run-simx
 
 run-rtlsim:
 	$(MAKE) -C vecadd run-rtlsim
@@ -64,6 +72,10 @@ run-rtlsim:
 	$(MAKE) -C sgemm2 run-rtlsim
 	$(MAKE) -C sgemm3 run-rtlsim
 	$(MAKE) -C psum run-rtlsim
+	$(MAKE) -C kernel1 run-rtlsimy
+	$(MAKE) -C kernel2 run-rtlsimy
+	$(MAKE) -C kernel3 run-rtlsimy
+	$(MAKE) -C kernel4 run-rtlsimy
 
 run-opae:
 	$(MAKE) -C vecadd run-opae
@@ -85,6 +97,10 @@ run-opae:
 	$(MAKE) -C sgemm2 run-opae
 	$(MAKE) -C sgemm3 run-opae
 	$(MAKE) -C psum run-opae
+	$(MAKE) -C kernel1 run-opae
+	$(MAKE) -C kernel2 run-opae
+	$(MAKE) -C kernel3 run-opae
+	$(MAKE) -C kernel4 run-opae
 
 clean:
 	$(MAKE) -C vecadd clean
@@ -106,4 +122,8 @@ clean:
 	$(MAKE) -C bfs clean
 	$(MAKE) -C sgemm2 clean
 	$(MAKE) -C sgemm3 clean
-	$(MAKE) -C psum clean
\ No newline at end of file
+	$(MAKE) -C psum clean
+	$(MAKE) -C kernel1 clean
+	$(MAKE) -C kernel2 clean
+	$(MAKE) -C kernel3 clean
+	$(MAKE) -C kernel4 clean
diff --git a/tests/opencl/j_stat/statistic.ipynb b/tests/opencl/j_stat/statistic.ipynb
new file mode 100644
index 000000000..af5afdc76
--- /dev/null
+++ b/tests/opencl/j_stat/statistic.ipynb
@@ -0,0 +1,268 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import subprocess\n",
+    "import matplotlib.pyplot as plt\n",
+    "from dataclasses import dataclass, field\n",
+    "import pandas as pd\n",
+    "from tqdm import tqdm\n",
+    "from pathlib import Path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# architecture parameters\n",
+    "@dataclass\n",
+    "class arch:\n",
+    "    warps: int = 1\n",
+    "    cores: int = 1\n",
+    "    threads: int = 1\n",
+    "# running parameters \n",
+    "@dataclass\n",
+    "class run:\n",
+    "    arch: arch\n",
+    "    kernel: str\n",
+    "    driver: str = \"simx\"\n",
+    "    args: dict = field(default_factory=dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "path_to_vortex = Path.cwd().parent.parent.parent\n",
+    "tile_size = 'TS'\n",
+    "work_per_thread = 'WPT'\n",
+    "width = 'WIDTH'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def error_running (run_params: run, error_text: str) -> str:\n",
+    "    return f\"error in running in {run_params.kernel} : warps={run_params.arch.warps} cores={run_params.arch.cores} threads={run_params.arch.threads}\" \\\n",
+    "         f\" driver={run_params.driver} args=-N{run_params.args['N']} -M{run_params.args['M']} -K{run_params.args['K']} error message - {error_text}/n\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def error_verification (run_params: run, number_of_errors: str) -> str:\n",
+    "    return f\"error in verifing results {run_params.kernel} : warps={run_params.arch.warps} cores={run_params.arch.cores} threads={run_params.arch.threads}\" \\\n",
+    "        f\" driver={run_params.driver} args=-N{run_params.args['N']} -M{run_params.args['M']} -K{run_params.args['K']} Number of errors : {number_of_errors}'\\n'\"   "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_common_h (params: dict, kernel_name: str):\n",
+    "    file_name = f\"{path_to_vortex}/tests/opencl/{kernel_name}/common.h\"\n",
+    "    with open(file_name, 'w') as file:\n",
+    "        file.write(\"#ifndef COMMON_H\\n\" + \"#define COMMON_H\\n\" + \"\\n\")\n",
+    "        if tile_size in params:\n",
+    "            file.write(f\"#define TS {params[tile_size]}\\n\")\n",
+    "        if work_per_thread in params:\n",
+    "            file.write(f\"#define WPT {params[work_per_thread]}\\n\")\n",
+    "            file.write(\"#define RTS (TS/WPT)\\n\")\n",
+    "        if width in params:\n",
+    "            file.write(f\"#define WIDTH {params[width]}\\n\")\n",
+    "        file.write('\\n' +  \"#endif // COMMON_H\")\n",
+    "    # open main.cc file to recompile before run with new common.h\n",
+    "    Path(f\"{path_to_vortex}/tests/opencl/{kernel_name}/main.cc\").touch(exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def perf (run_params: run, path_to_output_file: str) -> pd.DataFrame:\n",
+    "    # run kernel\n",
+    "    vortex = f\"--warps={run_params.arch.warps} --cores={run_params.arch.cores} --threads={run_params.arch.threads}\"\n",
+    "    run_args = f\"-N{run_params.args['N']} -M{run_params.args['M']} -K{run_params.args['K']}\"\n",
+    "    command = f\"cd {path_to_vortex}/build && ./ci/blackbox.sh {vortex} --driver={run_params.driver} --app={run_params.kernel} --args=\\\"{run_args}\\\"\"\n",
+    "    print(command)\n",
+    "    result = subprocess.run(f\"{command} > {path_to_output_file}\", shell=True)\n",
+    "\n",
+    "    # collect statistic \n",
+    "    with open(path_to_output_file, 'r') as file:\n",
+    "        lines = file.readlines()\n",
+    "    error_message = \"\"\n",
+    "    general_perf_stat = \"\"\n",
+    "    for line in lines:\n",
+    "        if \"PERF:\" in line:\n",
+    "            general_perf_stat = line\n",
+    "        # check for errors\n",
+    "        if result != 0:\n",
+    "            if \"FAILED\" in line: \n",
+    "                error_message = error_verification(run_params, line[line.find(\"FAILED! - \"):])\n",
+    "            if \"Error\" in line:\n",
+    "                error_message = error_running(run_params, line[line.find(\"Error:\"):])\n",
+    "    # pars string with general perf statistic of running kernel\n",
+    "    pairs = general_perf_stat.replace(\"PERF: \", \"\").split(\", \")\n",
+    "    perf_dict = {key_value.split(\"=\")[0]: float(key_value.split(\"=\")[1]) for key_value in pairs}\n",
+    "    if perf_dict[\"cycles\"] <= 0:\n",
+    "        error_message = error_running(run_params, \"Invalid number of cycles\")\n",
+    "    # write result to data frame\n",
+    "    run_result = pd.DataFrame([{\"kernel\": run_params.kernel[-1], \"driver\": run_params.driver, \"cores\": run_params.arch.cores, \n",
+    "                \"warps\": run_params.arch.warps, \"threads\": run_params.arch.threads, \"M\": run_params.args[\"M\"], \n",
+    "                \"N\": run_params.args[\"N\"], \"K\": run_params.args[\"K\"], \"instrs\": perf_dict[\"instrs\"], \"cycles\": perf_dict[\"cycles\"],\n",
+    "                \"IPC\": perf_dict[\"IPC\"], \"error\": error_message}])\n",
+    "    return run_result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def draw (data_frame: pd.DataFrame, x_label: str, y_label: str, title: str, path: str):\n",
+    "    data_frame.plot(kind = \"bar\", x = x_label, y = y_label)\n",
+    "    plt.title(title)\n",
+    "    plt.xlabel(x_label)\n",
+    "    plt.ylabel(y_label)\n",
+    "    plt.savefig(path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  0%|          | 0/4 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "cd /home/jblab/ivan_khromov/release/vortex/build && ./ci/blackbox.sh --warps=2 --cores=1 --threads=8 --driver=simx --app=kernel1 --args=\"-N16 -M16 -K16\"\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "In file included from /home/jblab/tools/pocl/include/CL/cl.h:20,\n",
+      "                 from /home/jblab/tools/pocl/include/CL/opencl.h:24,\n",
+      "                 from /home/jblab/ivan_khromov/release/vortex/tests/opencl/kernel1/main.cc:1:\n",
+      "/home/jblab/tools/pocl/include/CL/cl_version.h:22:104: note: #pragma message: cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 300 (OpenCL 3.0)\n",
+      "   22 | #pragma message(\"cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 300 (OpenCL 3.0)\")\n",
+      "      |                                                                                                        ^\n",
+      "/home/jblab/ivan_khromov/release/vortex/tests/opencl/kernel1/main.cc: In function ‘int main(int, char**)’:\n",
+      "/home/jblab/ivan_khromov/release/vortex/tests/opencl/kernel1/main.cc:161:25: warning: unused variable ‘context_properties’ [-Wunused-variable]\n",
+      "  161 |   cl_context_properties context_properties[]{\n",
+      "      |                         ^~~~~~~~~~~~~~~~~~\n",
+      "/home/jblab/ivan_khromov/release/vortex/tests/opencl/kernel1/main.cc:163:16: warning: unused variable ‘devices’ [-Wunused-variable]\n",
+      "  163 |   cl_device_id devices[]{device_id};\n",
+      "      |                ^~~~~~~\n",
+      "  0%|          | 0/4 [00:23<?, ?it/s]\n"
+     ]
+    },
+    {
+     "ename": "IndexError",
+     "evalue": "list index out of range",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mIndexError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[45], line 37\u001b[0m\n\u001b[1;32m     35\u001b[0m output_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath_to_vortex\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/tests/opencl/j_stat/output.txt\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m     36\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m params \u001b[38;5;129;01min\u001b[39;00m tqdm(run_p):\n\u001b[0;32m---> 37\u001b[0m     data_frames\u001b[38;5;241m.\u001b[39mappend(\u001b[43mperf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_file\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m     38\u001b[0m data_frame \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mconcat(data_frames, ignore_index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m     40\u001b[0m \u001b[38;5;66;03m# draw graph based on the recived statistic\u001b[39;00m\n",
+      "Cell \u001b[0;32mIn[43], line 26\u001b[0m, in \u001b[0;36mperf\u001b[0;34m(run_params, path_to_output_file)\u001b[0m\n\u001b[1;32m     24\u001b[0m \u001b[38;5;66;03m# pars string with general perf statistic of running kernel\u001b[39;00m\n\u001b[1;32m     25\u001b[0m pairs \u001b[38;5;241m=\u001b[39m general_perf_stat\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPERF: \u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 26\u001b[0m perf_dict \u001b[38;5;241m=\u001b[39m \u001b[43m{\u001b[49m\u001b[43mkey_value\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mfloat\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mkey_value\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mkey_value\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpairs\u001b[49m\u001b[43m}\u001b[49m\n\u001b[1;32m     27\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m perf_dict[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcycles\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m     28\u001b[0m     error_message \u001b[38;5;241m=\u001b[39m error_running(run_params, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid number of cycles\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "Cell \u001b[0;32mIn[43], line 26\u001b[0m, in \u001b[0;36m<dictcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m     24\u001b[0m \u001b[38;5;66;03m# pars string with general perf statistic of running kernel\u001b[39;00m\n\u001b[1;32m     25\u001b[0m pairs \u001b[38;5;241m=\u001b[39m general_perf_stat\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPERF: \u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 26\u001b[0m perf_dict \u001b[38;5;241m=\u001b[39m {key_value\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m=\u001b[39m\u001b[38;5;124m\"\u001b[39m)[\u001b[38;5;241m0\u001b[39m]: \u001b[38;5;28mfloat\u001b[39m(\u001b[43mkey_value\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m) \u001b[38;5;28;01mfor\u001b[39;00m key_value \u001b[38;5;129;01min\u001b[39;00m pairs}\n\u001b[1;32m     27\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m perf_dict[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcycles\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m     28\u001b[0m     error_message \u001b[38;5;241m=\u001b[39m error_running(run_params, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid number of cycles\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "\u001b[0;31mIndexError\u001b[0m: list index out of range"
+     ]
+    }
+   ],
+   "source": [
+    "# create common.h files for each kernel\n",
+    "params1 = {\n",
+    "    tile_size: 4\n",
+    "}\n",
+    "create_common_h(params1, \"kernel1\")\n",
+    "create_common_h(params1, \"kernel2\")\n",
+    "\n",
+    "params3 = {\n",
+    "    tile_size: 4,\n",
+    "    work_per_thread: 4\n",
+    "}\n",
+    "create_common_h(params3, \"kernel3\")\n",
+    "\n",
+    "params4 = {\n",
+    "    tile_size: 8,\n",
+    "    width: 4\n",
+    "}\n",
+    "create_common_h(params4, \"kernel4\")\n",
+    "\n",
+    "# fill running params data class for each kernel\n",
+    "run_p = []\n",
+    "arg = {\n",
+    "    \"M\": 16,\n",
+    "    \"N\": 16,\n",
+    "    \"K\": 16\n",
+    "}\n",
+    "arch_p = arch(threads=8, cores=1, warps=2)\n",
+    "run_p.append(run(arch_p, kernel=\"kernel1\", driver=\"simx\", args=arg))\n",
+    "run_p.append(run(arch_p, kernel=\"kernel2\", driver=\"simx\", args=arg))\n",
+    "run_p.append(run(arch_p, kernel=\"kernel3\", driver=\"simx\", args=arg))\n",
+    "run_p.append(run(arch_p, kernel=\"kernel4\", driver=\"simx\", args=arg))\n",
+    "\n",
+    "# run all kernels and collect statistic in data frame\n",
+    "data_frames = []\n",
+    "output_file = f\"{path_to_vortex}/tests/opencl/j_stat/output.txt\"\n",
+    "for params in tqdm(run_p):\n",
+    "    data_frames.append(perf(params, output_file))\n",
+    "data_frame = pd.concat(data_frames, ignore_index=True)\n",
+    "\n",
+    "# draw graph based on the recived statistic\n",
+    "draw(data_frame, \"kernel\", \"cycles\", \"number of cycles per kernel\", f\"graphics/graph.png\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tests/opencl/kernel1/Makefile b/tests/opencl/kernel1/Makefile
new file mode 100644
index 000000000..37ada9b31
--- /dev/null
+++ b/tests/opencl/kernel1/Makefile
@@ -0,0 +1,17 @@
+ROOT_DIR := $(realpath ../../..)
+include $(ROOT_DIR)/config.mk
+
+PROJECT := kernel1
+
+SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
+
+SRCS := $(SRC_DIR)/main.cc
+
+kernel.cl: $(SRC_DIR)/kernel.cl
+	cp $< $@
+
+KERNEL_SRCS := kernel.cl
+
+OPTS ?=
+
+include ../common.mk
diff --git a/tests/opencl/kernel1/common.h b/tests/opencl/kernel1/common.h
new file mode 100644
index 000000000..e51835d3c
--- /dev/null
+++ b/tests/opencl/kernel1/common.h
@@ -0,0 +1,6 @@
+#ifndef COMMON_H
+#define COMMON_H
+
+#define TS 4
+
+#endif // COMMON_H
\ No newline at end of file
diff --git a/tests/opencl/kernel1/kernel.cl b/tests/opencl/kernel1/kernel.cl
new file mode 100644
index 000000000..e5ebd4958
--- /dev/null
+++ b/tests/opencl/kernel1/kernel.cl
@@ -0,0 +1,18 @@
+__kernel void myGEMM1(const int M, const int N, const int K,
+                      const __global float* A,
+                      const __global float* B,
+                      __global float* C) {
+    // Thread identifiers
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    // Compute a single element (loop over K)
+    float acc = 0.0f;
+    for (int k=0; k<K; k++) {
+        acc += A[k*M + globalRow] * B[globalCol*K + k];
+    }
+
+    // Store the result
+    C[globalCol*M + globalRow] = acc;
+}
+
diff --git a/tests/opencl/kernel1/main.cc b/tests/opencl/kernel1/main.cc
new file mode 100644
index 000000000..2fa9da0ed
--- /dev/null
+++ b/tests/opencl/kernel1/main.cc
@@ -0,0 +1,283 @@
+#include <CL/opencl.h>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "common.h"
+
+int M = 0, N = 0, K = 0;
+
+static void show_usage() {
+  printf("Usage: [-M number of rows in first matrix] [-N number of columns in first matrix] [-K number of columns in first matrix and rows in second matrix] [-h: help]\n");
+}
+
+#define CL_CHECK(_expr)                                                        \
+  do {                                                                         \
+    cl_int _err = _expr;                                                       \
+    if (_err == CL_SUCCESS)                                                    \
+      break;                                                                   \
+    printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);            \
+    cleanup();                                                                 \
+    exit(-1);                                                                  \
+  } while (0)
+
+#define CL_CHECK2(_expr)                                                       \
+  ({                                                                           \
+    cl_int _err = CL_INVALID_VALUE;                                            \
+    decltype(_expr) _ret = _expr;                                              \
+    if (_err != CL_SUCCESS) {                                                  \
+      printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);          \
+      cleanup();                                                               \
+      exit(-1);                                                                \
+    }                                                                          \
+    _ret;                                                                      \
+  })
+
+static int read_kernel_file(const char *filename, uint8_t **data,
+                            size_t *size) {
+  if (NULL == filename || NULL == data || 0 == size)
+    return -1;
+
+  FILE *fp = fopen(filename, "r");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.");
+    return -1;
+  }
+
+  fseek(fp, 0, SEEK_END);
+  long fsize = ftell(fp);
+  rewind(fp);
+
+  *data = (uint8_t *)malloc(fsize);
+  *size = fread(*data, 1, fsize, fp);
+
+  fclose(fp);
+
+  return 0;
+}
+
+static void sgemm_cpu(float *C, const float *A, const float *B, int M, int N,
+                      int K) {
+  for (int m = 0; m < M; ++m) {
+    for (int n = 0; n < N; ++n) {
+      float acc = 0;
+      for (int k = 0; k < K; ++k) {
+        acc += A[k * M + m] * B[n * K + k];
+      }
+      C[n * M + m] = acc;
+    }
+  }
+}
+
+cl_platform_id platform_id = NULL;
+cl_device_id device_id = NULL;
+cl_context context = NULL;
+cl_command_queue command_queue = NULL;
+cl_program program = NULL;
+cl_kernel kernel = NULL;
+cl_mem a_memobj = NULL;
+cl_mem b_memobj = NULL;
+cl_mem c_memobj = NULL;
+uint8_t *kernel_bin = NULL;
+
+static void cleanup() {
+  if (command_queue)
+    clReleaseCommandQueue(command_queue);
+  if (kernel)
+    clReleaseKernel(kernel);
+  if (program)
+    clReleaseProgram(program);
+  if (a_memobj)
+    clReleaseMemObject(a_memobj);
+  if (b_memobj)
+    clReleaseMemObject(b_memobj);
+  if (c_memobj)
+    clReleaseMemObject(c_memobj);
+  if (context)
+    clReleaseContext(context);
+  if (device_id)
+    clReleaseDevice(device_id);
+  if (kernel_bin)
+    free(kernel_bin);
+}
+
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:M:N:K:h?")) != -1) {
+    switch (c) {
+    case 'n':
+      M = N = K = atoi(optarg);
+      break;
+    case 'M':
+      M = atoi(optarg);
+      break;
+    case 'N':
+      N = atoi(optarg);
+      break;
+    case 'K':
+      K = atoi(optarg);
+      break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+
+  if (M < 2 || N < 2 || K < 2 || TS < 2) {
+    printf("Error: invalid size!\n");
+    exit(-1);
+  }
+}
+
+int main(int argc, char **argv) {
+  // parse command arguments
+  parse_args(argc, argv);
+
+  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
+  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
+
+  // create context
+  cl_context_properties context_properties[]{
+      CL_CONTEXT_PLATFORM, cl_context_properties(platform_id), 0};
+  cl_device_id devices[]{device_id};
+  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err));
+
+  char device_string[1024];
+  clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
+  printf("Using device: %s\n", device_string);
+
+  // create command queue
+  command_queue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));
+
+  // generate input data
+  float *A, *B, *C;
+  A = (float *)(malloc(M * K * sizeof(float)));
+  B = (float *)(malloc(N * K * sizeof(float)));
+  C = (float *)(malloc(M * N * sizeof(float)));
+  if (A == NULL || B == NULL || C == NULL) {
+    printf("Not enough memory");
+    cleanup();
+    return -1;
+  }
+  srand(time(NULL));
+  for (int i = 0; i < M * K; i++)
+    A[i] = (int)((float)rand() / (float)RAND_MAX);
+  for (int i = 0; i < N * K; i++)
+    B[i] = (int)((float)rand() / (float)RAND_MAX);
+
+  // create buffers
+  a_memobj =
+      CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                     M * K * sizeof(float), A, &_err));
+  b_memobj =
+      CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                     N * K * sizeof(float), B, &_err));
+  c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY,
+                                   M * N * sizeof(float), NULL, &_err));
+
+  // load kernel text
+  size_t kernel_size;
+  if (read_kernel_file("kernel.cl", &kernel_bin, &kernel_size) != 0) {
+    cleanup();
+    return -1;
+  }
+  program = CL_CHECK2(clCreateProgramWithSource(context, 1, (const char **)&kernel_bin,
+                                      &kernel_size, &_err));
+  if (program == NULL) {
+    cleanup();
+    return -1;
+  }
+
+  // build program
+  cl_int build_status = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
+
+  // create kernel
+  kernel = CL_CHECK2(clCreateKernel(program, "myGEMM1", &_err));
+
+  // check building info
+  size_t log_size = 0;
+  CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0,
+                                 NULL, &log_size));
+  char *log = (char *)malloc(log_size * sizeof(char));
+  if (log == NULL) {
+    printf("Not enough memory");
+    cleanup();
+    return -1;
+  }
+  CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
+                                 log_size, log, NULL));
+  if (log_size > 1) {
+    printf("Log:\n");
+    printf("%s", log);
+    printf("\n");
+  }
+  CL_CHECK(build_status);
+
+
+  // set kernel arguments
+  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &M));
+  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &N));
+  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &K));
+  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&a_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&b_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&c_memobj));
+
+  // run kernel
+  const size_t local[2] = {TS, TS};
+  const size_t global[2] = {M, N};
+  printf("Execute the kernel\n");
+  auto time_start = std::chrono::high_resolution_clock::now();
+  CL_CHECK(clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global, local,
+                                  0, NULL, NULL));
+  CL_CHECK(clFinish(command_queue));
+  //CL_CHECK(clWaitForEvents(1, &event));
+  auto time_end = std::chrono::high_resolution_clock::now();
+  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
+                       time_end - time_start)
+                       .count();
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  // get results from VRAM
+  CL_CHECK(clEnqueueReadBuffer(command_queue, c_memobj, CL_TRUE, 0,
+                               M * N * sizeof(float), C, 0, NULL, NULL));
+  CL_CHECK(clFinish(command_queue));
+
+  // verify results
+  printf("Verify result\n");
+  float *C_cpu = (float *)malloc(M * N * sizeof(float));
+  if (C_cpu == NULL) {
+    printf("Not enough memory");
+    cleanup();
+    return -1;
+  }
+  sgemm_cpu(C_cpu, A, B, M, N, K);
+  int errors = 0;
+
+  for (size_t i = 0; i < size_t(M * N); i++)
+    if (C_cpu[i] != C[i])
+      errors++;
+  if (errors != 0)
+    printf("FAILED! - %d errors\n", errors);
+  else
+    printf("PASSED!\n");
+
+  // free resureses
+  cleanup();
+  free(A);
+  free(B);
+  free(C);
+  free(log);
+  free(C_cpu);
+  return errors;
+}
diff --git a/tests/opencl/kernel2/Makefile b/tests/opencl/kernel2/Makefile
new file mode 100644
index 000000000..fa9b7f379
--- /dev/null
+++ b/tests/opencl/kernel2/Makefile
@@ -0,0 +1,20 @@
+ROOT_DIR := $(realpath ../../..)
+include $(ROOT_DIR)/config.mk
+
+PROJECT := kernel2
+
+SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
+
+SRCS := $(SRC_DIR)/main.cc
+
+kernel.cl: $(SRC_DIR)/kernel.cl
+	cp $< $@
+
+common.h: $(SRC_DIR)/common.h
+	cp $< $@
+
+KERNEL_SRCS := kernel.cl common.h
+
+OPTS ?=
+
+include ../common.mk
diff --git a/tests/opencl/kernel2/common.h b/tests/opencl/kernel2/common.h
new file mode 100644
index 000000000..e51835d3c
--- /dev/null
+++ b/tests/opencl/kernel2/common.h
@@ -0,0 +1,6 @@
+#ifndef COMMON_H
+#define COMMON_H
+
+#define TS 4
+
+#endif // COMMON_H
\ No newline at end of file
diff --git a/tests/opencl/kernel2/kernel.cl b/tests/opencl/kernel2/kernel.cl
new file mode 100644
index 000000000..692318cde
--- /dev/null
+++ b/tests/opencl/kernel2/kernel.cl
@@ -0,0 +1,46 @@
+#include "common.h"
+
+__kernel void myGEMM2(const int M, const int N, const int K,
+                      const __global float* A,
+                      const __global float* B,
+                      __global float* C) {
+    
+    // Thread identifiers
+    const int row = get_local_id(0); // Local row ID (max: TS)
+    const int col = get_local_id(1); // Local col ID (max: TS)
+    const int globalRow = TS*get_group_id(0) + row; // Row ID of C (0..M)
+    const int globalCol = TS*get_group_id(1) + col; // Col ID of C (0..N)
+
+    // Local memory to fit a tile of TS*TS elements of A and B
+    __local float Asub[TS][TS];
+    __local float Bsub[TS][TS];
+
+    // Initialise the accumulation register
+    float acc = 0.0f;
+    
+    // Loop over all tiles
+    const int numTiles = K/TS;
+    for (int t=0; t<numTiles; t++) {
+
+        // Load one tile of A and B into local memory
+        const int tiledRow = TS*t + row;
+        const int tiledCol = TS*t + col;
+        Asub[col][row] = A[tiledCol*M + globalRow];
+        Bsub[col][row] = B[globalCol*K + tiledRow];
+
+        // Synchronise to make sure the tile is loaded
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // Perform the computation for a single tile
+        for (int k=0; k<TS; k++) {
+            acc += Asub[k][row] * Bsub[col][k];
+        }
+
+        // Synchronise before loading the next tile
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // Store the final result in C
+    C[globalCol*M + globalRow] = acc;
+}
+
diff --git a/tests/opencl/kernel2/main.cc b/tests/opencl/kernel2/main.cc
new file mode 100644
index 000000000..933420964
--- /dev/null
+++ b/tests/opencl/kernel2/main.cc
@@ -0,0 +1,283 @@
+#include <CL/opencl.h>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "common.h"
+
+int M = 256, N = 256, K = 256;
+
+static void show_usage() {
+  printf("Usage: [-M number of rows in first matrix] [-N number of columns in first matrix] [-K number of columns in first matrix and rows in second matrix] [-h: help]\n");
+}
+
+#define CL_CHECK(_expr)                                                        \
+  do {                                                                         \
+    cl_int _err = _expr;                                                       \
+    if (_err == CL_SUCCESS)                                                    \
+      break;                                                                   \
+    printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);            \
+    cleanup();                                                                 \
+    exit(-1);                                                                  \
+  } while (0)
+
+#define CL_CHECK2(_expr)                                                       \
+  ({                                                                           \
+    cl_int _err = CL_INVALID_VALUE;                                            \
+    decltype(_expr) _ret = _expr;                                              \
+    if (_err != CL_SUCCESS) {                                                  \
+      printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);          \
+      cleanup();                                                               \
+      exit(-1);                                                                \
+    }                                                                          \
+    _ret;                                                                      \
+  })
+
+static int read_kernel_file(const char *filename, uint8_t **data,
+                            size_t *size) {
+  if (NULL == filename || NULL == data || 0 == size)
+    return -1;
+
+  FILE *fp = fopen(filename, "r");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.");
+    return -1;
+  }
+
+  fseek(fp, 0, SEEK_END);
+  long fsize = ftell(fp);
+  rewind(fp);
+
+  *data = (uint8_t *)malloc(fsize);
+  *size = fread(*data, 1, fsize, fp);
+
+  fclose(fp);
+
+  return 0;
+}
+
+static void sgemm_cpu(float *C, const float *A, const float *B, int M, int N,
+                      int K) {
+  for (int m = 0; m < M; ++m) {
+    for (int n = 0; n < N; ++n) {
+      float acc = 0;
+      for (int k = 0; k < K; ++k) {
+        acc += A[k * M + m] * B[n * K + k];
+      }
+      C[n * M + m] = acc;
+    }
+  }
+}
+
+cl_platform_id platform_id = NULL;
+cl_device_id device_id = NULL;
+cl_context context = NULL;
+cl_command_queue command_queue = NULL;
+cl_program program = NULL;
+cl_kernel kernel = NULL;
+cl_mem a_memobj = NULL;
+cl_mem b_memobj = NULL;
+cl_mem c_memobj = NULL;
+uint8_t *kernel_bin = NULL;
+
+static void cleanup() {
+  if (command_queue)
+    clReleaseCommandQueue(command_queue);
+  if (kernel)
+    clReleaseKernel(kernel);
+  if (program)
+    clReleaseProgram(program);
+  if (a_memobj)
+    clReleaseMemObject(a_memobj);
+  if (b_memobj)
+    clReleaseMemObject(b_memobj);
+  if (c_memobj)
+    clReleaseMemObject(c_memobj);
+  if (context)
+    clReleaseContext(context);
+  if (device_id)
+    clReleaseDevice(device_id);
+  if (kernel_bin)
+    free(kernel_bin);
+}
+
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:M:N:K:h?")) != -1) {
+    switch (c) {
+    case 'n':
+      M = N = K = atoi(optarg);
+      break;
+    case 'M':
+      M = atoi(optarg);
+      break;
+    case 'N':
+      N = atoi(optarg);
+      break;
+    case 'K':
+      K = atoi(optarg);
+      break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+
+  if (M < 2 || N < 2 || K < 2 || TS < 2) {
+    printf("Error: invalid size!\n");
+    exit(-1);
+  }
+}
+
+int main(int argc, char **argv) {
+  // parse command arguments
+  parse_args(argc, argv);
+
+  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
+  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
+
+  // create context
+  cl_context_properties context_properties[]{
+      CL_CONTEXT_PLATFORM, cl_context_properties(platform_id), 0};
+  cl_device_id devices[]{device_id};
+  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err));
+
+  char device_string[1024];
+  clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
+  printf("Using device: %s\n", device_string);
+
+  // create command queue
+  command_queue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));
+
+  // generate input data
+  float *A, *B, *C;
+  A = (float *)(malloc(M * K * sizeof(float)));
+  B = (float *)(malloc(N * K * sizeof(float)));
+  C = (float *)(malloc(M * N * sizeof(float)));
+  if (A == NULL || B == NULL || C == NULL) {
+    printf("Not enough memory");
+    cleanup();
+    return -1;
+  }
+  srand(time(NULL));
+  for (int i = 0; i < M * K; i++)
+    A[i] = (int)((float)rand() / (float)RAND_MAX);
+  for (int i = 0; i < N * K; i++)
+    B[i] = (int)((float)rand() / (float)RAND_MAX);
+
+  // create buffers
+  a_memobj =
+      CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                     M * K * sizeof(float), A, &_err));
+  b_memobj =
+      CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                     N * K * sizeof(float), B, &_err));
+  c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY,
+                                   M * N * sizeof(float), NULL, &_err));
+
+  // load kernel text
+  size_t kernel_size;
+  if (read_kernel_file("kernel.cl", &kernel_bin, &kernel_size) != 0) {
+    cleanup();
+    return -1;
+  }
+  program = CL_CHECK2(clCreateProgramWithSource(context, 1, (const char **)&kernel_bin,
+                                      &kernel_size, &_err));
+  if (program == NULL) {
+    cleanup();
+    return -1;
+  }
+
+  // build program
+  cl_int build_status = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
+
+  // create kernel
+  kernel = CL_CHECK2(clCreateKernel(program, "myGEMM2", &_err));
+
+  // check building info
+  size_t log_size = 0;
+  CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0,
+                                 NULL, &log_size));
+  char *log = (char *)malloc(log_size * sizeof(char));
+  if (log == NULL) {
+    printf("Not enough memory");
+    cleanup();
+    return -1;
+  }
+  CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
+                                 log_size, log, NULL));
+  if (log_size > 1) {
+    printf("Log:\n");
+    printf("%s", log);
+    printf("\n");
+  }
+  CL_CHECK(build_status);
+
+
+  // set kernel arguments
+  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &M));
+  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &N));
+  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &K));
+  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&a_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&b_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&c_memobj));
+
+  // run kernel
+  const size_t local[2] = {TS, TS};
+  const size_t global[2] = {M, N};
+  printf("Execute the kernel\n");
+  auto time_start = std::chrono::high_resolution_clock::now();
+  CL_CHECK(clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global, local,
+                                  0, NULL, NULL));
+  CL_CHECK(clFinish(command_queue));
+  //CL_CHECK(clWaitForEvents(1, &event));
+  auto time_end = std::chrono::high_resolution_clock::now();
+  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
+                       time_end - time_start)
+                       .count();
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  // get results from VRAM
+  CL_CHECK(clEnqueueReadBuffer(command_queue, c_memobj, CL_TRUE, 0,
+                               M * N * sizeof(float), C, 0, NULL, NULL));
+  CL_CHECK(clFinish(command_queue));
+
+  // verify results
+  printf("Verify result\n");
+  float *C_cpu = (float *)malloc(M * N * sizeof(float));
+  if (C_cpu == NULL) {
+    printf("Not enough memory");
+    cleanup();
+    return -1;
+  }
+  sgemm_cpu(C_cpu, A, B, M, N, K);
+  int errors = 0;
+
+  for (size_t i = 0; i < size_t(M * N); i++)
+    if (C_cpu[i] != C[i])
+      errors++;
+  if (errors != 0)
+    printf("FAILED! - %d errors\n", errors);
+  else
+    printf("PASSED!\n");
+
+  // free resureses
+  cleanup();
+  free(A);
+  free(B);
+  free(C);
+  free(log);
+  free(C_cpu);
+  return errors;
+}
\ No newline at end of file
diff --git a/tests/opencl/kernel3/Makefile b/tests/opencl/kernel3/Makefile
new file mode 100644
index 000000000..b7e0c3133
--- /dev/null
+++ b/tests/opencl/kernel3/Makefile
@@ -0,0 +1,20 @@
+ROOT_DIR := $(realpath ../../..)
+include $(ROOT_DIR)/config.mk
+
+PROJECT := kernel3
+
+SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
+
+SRCS := $(SRC_DIR)/main.cc
+
+kernel.cl: $(SRC_DIR)/kernel.cl
+	cp $< $@
+
+common.h: $(SRC_DIR)/common.h
+	cp $< $@
+
+KERNEL_SRCS := kernel.cl common.h
+
+OPTS ?=
+
+include ../common.mk
diff --git a/tests/opencl/kernel3/common.h b/tests/opencl/kernel3/common.h
new file mode 100644
index 000000000..c922a82eb
--- /dev/null
+++ b/tests/opencl/kernel3/common.h
@@ -0,0 +1,8 @@
+#ifndef COMMON_H
+#define COMMON_H
+
+#define TS 4
+#define WPT 4
+#define RTS (TS/WPT)
+
+#endif // COMMON_H
\ No newline at end of file
diff --git a/tests/opencl/kernel3/kernel.cl b/tests/opencl/kernel3/kernel.cl
new file mode 100644
index 000000000..fbc30034b
--- /dev/null
+++ b/tests/opencl/kernel3/kernel.cl
@@ -0,0 +1,55 @@
+#include "common.h"
+
+__kernel void myGEMM3(const int M, const int N, const int K,
+                      const __global float* A,
+                      const __global float* B,
+                      __global float* C) {
+    
+    // Thread identifiers
+    const int row = get_local_id(0); // Local row ID (max: TS)
+    const int col = get_local_id(1); // Local col ID (max: TS/WPT == RTS)
+    const int globalRow = TS*get_group_id(0) + row; // Row ID of C (0..M)
+    const int globalCol = TS*get_group_id(1) + col; // Col ID of C (0..N)
+
+    // Local memory to fit a tile of TS*TS elements of A and B
+    __local float Asub[TS][TS];
+    __local float Bsub[TS][TS];
+
+    // Initialise the accumulation registers
+    float acc[WPT];
+    for (int w=0; w<WPT; w++) {
+        acc[w] = 0.0f;
+    }
+    
+    // Loop over all tiles
+    const int numTiles = K/TS;
+    for (int t=0; t<numTiles; t++) {
+
+        // Load one tile of A and B into local memory
+        for (int w=0; w<WPT; w++) {
+            const int tiledRow = TS*t + row;
+            const int tiledCol = TS*t + col;
+            Asub[col + w*RTS][row] = A[(tiledCol + w*RTS)*M + globalRow];
+            Bsub[col + w*RTS][row] = B[(globalCol + w*RTS)*K + tiledRow];
+        }
+
+        // Synchronise to make sure the tile is loaded
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // Perform the computation for a single tile
+        for (int k=0; k<TS; k++) {
+            for (int w=0; w<WPT; w++) {
+                acc[w] += Asub[k][row] * Bsub[col + w*RTS][k];
+            }
+        }
+
+        // Synchronise before loading the next tile
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // Store the final results in C
+    for (int w=0; w<WPT; w++) {
+        C[(globalCol + w*RTS)*M + globalRow] = acc[w];
+    }
+}
+
diff --git a/tests/opencl/kernel3/main.cc b/tests/opencl/kernel3/main.cc
new file mode 100644
index 000000000..8097aaf12
--- /dev/null
+++ b/tests/opencl/kernel3/main.cc
@@ -0,0 +1,283 @@
+#include <CL/opencl.h>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "common.h"
+
+int M = 256, N = 256, K = 256;
+
+static void show_usage() {
+  printf("Usage: [-M number of rows in first matrix] [-N number of columns in first matrix] [-K number of columns in first matrix and rows in second matrix] [-h: help]\n");
+}
+
+#define CL_CHECK(_expr)                                                        \
+  do {                                                                         \
+    cl_int _err = _expr;                                                       \
+    if (_err == CL_SUCCESS)                                                    \
+      break;                                                                   \
+    printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);            \
+    cleanup();                                                                 \
+    exit(-1);                                                                  \
+  } while (0)
+
+#define CL_CHECK2(_expr)                                                       \
+  ({                                                                           \
+    cl_int _err = CL_INVALID_VALUE;                                            \
+    decltype(_expr) _ret = _expr;                                              \
+    if (_err != CL_SUCCESS) {                                                  \
+      printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);          \
+      cleanup();                                                               \
+      exit(-1);                                                                \
+    }                                                                          \
+    _ret;                                                                      \
+  })
+
+static int read_kernel_file(const char *filename, uint8_t **data,
+                            size_t *size) {
+  if (NULL == filename || NULL == data || 0 == size)
+    return -1;
+
+  FILE *fp = fopen(filename, "r");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.");
+    return -1;
+  }
+
+  fseek(fp, 0, SEEK_END);
+  long fsize = ftell(fp);
+  rewind(fp);
+
+  *data = (uint8_t *)malloc(fsize);
+  *size = fread(*data, 1, fsize, fp);
+
+  fclose(fp);
+
+  return 0;
+}
+
+static void sgemm_cpu(float *C, const float *A, const float *B, int M, int N,
+                      int K) {
+  for (int m = 0; m < M; ++m) {
+    for (int n = 0; n < N; ++n) {
+      float acc = 0;
+      for (int k = 0; k < K; ++k) {
+        acc += A[k * M + m] * B[n * K + k];
+      }
+      C[n * M + m] = acc;
+    }
+  }
+}
+
+cl_platform_id platform_id = NULL;
+cl_device_id device_id = NULL;
+cl_context context = NULL;
+cl_command_queue command_queue = NULL;
+cl_program program = NULL;
+cl_kernel kernel = NULL;
+cl_mem a_memobj = NULL;
+cl_mem b_memobj = NULL;
+cl_mem c_memobj = NULL;
+uint8_t *kernel_bin = NULL;
+
+static void cleanup() {
+  if (command_queue)
+    clReleaseCommandQueue(command_queue);
+  if (kernel)
+    clReleaseKernel(kernel);
+  if (program)
+    clReleaseProgram(program);
+  if (a_memobj)
+    clReleaseMemObject(a_memobj);
+  if (b_memobj)
+    clReleaseMemObject(b_memobj);
+  if (c_memobj)
+    clReleaseMemObject(c_memobj);
+  if (context)
+    clReleaseContext(context);
+  if (device_id)
+    clReleaseDevice(device_id);
+  if (kernel_bin)
+    free(kernel_bin);
+}
+
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:M:N:K:h?")) != -1) {
+    switch (c) {
+    case 'n':
+      M = N = K = atoi(optarg);
+      break;
+    case 'M':
+      M = atoi(optarg);
+      break;
+    case 'N':
+      N = atoi(optarg);
+      break;
+    case 'K':
+      K = atoi(optarg);
+      break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+
+  if (M < 2 || N < 2 || K < 2) {
+    printf("Error: invalid size!\n");
+    exit(-1);
+  }
+}
+
+int main(int argc, char **argv) {
+  // parse command arguments
+  parse_args(argc, argv);
+
+  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
+  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
+
+  // create context
+  cl_context_properties context_properties[]{
+      CL_CONTEXT_PLATFORM, cl_context_properties(platform_id), 0};
+  cl_device_id devices[]{device_id};
+  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err));
+
+  char device_string[1024];
+  clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
+  printf("Using device: %s\n", device_string);
+  
+  // create command queue
+  command_queue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));
+
+  // generate input data
+  float *A, *B, *C;
+  A = (float *)(malloc(M * K * sizeof(float)));
+  B = (float *)(malloc(N * K * sizeof(float)));
+  C = (float *)(malloc(M * N * sizeof(float)));
+  if (A == NULL || B == NULL || C == NULL) {
+    printf("Not enough memory");
+    cleanup();
+    return -1;
+  }
+  srand(time(NULL));
+  for (int i = 0; i < M * K; i++)
+    A[i] = (int)((float)rand() / (float)RAND_MAX);
+  for (int i = 0; i < N * K; i++)
+    B[i] = (int)((float)rand() / (float)RAND_MAX);
+
+  // create buffers
+  a_memobj =
+      CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                     M * K * sizeof(float), A, &_err));
+  b_memobj =
+      CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                     N * K * sizeof(float), B, &_err));
+  c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY,
+                                   M * N * sizeof(float), NULL, &_err));
+
+  // load kernel text
+  size_t kernel_size;
+  if (read_kernel_file("kernel.cl", &kernel_bin, &kernel_size) != 0) {
+    cleanup();
+    return -1;
+  }
+  program = CL_CHECK2(clCreateProgramWithSource(context, 1, (const char **)&kernel_bin,
+                                      &kernel_size, &_err));
+  if (program == NULL) {
+    cleanup();
+    return -1;
+  }
+
+  // build program
+  cl_int build_status = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
+
+  // create kernel
+  kernel = CL_CHECK2(clCreateKernel(program, "myGEMM3", &_err));
+
+  // check building info
+  size_t log_size = 0;
+  CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0,
+                                 NULL, &log_size));
+  char *log = (char *)malloc(log_size * sizeof(char));
+  if (log == NULL) {
+    printf("Not enough memory");
+    cleanup();
+    return -1;
+  }
+  CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
+                                 log_size, log, NULL));
+  if (log_size > 1) {
+    printf("Log:\n");
+    printf("%s", log);
+    printf("\n");
+  }
+  CL_CHECK(build_status);
+
+
+  // set kernel arguments
+  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &M));
+  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &N));
+  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &K));
+  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&a_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&b_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&c_memobj));
+
+  // run kernel
+  const size_t local[2] = {TS, TS / WPT};
+  const size_t global[2] = {M, N / WPT};
+  printf("Execute the kernel\n");
+  auto time_start = std::chrono::high_resolution_clock::now();
+  CL_CHECK(clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global, local,
+                                  0, NULL, NULL));
+  CL_CHECK(clFinish(command_queue));
+  //CL_CHECK(clWaitForEvents(1, &event));
+  auto time_end = std::chrono::high_resolution_clock::now();
+  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
+                       time_end - time_start)
+                       .count();
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  // get results from VRAM
+  CL_CHECK(clEnqueueReadBuffer(command_queue, c_memobj, CL_TRUE, 0,
+                               M * N * sizeof(float), C, 0, NULL, NULL));
+  CL_CHECK(clFinish(command_queue));
+
+  // verify results
+  printf("Verify result\n");
+  float *C_cpu = (float *)malloc(M * N * sizeof(float));
+  if (C_cpu == NULL) {
+    printf("Not enough memory");
+    cleanup();
+    return -1;
+  }
+  sgemm_cpu(C_cpu, A, B, M, N, K);
+  int errors = 0;
+
+  for (size_t i = 0; i < size_t(M * N); i++)
+    if (C_cpu[i] != C[i])
+      errors++;
+  if (errors != 0)
+    printf("FAILED! - %d errors\n", errors);
+  else
+    printf("PASSED!\n");
+
+  // free resureses
+  cleanup();
+  free(A);
+  free(B);
+  free(C);
+  free(log);
+  free(C_cpu);
+  return errors;
+}
diff --git a/tests/opencl/kernel4/Makefile b/tests/opencl/kernel4/Makefile
new file mode 100644
index 000000000..312ec638d
--- /dev/null
+++ b/tests/opencl/kernel4/Makefile
@@ -0,0 +1,20 @@
+ROOT_DIR := $(realpath ../../..)
+include $(ROOT_DIR)/config.mk
+
+PROJECT := kernel4
+
+SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
+
+SRCS := $(SRC_DIR)/main.cc
+
+kernel.cl: $(SRC_DIR)/kernel.cl
+	cp $< $@
+
+common.h: $(SRC_DIR)/common.h
+	cp $< $@
+
+KERNEL_SRCS := kernel.cl common.h
+
+OPTS ?=
+
+include ../common.mk
diff --git a/tests/opencl/kernel4/common.h b/tests/opencl/kernel4/common.h
new file mode 100644
index 000000000..3ed810585
--- /dev/null
+++ b/tests/opencl/kernel4/common.h
@@ -0,0 +1,7 @@
+#ifndef COMMON_H
+#define COMMON_H
+
+#define TS 8
+#define WIDTH 4
+
+#endif // COMMON_H
\ No newline at end of file
diff --git a/tests/opencl/kernel4/kernel.cl b/tests/opencl/kernel4/kernel.cl
new file mode 100644
index 000000000..179789d97
--- /dev/null
+++ b/tests/opencl/kernel4/kernel.cl
@@ -0,0 +1,110 @@
+#include "common.h"
+
+#if WIDTH == 1
+    typedef float floatX;
+#elif WIDTH == 2
+    typedef float2 floatX;
+#elif WIDTH == 4
+    typedef float4 floatX;
+#elif WIDTH == 8
+    typedef float8 floatX;
+#endif
+
+__kernel void myGEMM4(const int M, const int N, const int K,
+                      const __global floatX* A,
+                      const __global floatX* B,
+                      __global floatX* C) {
+
+    // Thread identifiers
+    const int row = get_local_id(0); // Local row ID (max: TS/WIDTH)
+    const int col = get_local_id(1); // Local col ID (max: TS)
+    const int globalRow = (TS/WIDTH)*get_group_id(0) + row; // Row ID of C (0..M/WIDTH)
+    const int globalCol = TS*get_group_id(1) + col; // Col ID of C (0..N)
+
+    // Local memory to fit a tile of TS*TS elements of A and B
+    __local floatX Asub[TS][TS/WIDTH];
+    __local floatX Bsub[TS][TS/WIDTH];
+
+    // Initialise the accumulation registers
+    #if WIDTH == 1
+        floatX acc = 0.0f;
+    #elif WIDTH == 2
+        floatX acc = { 0.0f, 0.0f };
+    #elif WIDTH == 4
+        floatX acc = { 0.0f, 0.0f, 0.0f, 0.0f };
+    #elif WIDTH == 8
+        floatX acc = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
+    #endif
+    
+    // Loop over all tiles
+    const int numTiles = K/TS;
+    for (int tile=0; tile<numTiles; tile++) {
+
+        // Load one tile of A and B into local memory
+        const int tiledRow = (TS/WIDTH)*tile + row;
+        const int tiledCol = TS*tile + col;
+        Asub[col][row] = A[tiledCol*(M/WIDTH) + globalRow];
+        Bsub[col][row] = B[globalCol*(K/WIDTH) + tiledRow];
+
+        // Synchronise to make sure the tile is loaded
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // Perform the computation for a single tile
+        floatX vecA, vecB;
+        float valB;
+        for (int k=0; k<TS/WIDTH; k++) {
+            vecB = Bsub[col][k];
+            for (int w=0; w<WIDTH; w++) {
+                vecA = Asub[WIDTH*k + w][row];
+                #if WIDTH == 1
+                    valB = vecB;
+                    acc += vecA * valB;
+                #elif WIDTH == 2
+                    switch (w) {
+                        case 0: valB = vecB.x; break;
+                        case 1: valB = vecB.y; break;
+                    }
+                    acc.x += vecA.x * valB;
+                    acc.y += vecA.y * valB;
+                #elif WIDTH == 4
+                    switch (w) {
+                        case 0: valB = vecB.x; break;
+                        case 1: valB = vecB.y; break;
+                        case 2: valB = vecB.z; break;
+                        case 3: valB = vecB.w; break;
+                    }
+                    acc.x += vecA.x * valB;
+                    acc.y += vecA.y * valB;
+                    acc.z += vecA.z * valB;
+                    acc.w += vecA.w * valB;
+                #elif WIDTH == 8
+                    switch (w) {
+                        case 0: valB = vecB.s0; break;
+                        case 1: valB = vecB.s1; break;
+                        case 2: valB = vecB.s2; break;
+                        case 3: valB = vecB.s3; break;
+                        case 4: valB = vecB.s4; break;
+                        case 5: valB = vecB.s5; break;
+                        case 6: valB = vecB.s6; break;
+                        case 7: valB = vecB.s7; break;
+                    }
+                    acc.s0 += vecA.s0 * valB;
+                    acc.s1 += vecA.s1 * valB;
+                    acc.s2 += vecA.s2 * valB;
+                    acc.s3 += vecA.s3 * valB;
+                    acc.s4 += vecA.s4 * valB;
+                    acc.s5 += vecA.s5 * valB;
+                    acc.s6 += vecA.s6 * valB;
+                    acc.s7 += vecA.s7 * valB;
+                #endif
+            }
+        }
+
+        // Synchronise before loading the next tile
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // Store the final results in C
+    C[globalCol*(M/WIDTH) + globalRow] = acc;
+}
+
diff --git a/tests/opencl/kernel4/main.cc b/tests/opencl/kernel4/main.cc
new file mode 100644
index 000000000..780541652
--- /dev/null
+++ b/tests/opencl/kernel4/main.cc
@@ -0,0 +1,283 @@
+#include <CL/opencl.h>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "common.h"
+
+int M = 256, N = 256, K = 256;
+
+static void show_usage() {
+  printf("Usage: [-M number of rows in first matrix] [-N number of columns in first matrix] [-K number of columns in first matrix and rows in second matrix] [-h: help]\n");
+}
+
+#define CL_CHECK(_expr)                                                        \
+  do {                                                                         \
+    cl_int _err = _expr;                                                       \
+    if (_err == CL_SUCCESS)                                                    \
+      break;                                                                   \
+    printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);            \
+    cleanup();                                                                 \
+    exit(-1);                                                                  \
+  } while (0)
+
+#define CL_CHECK2(_expr)                                                       \
+  ({                                                                           \
+    cl_int _err = CL_INVALID_VALUE;                                            \
+    decltype(_expr) _ret = _expr;                                              \
+    if (_err != CL_SUCCESS) {                                                  \
+      printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);          \
+      cleanup();                                                               \
+      exit(-1);                                                                \
+    }                                                                          \
+    _ret;                                                                      \
+  })
+
+static int read_kernel_file(const char *filename, uint8_t **data,
+                            size_t *size) {
+  if (NULL == filename || NULL == data || 0 == size)
+    return -1;
+
+  FILE *fp = fopen(filename, "r");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.");
+    return -1;
+  }
+
+  fseek(fp, 0, SEEK_END);
+  long fsize = ftell(fp);
+  rewind(fp);
+
+  *data = (uint8_t *)malloc(fsize);
+  *size = fread(*data, 1, fsize, fp);
+
+  fclose(fp);
+
+  return 0;
+}
+
+static void sgemm_cpu(float *C, const float *A, const float *B, int M, int N,
+                      int K) {
+  for (int m = 0; m < M; ++m) {
+    for (int n = 0; n < N; ++n) {
+      float acc = 0;
+      for (int k = 0; k < K; ++k) {
+        acc += A[k * M + m] * B[n * K + k];
+      }
+      C[n * M + m] = acc;
+    }
+  }
+}
+
+cl_platform_id platform_id = NULL;
+cl_device_id device_id = NULL;
+cl_context context = NULL;
+cl_command_queue command_queue = NULL;
+cl_program program = NULL;
+cl_kernel kernel = NULL;
+cl_mem a_memobj = NULL;
+cl_mem b_memobj = NULL;
+cl_mem c_memobj = NULL;
+uint8_t *kernel_bin = NULL;
+
+static void cleanup() {
+  if (command_queue)
+    clReleaseCommandQueue(command_queue);
+  if (kernel)
+    clReleaseKernel(kernel);
+  if (program)
+    clReleaseProgram(program);
+  if (a_memobj)
+    clReleaseMemObject(a_memobj);
+  if (b_memobj)
+    clReleaseMemObject(b_memobj);
+  if (c_memobj)
+    clReleaseMemObject(c_memobj);
+  if (context)
+    clReleaseContext(context);
+  if (device_id)
+    clReleaseDevice(device_id);
+  if (kernel_bin)
+    free(kernel_bin);
+}
+
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:M:N:K:h?")) != -1) {
+    switch (c) {
+    case 'n':
+      M = N = K = atoi(optarg);
+      break;
+    case 'M':
+      M = atoi(optarg);
+      break;
+    case 'N':
+      N = atoi(optarg);
+      break;
+    case 'K':
+      K = atoi(optarg);
+      break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+
+  if (M < 2 || N < 2 || K < 2) {
+    printf("Error: invalid size!\n");
+    exit(-1);
+  }
+}
+
+int main(int argc, char **argv) {
+  // parse command arguments
+  parse_args(argc, argv);
+
+  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
+  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
+
+  // create context
+  cl_context_properties context_properties[]{
+      CL_CONTEXT_PLATFORM, cl_context_properties(platform_id), 0};
+  cl_device_id devices[]{device_id};
+  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err));
+
+  char device_string[1024];
+  clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
+  printf("Using device: %s\n", device_string);
+
+  // create command queue
+  command_queue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));
+
+  // generate input data
+  float *A, *B, *C;
+  A = (float *)(malloc(M * K * sizeof(float)));
+  B = (float *)(malloc(N * K * sizeof(float)));
+  C = (float *)(malloc(M * N * sizeof(float)));
+  if (A == NULL || B == NULL || C == NULL) {
+    printf("Not enough memory");
+    cleanup();
+    return -1;
+  }
+  srand(time(NULL));
+  for (int i = 0; i < M * K; i++)
+    A[i] = (int)((float)rand() / (float)RAND_MAX);
+  for (int i = 0; i < N * K; i++)
+    B[i] = (int)((float)rand() / (float)RAND_MAX);
+
+  // create buffers
+  a_memobj =
+      CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                     M * K * sizeof(float), A, &_err));
+  b_memobj =
+      CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                     N * K * sizeof(float), B, &_err));
+  c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY,
+                                   M * N * sizeof(float), NULL, &_err));
+
+  // load kernel text
+  size_t kernel_size;
+  if (read_kernel_file("kernel.cl", &kernel_bin, &kernel_size) != 0) {
+    cleanup();
+    return -1;
+  }
+  program = CL_CHECK2(clCreateProgramWithSource(context, 1, (const char **)&kernel_bin,
+                                      &kernel_size, &_err));
+  if (program == NULL) {
+    cleanup();
+    return -1;
+  }
+
+  // build program
+  cl_int build_status = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
+
+  // create kernel
+  kernel = CL_CHECK2(clCreateKernel(program, "myGEMM4", &_err));
+
+  // check building info
+  size_t log_size = 0;
+  CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0,
+                                 NULL, &log_size));
+  char *log = (char *)malloc(log_size * sizeof(char));
+  if (log == NULL) {
+    printf("Not enough memory");
+    cleanup();
+    return -1;
+  }
+  CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
+                                 log_size, log, NULL));
+  if (log_size > 1) {
+    printf("Log:\n");
+    printf("%s", log);
+    printf("\n");
+  }
+  CL_CHECK(build_status);
+
+
+  // set kernel arguments
+  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &M));
+  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &N));
+  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &K));
+  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&a_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&b_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&c_memobj));
+
+  // run kernel
+  const size_t local[2] = {TS / WIDTH, TS};
+  const size_t global[2] = {M / WIDTH, N};
+  printf("Execute the kernel\n");
+  auto time_start = std::chrono::high_resolution_clock::now();
+  CL_CHECK(clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global, local,
+                                  0, NULL, NULL));
+  CL_CHECK(clFinish(command_queue));
+  //CL_CHECK(clWaitForEvents(1, &event));
+  auto time_end = std::chrono::high_resolution_clock::now();
+  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
+                       time_end - time_start)
+                       .count();
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  // get results from VRAM
+  CL_CHECK(clEnqueueReadBuffer(command_queue, c_memobj, CL_TRUE, 0,
+                               M * N * sizeof(float), C, 0, NULL, NULL));
+  CL_CHECK(clFinish(command_queue));
+
+  // verify results
+  printf("Verify result\n");
+  float *C_cpu = (float *)malloc(M * N * sizeof(float));
+  if (C_cpu == NULL) {
+    printf("Not enough memory");
+    cleanup();
+    return -1;
+  }
+  sgemm_cpu(C_cpu, A, B, M, N, K);
+  int errors = 0;
+
+  for (size_t i = 0; i < size_t(M * N); i++)
+    if (C_cpu[i] != C[i])
+      errors++;
+  if (errors != 0)
+    printf("FAILED! - %d errors\n", errors);
+  else
+    printf("PASSED!\n");
+
+  // free resureses
+  cleanup();
+  free(A);
+  free(B);
+  free(C);
+  free(log);
+  free(C_cpu);
+  return errors;
+}