diff --git a/blackbox.simx.cache b/blackbox.simx.cache new file mode 100644 index 000000000..32e551ea1 --- /dev/null +++ b/blackbox.simx.cache @@ -0,0 +1 @@ +-DNUM_CLUSTERS=1 -DNUM_CORES=16 -DNUM_WARPS=16 -DNUM_THREADS=16 +0+0 diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile index db8366795..a2c479784 100644 --- a/tests/opencl/Makefile +++ b/tests/opencl/Makefile @@ -22,6 +22,10 @@ all: $(MAKE) -C sgemm2 $(MAKE) -C sgemm3 $(MAKE) -C psum + $(MAKE) -C kernel1 + $(MAKE) -C kernel2 + $(MAKE) -C kernel3 + $(MAKE) -C kernel4 run-simx: $(MAKE) -C vecadd run-simx @@ -43,6 +47,10 @@ run-simx: $(MAKE) -C sgemm2 run-simx $(MAKE) -C sgemm3 run-simx $(MAKE) -C psum run-simx + $(MAKE) -C kernel1 run-simx + $(MAKE) -C kernel2 run-simx + $(MAKE) -C kernel3 run-simx + $(MAKE) -C kernel4 run-simx run-rtlsim: $(MAKE) -C vecadd run-rtlsim @@ -64,6 +72,10 @@ run-rtlsim: $(MAKE) -C sgemm2 run-rtlsim $(MAKE) -C sgemm3 run-rtlsim $(MAKE) -C psum run-rtlsim + $(MAKE) -C kernel1 run-rtlsimy + $(MAKE) -C kernel2 run-rtlsimy + $(MAKE) -C kernel3 run-rtlsimy + $(MAKE) -C kernel4 run-rtlsimy run-opae: $(MAKE) -C vecadd run-opae @@ -85,6 +97,10 @@ run-opae: $(MAKE) -C sgemm2 run-opae $(MAKE) -C sgemm3 run-opae $(MAKE) -C psum run-opae + $(MAKE) -C kernel1 run-opae + $(MAKE) -C kernel2 run-opae + $(MAKE) -C kernel3 run-opae + $(MAKE) -C kernel4 run-opae clean: $(MAKE) -C vecadd clean @@ -106,4 +122,8 @@ clean: $(MAKE) -C bfs clean $(MAKE) -C sgemm2 clean $(MAKE) -C sgemm3 clean - $(MAKE) -C psum clean \ No newline at end of file + $(MAKE) -C psum clean + $(MAKE) -C kernel1 clean + $(MAKE) -C kernel2 clean + $(MAKE) -C kernel3 clean + $(MAKE) -C kernel4 clean diff --git a/tests/opencl/j_stat/statistic.ipynb b/tests/opencl/j_stat/statistic.ipynb new file mode 100644 index 000000000..af5afdc76 --- /dev/null +++ b/tests/opencl/j_stat/statistic.ipynb @@ -0,0 +1,268 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "import subprocess\n", + "import matplotlib.pyplot as plt\n", + "from dataclasses import dataclass, field\n", + "import pandas as pd\n", + "from tqdm import tqdm\n", + "from pathlib import Path" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "# architecture parameters\n", + "@dataclass\n", + "class arch:\n", + " warps: int = 1\n", + " cores: int = 1\n", + " threads: int = 1\n", + "# running parameters \n", + "@dataclass\n", + "class run:\n", + " arch: arch\n", + " kernel: str\n", + " driver: str = \"simx\"\n", + " args: dict = field(default_factory=dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "path_to_vortex = Path.cwd().parent.parent.parent\n", + "tile_size = 'TS'\n", + "work_per_thread = 'WPT'\n", + "width = 'WIDTH'" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "def error_running (run_params: run, error_text: str) -> str:\n", + " return f\"error in running in {run_params.kernel} : warps={run_params.arch.warps} cores={run_params.arch.cores} threads={run_params.arch.threads}\" \\\n", + " f\" driver={run_params.driver} args=-N{run_params.args['N']} -M{run_params.args['M']} -K{run_params.args['K']} error message - {error_text}/n\"" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "def error_verification (run_params: run, number_of_errors: str) -> str:\n", + " return f\"error in verifing results {run_params.kernel} : warps={run_params.arch.warps} cores={run_params.arch.cores} threads={run_params.arch.threads}\" \\\n", + " f\" driver={run_params.driver} args=-N{run_params.args['N']} -M{run_params.args['M']} -K{run_params.args['K']} Number of errors : {number_of_errors}'\\n'\" " + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "def create_common_h (params: dict, kernel_name: str):\n", + " file_name = f\"{path_to_vortex}/tests/opencl/{kernel_name}/common.h\"\n", + " with open(file_name, 'w') as file:\n", + " file.write(\"#ifndef COMMON_H\\n\" + \"#define COMMON_H\\n\" + \"\\n\")\n", + " if tile_size in params:\n", + " file.write(f\"#define TS {params[tile_size]}\\n\")\n", + " if work_per_thread in params:\n", + " file.write(f\"#define WPT {params[work_per_thread]}\\n\")\n", + " file.write(\"#define RTS (TS/WPT)\\n\")\n", + " if width in params:\n", + " file.write(f\"#define WIDTH {params[width]}\\n\")\n", + " file.write('\\n' + \"#endif // COMMON_H\")\n", + " # open main.cc file to recompile before run with new common.h\n", + " Path(f\"{path_to_vortex}/tests/opencl/{kernel_name}/main.cc\").touch(exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "def perf (run_params: run, path_to_output_file: str) -> pd.DataFrame:\n", + " # run kernel\n", + " vortex = f\"--warps={run_params.arch.warps} --cores={run_params.arch.cores} --threads={run_params.arch.threads}\"\n", + " run_args = f\"-N{run_params.args['N']} -M{run_params.args['M']} -K{run_params.args['K']}\"\n", + " command = f\"cd {path_to_vortex}/build && ./ci/blackbox.sh {vortex} --driver={run_params.driver} --app={run_params.kernel} --args=\\\"{run_args}\\\"\"\n", + " print(command)\n", + " result = subprocess.run(f\"{command} > {path_to_output_file}\", shell=True)\n", + "\n", + " # collect statistic \n", + " with open(path_to_output_file, 'r') as file:\n", + " lines = file.readlines()\n", + " error_message = \"\"\n", + " general_perf_stat = \"\"\n", + " for line in lines:\n", + " if \"PERF:\" in line:\n", + " general_perf_stat = line\n", + " # check for errors\n", + " if result != 0:\n", + " if \"FAILED\" in line: \n", + " error_message = error_verification(run_params, line[line.find(\"FAILED! - \"):])\n", + " if \"Error\" in line:\n", + " error_message = error_running(run_params, line[line.find(\"Error:\"):])\n", + " # pars string with general perf statistic of running kernel\n", + " pairs = general_perf_stat.replace(\"PERF: \", \"\").split(\", \")\n", + " perf_dict = {key_value.split(\"=\")[0]: float(key_value.split(\"=\")[1]) for key_value in pairs}\n", + " if perf_dict[\"cycles\"] <= 0:\n", + " error_message = error_running(run_params, \"Invalid number of cycles\")\n", + " # write result to data frame\n", + " run_result = pd.DataFrame([{\"kernel\": run_params.kernel[-1], \"driver\": run_params.driver, \"cores\": run_params.arch.cores, \n", + " \"warps\": run_params.arch.warps, \"threads\": run_params.arch.threads, \"M\": run_params.args[\"M\"], \n", + " \"N\": run_params.args[\"N\"], \"K\": run_params.args[\"K\"], \"instrs\": perf_dict[\"instrs\"], \"cycles\": perf_dict[\"cycles\"],\n", + " \"IPC\": perf_dict[\"IPC\"], \"error\": error_message}])\n", + " return run_result" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "def draw (data_frame: pd.DataFrame, x_label: str, y_label: str, title: str, path: str):\n", + " data_frame.plot(kind = \"bar\", x = x_label, y = y_label)\n", + " plt.title(title)\n", + " plt.xlabel(x_label)\n", + " plt.ylabel(y_label)\n", + " plt.savefig(path)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/4 [00:00 37\u001b[0m data_frames\u001b[38;5;241m.\u001b[39mappend(\u001b[43mperf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_file\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 38\u001b[0m data_frame \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mconcat(data_frames, ignore_index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 40\u001b[0m \u001b[38;5;66;03m# draw graph based on the recived statistic\u001b[39;00m\n", + "Cell \u001b[0;32mIn[43], line 26\u001b[0m, in \u001b[0;36mperf\u001b[0;34m(run_params, path_to_output_file)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# pars string with general perf statistic of running kernel\u001b[39;00m\n\u001b[1;32m 25\u001b[0m pairs \u001b[38;5;241m=\u001b[39m general_perf_stat\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPERF: \u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 26\u001b[0m perf_dict \u001b[38;5;241m=\u001b[39m \u001b[43m{\u001b[49m\u001b[43mkey_value\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mfloat\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mkey_value\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mkey_value\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpairs\u001b[49m\u001b[43m}\u001b[49m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m perf_dict[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcycles\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 28\u001b[0m error_message \u001b[38;5;241m=\u001b[39m error_running(run_params, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid number of cycles\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "Cell \u001b[0;32mIn[43], line 26\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# pars string with general perf statistic of running kernel\u001b[39;00m\n\u001b[1;32m 25\u001b[0m pairs \u001b[38;5;241m=\u001b[39m general_perf_stat\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPERF: \u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 26\u001b[0m perf_dict \u001b[38;5;241m=\u001b[39m {key_value\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m=\u001b[39m\u001b[38;5;124m\"\u001b[39m)[\u001b[38;5;241m0\u001b[39m]: \u001b[38;5;28mfloat\u001b[39m(\u001b[43mkey_value\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m) \u001b[38;5;28;01mfor\u001b[39;00m key_value \u001b[38;5;129;01min\u001b[39;00m pairs}\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m perf_dict[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcycles\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 28\u001b[0m error_message \u001b[38;5;241m=\u001b[39m error_running(run_params, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid number of cycles\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mIndexError\u001b[0m: list index out of range" + ] + } + ], + "source": [ + "# create common.h files for each kernel\n", + "params1 = {\n", + " tile_size: 4\n", + "}\n", + "create_common_h(params1, \"kernel1\")\n", + "create_common_h(params1, \"kernel2\")\n", + "\n", + "params3 = {\n", + " tile_size: 4,\n", + " work_per_thread: 4\n", + "}\n", + "create_common_h(params3, \"kernel3\")\n", + "\n", + "params4 = {\n", + " tile_size: 8,\n", + " width: 4\n", + "}\n", + "create_common_h(params4, \"kernel4\")\n", + "\n", + "# fill running params data class for each kernel\n", + "run_p = []\n", + "arg = {\n", + " \"M\": 16,\n", + " \"N\": 16,\n", + " \"K\": 16\n", + "}\n", + "arch_p = arch(threads=8, cores=1, warps=2)\n", + "run_p.append(run(arch_p, kernel=\"kernel1\", driver=\"simx\", args=arg))\n", + "run_p.append(run(arch_p, kernel=\"kernel2\", driver=\"simx\", args=arg))\n", + "run_p.append(run(arch_p, kernel=\"kernel3\", driver=\"simx\", args=arg))\n", + "run_p.append(run(arch_p, kernel=\"kernel4\", driver=\"simx\", args=arg))\n", + "\n", + "# run all kernels and collect statistic in data frame\n", + "data_frames = []\n", + "output_file = f\"{path_to_vortex}/tests/opencl/j_stat/output.txt\"\n", + "for params in tqdm(run_p):\n", + " data_frames.append(perf(params, output_file))\n", + "data_frame = pd.concat(data_frames, ignore_index=True)\n", + "\n", + "# draw graph based on the recived statistic\n", + "draw(data_frame, \"kernel\", \"cycles\", \"number of cycles per kernel\", f\"graphics/graph.png\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/opencl/kernel1/Makefile b/tests/opencl/kernel1/Makefile new file mode 100644 index 000000000..37ada9b31 --- /dev/null +++ b/tests/opencl/kernel1/Makefile @@ -0,0 +1,17 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := kernel1 + +SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cc + +kernel.cl: $(SRC_DIR)/kernel.cl + cp $< $@ + +KERNEL_SRCS := kernel.cl + +OPTS ?= + +include ../common.mk diff --git a/tests/opencl/kernel1/common.h b/tests/opencl/kernel1/common.h new file mode 100644 index 000000000..e51835d3c --- /dev/null +++ b/tests/opencl/kernel1/common.h @@ -0,0 +1,6 @@ +#ifndef COMMON_H +#define COMMON_H + +#define TS 4 + +#endif // COMMON_H \ No newline at end of file diff --git a/tests/opencl/kernel1/kernel.cl b/tests/opencl/kernel1/kernel.cl new file mode 100644 index 000000000..e5ebd4958 --- /dev/null +++ b/tests/opencl/kernel1/kernel.cl @@ -0,0 +1,18 @@ +__kernel void myGEMM1(const int M, const int N, const int K, + const __global float* A, + const __global float* B, + __global float* C) { + // Thread identifiers + const int globalRow = get_global_id(0); // Row ID of C (0..M) + const int globalCol = get_global_id(1); // Col ID of C (0..N) + + // Compute a single element (loop over K) + float acc = 0.0f; + for (int k=0; k +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +int M = 0, N = 0, K = 0; + +static void show_usage() { + printf("Usage: [-M number of rows in first matrix] [-N number of columns in first matrix] [-K number of columns in first matrix and rows in second matrix] [-h: help]\n"); +} + +#define CL_CHECK(_expr) \ + do { \ + cl_int _err = _expr; \ + if (_err == CL_SUCCESS) \ + break; \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } while (0) + +#define CL_CHECK2(_expr) \ + ({ \ + cl_int _err = CL_INVALID_VALUE; \ + decltype(_expr) _ret = _expr; \ + if (_err != CL_SUCCESS) { \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } \ + _ret; \ + }) + +static int read_kernel_file(const char *filename, uint8_t **data, + size_t *size) { + if (NULL == filename || NULL == data || 0 == size) + return -1; + + FILE *fp = fopen(filename, "r"); + if (NULL == fp) { + fprintf(stderr, "Failed to load kernel."); + return -1; + } + + fseek(fp, 0, SEEK_END); + long fsize = ftell(fp); + rewind(fp); + + *data = (uint8_t *)malloc(fsize); + *size = fread(*data, 1, fsize, fp); + + fclose(fp); + + return 0; +} + +static void sgemm_cpu(float *C, const float *A, const float *B, int M, int N, + int K) { + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + float acc = 0; + for (int k = 0; k < K; ++k) { + acc += A[k * M + m] * B[n * K + k]; + } + C[n * M + m] = acc; + } + } +} + +cl_platform_id platform_id = NULL; +cl_device_id device_id = NULL; +cl_context context = NULL; +cl_command_queue command_queue = NULL; +cl_program program = NULL; +cl_kernel kernel = NULL; +cl_mem a_memobj = NULL; +cl_mem b_memobj = NULL; +cl_mem c_memobj = NULL; +uint8_t *kernel_bin = NULL; + +static void cleanup() { + if (command_queue) + clReleaseCommandQueue(command_queue); + if (kernel) + clReleaseKernel(kernel); + if (program) + clReleaseProgram(program); + if (a_memobj) + clReleaseMemObject(a_memobj); + if (b_memobj) + clReleaseMemObject(b_memobj); + if (c_memobj) + clReleaseMemObject(c_memobj); + if (context) + clReleaseContext(context); + if (device_id) + clReleaseDevice(device_id); + if (kernel_bin) + free(kernel_bin); +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:M:N:K:h?")) != -1) { + switch (c) { + case 'n': + M = N = K = atoi(optarg); + break; + case 'M': + M = atoi(optarg); + break; + case 'N': + N = atoi(optarg); + break; + case 'K': + K = atoi(optarg); + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } + + if (M < 2 || N < 2 || K < 2 || TS < 2) { + printf("Error: invalid size!\n"); + exit(-1); + } +} + +int main(int argc, char **argv) { + // parse command arguments + parse_args(argc, argv); + + CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); + CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL)); + + // create context + cl_context_properties context_properties[]{ + CL_CONTEXT_PLATFORM, cl_context_properties(platform_id), 0}; + cl_device_id devices[]{device_id}; + context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err)); + + char device_string[1024]; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL); + printf("Using device: %s\n", device_string); + + // create command queue + command_queue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err)); + + // generate input data + float *A, *B, *C; + A = (float *)(malloc(M * K * sizeof(float))); + B = (float *)(malloc(N * K * sizeof(float))); + C = (float *)(malloc(M * N * sizeof(float))); + if (A == NULL || B == NULL || C == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + srand(time(NULL)); + for (int i = 0; i < M * K; i++) + A[i] = (int)((float)rand() / (float)RAND_MAX); + for (int i = 0; i < N * K; i++) + B[i] = (int)((float)rand() / (float)RAND_MAX); + + // create buffers + a_memobj = + CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + M * K * sizeof(float), A, &_err)); + b_memobj = + CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + N * K * sizeof(float), B, &_err)); + c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, + M * N * sizeof(float), NULL, &_err)); + + // load kernel text + size_t kernel_size; + if (read_kernel_file("kernel.cl", &kernel_bin, &kernel_size) != 0) { + cleanup(); + return -1; + } + program = CL_CHECK2(clCreateProgramWithSource(context, 1, (const char **)&kernel_bin, + &kernel_size, &_err)); + if (program == NULL) { + cleanup(); + return -1; + } + + // build program + cl_int build_status = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); + + // create kernel + kernel = CL_CHECK2(clCreateKernel(program, "myGEMM1", &_err)); + + // check building info + size_t log_size = 0; + CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, + NULL, &log_size)); + char *log = (char *)malloc(log_size * sizeof(char)); + if (log == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, + log_size, log, NULL)); + if (log_size > 1) { + printf("Log:\n"); + printf("%s", log); + printf("\n"); + } + CL_CHECK(build_status); + + + // set kernel arguments + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &M)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &N)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &K)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&a_memobj)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&b_memobj)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&c_memobj)); + + // run kernel + const size_t local[2] = {TS, TS}; + const size_t global[2] = {M, N}; + printf("Execute the kernel\n"); + auto time_start = std::chrono::high_resolution_clock::now(); + CL_CHECK(clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global, local, + 0, NULL, NULL)); + CL_CHECK(clFinish(command_queue)); + //CL_CHECK(clWaitForEvents(1, &event)); + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast( + time_end - time_start) + .count(); + printf("Elapsed time: %lg ms\n", elapsed); + + // get results from VRAM + CL_CHECK(clEnqueueReadBuffer(command_queue, c_memobj, CL_TRUE, 0, + M * N * sizeof(float), C, 0, NULL, NULL)); + CL_CHECK(clFinish(command_queue)); + + // verify results + printf("Verify result\n"); + float *C_cpu = (float *)malloc(M * N * sizeof(float)); + if (C_cpu == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + sgemm_cpu(C_cpu, A, B, M, N, K); + int errors = 0; + + for (size_t i = 0; i < size_t(M * N); i++) + if (C_cpu[i] != C[i]) + errors++; + if (errors != 0) + printf("FAILED! - %d errors\n", errors); + else + printf("PASSED!\n"); + + // free resureses + cleanup(); + free(A); + free(B); + free(C); + free(log); + free(C_cpu); + return errors; +} diff --git a/tests/opencl/kernel2/Makefile b/tests/opencl/kernel2/Makefile new file mode 100644 index 000000000..fa9b7f379 --- /dev/null +++ b/tests/opencl/kernel2/Makefile @@ -0,0 +1,20 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := kernel2 + +SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cc + +kernel.cl: $(SRC_DIR)/kernel.cl + cp $< $@ + +common.h: $(SRC_DIR)/common.h + cp $< $@ + +KERNEL_SRCS := kernel.cl common.h + +OPTS ?= + +include ../common.mk diff --git a/tests/opencl/kernel2/common.h b/tests/opencl/kernel2/common.h new file mode 100644 index 000000000..e51835d3c --- /dev/null +++ b/tests/opencl/kernel2/common.h @@ -0,0 +1,6 @@ +#ifndef COMMON_H +#define COMMON_H + +#define TS 4 + +#endif // COMMON_H \ No newline at end of file diff --git a/tests/opencl/kernel2/kernel.cl b/tests/opencl/kernel2/kernel.cl new file mode 100644 index 000000000..692318cde --- /dev/null +++ b/tests/opencl/kernel2/kernel.cl @@ -0,0 +1,46 @@ +#include "common.h" + +__kernel void myGEMM2(const int M, const int N, const int K, + const __global float* A, + const __global float* B, + __global float* C) { + + // Thread identifiers + const int row = get_local_id(0); // Local row ID (max: TS) + const int col = get_local_id(1); // Local col ID (max: TS) + const int globalRow = TS*get_group_id(0) + row; // Row ID of C (0..M) + const int globalCol = TS*get_group_id(1) + col; // Col ID of C (0..N) + + // Local memory to fit a tile of TS*TS elements of A and B + __local float Asub[TS][TS]; + __local float Bsub[TS][TS]; + + // Initialise the accumulation register + float acc = 0.0f; + + // Loop over all tiles + const int numTiles = K/TS; + for (int t=0; t +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +int M = 256, N = 256, K = 256; + +static void show_usage() { + printf("Usage: [-M number of rows in first matrix] [-N number of columns in first matrix] [-K number of columns in first matrix and rows in second matrix] [-h: help]\n"); +} + +#define CL_CHECK(_expr) \ + do { \ + cl_int _err = _expr; \ + if (_err == CL_SUCCESS) \ + break; \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } while (0) + +#define CL_CHECK2(_expr) \ + ({ \ + cl_int _err = CL_INVALID_VALUE; \ + decltype(_expr) _ret = _expr; \ + if (_err != CL_SUCCESS) { \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } \ + _ret; \ + }) + +static int read_kernel_file(const char *filename, uint8_t **data, + size_t *size) { + if (NULL == filename || NULL == data || 0 == size) + return -1; + + FILE *fp = fopen(filename, "r"); + if (NULL == fp) { + fprintf(stderr, "Failed to load kernel."); + return -1; + } + + fseek(fp, 0, SEEK_END); + long fsize = ftell(fp); + rewind(fp); + + *data = (uint8_t *)malloc(fsize); + *size = fread(*data, 1, fsize, fp); + + fclose(fp); + + return 0; +} + +static void sgemm_cpu(float *C, const float *A, const float *B, int M, int N, + int K) { + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + float acc = 0; + for (int k = 0; k < K; ++k) { + acc += A[k * M + m] * B[n * K + k]; + } + C[n * M + m] = acc; + } + } +} + +cl_platform_id platform_id = NULL; +cl_device_id device_id = NULL; +cl_context context = NULL; +cl_command_queue command_queue = NULL; +cl_program program = NULL; +cl_kernel kernel = NULL; +cl_mem a_memobj = NULL; +cl_mem b_memobj = NULL; +cl_mem c_memobj = NULL; +uint8_t *kernel_bin = NULL; + +static void cleanup() { + if (command_queue) + clReleaseCommandQueue(command_queue); + if (kernel) + clReleaseKernel(kernel); + if (program) + clReleaseProgram(program); + if (a_memobj) + clReleaseMemObject(a_memobj); + if (b_memobj) + clReleaseMemObject(b_memobj); + if (c_memobj) + clReleaseMemObject(c_memobj); + if (context) + clReleaseContext(context); + if (device_id) + clReleaseDevice(device_id); + if (kernel_bin) + free(kernel_bin); +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:M:N:K:h?")) != -1) { + switch (c) { + case 'n': + M = N = K = atoi(optarg); + break; + case 'M': + M = atoi(optarg); + break; + case 'N': + N = atoi(optarg); + break; + case 'K': + K = atoi(optarg); + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } + + if (M < 2 || N < 2 || K < 2 || TS < 2) { + printf("Error: invalid size!\n"); + exit(-1); + } +} + +int main(int argc, char **argv) { + // parse command arguments + parse_args(argc, argv); + + CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); + CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL)); + + // create context + cl_context_properties context_properties[]{ + CL_CONTEXT_PLATFORM, cl_context_properties(platform_id), 0}; + cl_device_id devices[]{device_id}; + context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err)); + + char device_string[1024]; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL); + printf("Using device: %s\n", device_string); + + // create command queue + command_queue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err)); + + // generate input data + float *A, *B, *C; + A = (float *)(malloc(M * K * sizeof(float))); + B = (float *)(malloc(N * K * sizeof(float))); + C = (float *)(malloc(M * N * sizeof(float))); + if (A == NULL || B == NULL || C == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + srand(time(NULL)); + for (int i = 0; i < M * K; i++) + A[i] = (int)((float)rand() / (float)RAND_MAX); + for (int i = 0; i < N * K; i++) + B[i] = (int)((float)rand() / (float)RAND_MAX); + + // create buffers + a_memobj = + CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + M * K * sizeof(float), A, &_err)); + b_memobj = + CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + N * K * sizeof(float), B, &_err)); + c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, + M * N * sizeof(float), NULL, &_err)); + + // load kernel text + size_t kernel_size; + if (read_kernel_file("kernel.cl", &kernel_bin, &kernel_size) != 0) { + cleanup(); + return -1; + } + program = CL_CHECK2(clCreateProgramWithSource(context, 1, (const char **)&kernel_bin, + &kernel_size, &_err)); + if (program == NULL) { + cleanup(); + return -1; + } + + // build program + cl_int build_status = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); + + // create kernel + kernel = CL_CHECK2(clCreateKernel(program, "myGEMM2", &_err)); + + // check building info + size_t log_size = 0; + CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, + NULL, &log_size)); + char *log = (char *)malloc(log_size * sizeof(char)); + if (log == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, + log_size, log, NULL)); + if (log_size > 1) { + printf("Log:\n"); + printf("%s", log); + printf("\n"); + } + CL_CHECK(build_status); + + + // set kernel arguments + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &M)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &N)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &K)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&a_memobj)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&b_memobj)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&c_memobj)); + + // run kernel + const size_t local[2] = {TS, TS}; + const size_t global[2] = {M, N}; + printf("Execute the kernel\n"); + auto time_start = std::chrono::high_resolution_clock::now(); + CL_CHECK(clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global, local, + 0, NULL, NULL)); + CL_CHECK(clFinish(command_queue)); + //CL_CHECK(clWaitForEvents(1, &event)); + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast( + time_end - time_start) + .count(); + printf("Elapsed time: %lg ms\n", elapsed); + + // get results from VRAM + CL_CHECK(clEnqueueReadBuffer(command_queue, c_memobj, CL_TRUE, 0, + M * N * sizeof(float), C, 0, NULL, NULL)); + CL_CHECK(clFinish(command_queue)); + + // verify results + printf("Verify result\n"); + float *C_cpu = (float *)malloc(M * N * sizeof(float)); + if (C_cpu == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + sgemm_cpu(C_cpu, A, B, M, N, K); + int errors = 0; + + for (size_t i = 0; i < size_t(M * N); i++) + if (C_cpu[i] != C[i]) + errors++; + if (errors != 0) + printf("FAILED! - %d errors\n", errors); + else + printf("PASSED!\n"); + + // free resureses + cleanup(); + free(A); + free(B); + free(C); + free(log); + free(C_cpu); + return errors; +} \ No newline at end of file diff --git a/tests/opencl/kernel3/Makefile b/tests/opencl/kernel3/Makefile new file mode 100644 index 000000000..b7e0c3133 --- /dev/null +++ b/tests/opencl/kernel3/Makefile @@ -0,0 +1,20 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := kernel3 + +SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cc + +kernel.cl: $(SRC_DIR)/kernel.cl + cp $< $@ + +common.h: $(SRC_DIR)/common.h + cp $< $@ + +KERNEL_SRCS := kernel.cl common.h + +OPTS ?= + +include ../common.mk diff --git a/tests/opencl/kernel3/common.h b/tests/opencl/kernel3/common.h new file mode 100644 index 000000000..c922a82eb --- /dev/null +++ b/tests/opencl/kernel3/common.h @@ -0,0 +1,8 @@ +#ifndef COMMON_H +#define COMMON_H + +#define TS 4 +#define WPT 4 +#define RTS (TS/WPT) + +#endif // COMMON_H \ No newline at end of file diff --git a/tests/opencl/kernel3/kernel.cl b/tests/opencl/kernel3/kernel.cl new file mode 100644 index 000000000..fbc30034b --- /dev/null +++ b/tests/opencl/kernel3/kernel.cl @@ -0,0 +1,55 @@ +#include "common.h" + +__kernel void myGEMM3(const int M, const int N, const int K, + const __global float* A, + const __global float* B, + __global float* C) { + + // Thread identifiers + const int row = get_local_id(0); // Local row ID (max: TS) + const int col = get_local_id(1); // Local col ID (max: TS/WPT == RTS) + const int globalRow = TS*get_group_id(0) + row; // Row ID of C (0..M) + const int globalCol = TS*get_group_id(1) + col; // Col ID of C (0..N) + + // Local memory to fit a tile of TS*TS elements of A and B + __local float Asub[TS][TS]; + __local float Bsub[TS][TS]; + + // Initialise the accumulation registers + float acc[WPT]; + for (int w=0; w +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +int M = 256, N = 256, K = 256; + +static void show_usage() { + printf("Usage: [-M number of rows in first matrix] [-N number of columns in first matrix] [-K number of columns in first matrix and rows in second matrix] [-h: help]\n"); +} + +#define CL_CHECK(_expr) \ + do { \ + cl_int _err = _expr; \ + if (_err == CL_SUCCESS) \ + break; \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } while (0) + +#define CL_CHECK2(_expr) \ + ({ \ + cl_int _err = CL_INVALID_VALUE; \ + decltype(_expr) _ret = _expr; \ + if (_err != CL_SUCCESS) { \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } \ + _ret; \ + }) + +static int read_kernel_file(const char *filename, uint8_t **data, + size_t *size) { + if (NULL == filename || NULL == data || 0 == size) + return -1; + + FILE *fp = fopen(filename, "r"); + if (NULL == fp) { + fprintf(stderr, "Failed to load kernel."); + return -1; + } + + fseek(fp, 0, SEEK_END); + long fsize = ftell(fp); + rewind(fp); + + *data = (uint8_t *)malloc(fsize); + *size = fread(*data, 1, fsize, fp); + + fclose(fp); + + return 0; +} + +static void sgemm_cpu(float *C, const float *A, const float *B, int M, int N, + int K) { + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + float acc = 0; + for (int k = 0; k < K; ++k) { + acc += A[k * M + m] * B[n * K + k]; + } + C[n * M + m] = acc; + } + } +} + +cl_platform_id platform_id = NULL; +cl_device_id device_id = NULL; +cl_context context = NULL; +cl_command_queue command_queue = NULL; +cl_program program = NULL; +cl_kernel kernel = NULL; +cl_mem a_memobj = NULL; +cl_mem b_memobj = NULL; +cl_mem c_memobj = NULL; +uint8_t *kernel_bin = NULL; + +static void cleanup() { + if (command_queue) + clReleaseCommandQueue(command_queue); + if (kernel) + clReleaseKernel(kernel); + if (program) + clReleaseProgram(program); + if (a_memobj) + clReleaseMemObject(a_memobj); + if (b_memobj) + clReleaseMemObject(b_memobj); + if (c_memobj) + clReleaseMemObject(c_memobj); + if (context) + clReleaseContext(context); + if (device_id) + clReleaseDevice(device_id); + if (kernel_bin) + free(kernel_bin); +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:M:N:K:h?")) != -1) { + switch (c) { + case 'n': + M = N = K = atoi(optarg); + break; + case 'M': + M = atoi(optarg); + break; + case 'N': + N = atoi(optarg); + break; + case 'K': + K = atoi(optarg); + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } + + if (M < 2 || N < 2 || K < 2) { + printf("Error: invalid size!\n"); + exit(-1); + } +} + +int main(int argc, char **argv) { + // parse command arguments + parse_args(argc, argv); + + CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); + CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL)); + + // create context + cl_context_properties context_properties[]{ + CL_CONTEXT_PLATFORM, cl_context_properties(platform_id), 0}; + cl_device_id devices[]{device_id}; + context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err)); + + char device_string[1024]; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL); + printf("Using device: %s\n", device_string); + + // create command queue + command_queue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err)); + + // generate input data + float *A, *B, *C; + A = (float *)(malloc(M * K * sizeof(float))); + B = (float *)(malloc(N * K * sizeof(float))); + C = (float *)(malloc(M * N * sizeof(float))); + if (A == NULL || B == NULL || C == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + srand(time(NULL)); + for (int i = 0; i < M * K; i++) + A[i] = (int)((float)rand() / (float)RAND_MAX); + for (int i = 0; i < N * K; i++) + B[i] = (int)((float)rand() / (float)RAND_MAX); + + // create buffers + a_memobj = + CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + M * K * sizeof(float), A, &_err)); + b_memobj = + CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + N * K * sizeof(float), B, &_err)); + c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, + M * N * sizeof(float), NULL, &_err)); + + // load kernel text + size_t kernel_size; + if (read_kernel_file("kernel.cl", &kernel_bin, &kernel_size) != 0) { + cleanup(); + return -1; + } + program = CL_CHECK2(clCreateProgramWithSource(context, 1, (const char **)&kernel_bin, + &kernel_size, &_err)); + if (program == NULL) { + cleanup(); + return -1; + } + + // build program + cl_int build_status = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); + + // create kernel + kernel = CL_CHECK2(clCreateKernel(program, "myGEMM3", &_err)); + + // check building info + size_t log_size = 0; + CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, + NULL, &log_size)); + char *log = (char *)malloc(log_size * sizeof(char)); + if (log == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, + log_size, log, NULL)); + if (log_size > 1) { + printf("Log:\n"); + printf("%s", log); + printf("\n"); + } + CL_CHECK(build_status); + + + // set kernel arguments + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &M)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &N)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &K)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&a_memobj)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&b_memobj)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&c_memobj)); + + // run kernel + const size_t local[2] = {TS, TS / WPT}; + const size_t global[2] = {M, N / WPT}; + printf("Execute the kernel\n"); + auto time_start = std::chrono::high_resolution_clock::now(); + CL_CHECK(clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global, local, + 0, NULL, NULL)); + CL_CHECK(clFinish(command_queue)); + //CL_CHECK(clWaitForEvents(1, &event)); + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast( + time_end - time_start) + .count(); + printf("Elapsed time: %lg ms\n", elapsed); + + // get results from VRAM + CL_CHECK(clEnqueueReadBuffer(command_queue, c_memobj, CL_TRUE, 0, + M * N * sizeof(float), C, 0, NULL, NULL)); + CL_CHECK(clFinish(command_queue)); + + // verify results + printf("Verify result\n"); + float *C_cpu = (float *)malloc(M * N * sizeof(float)); + if (C_cpu == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + sgemm_cpu(C_cpu, A, B, M, N, K); + int errors = 0; + + for (size_t i = 0; i < size_t(M * N); i++) + if (C_cpu[i] != C[i]) + errors++; + if (errors != 0) + printf("FAILED! - %d errors\n", errors); + else + printf("PASSED!\n"); + + // free resureses + cleanup(); + free(A); + free(B); + free(C); + free(log); + free(C_cpu); + return errors; +} diff --git a/tests/opencl/kernel4/Makefile b/tests/opencl/kernel4/Makefile new file mode 100644 index 000000000..312ec638d --- /dev/null +++ b/tests/opencl/kernel4/Makefile @@ -0,0 +1,20 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := kernel4 + +SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cc + +kernel.cl: $(SRC_DIR)/kernel.cl + cp $< $@ + +common.h: $(SRC_DIR)/common.h + cp $< $@ + +KERNEL_SRCS := kernel.cl common.h + +OPTS ?= + +include ../common.mk diff --git a/tests/opencl/kernel4/common.h b/tests/opencl/kernel4/common.h new file mode 100644 index 000000000..3ed810585 --- /dev/null +++ b/tests/opencl/kernel4/common.h @@ -0,0 +1,7 @@ +#ifndef COMMON_H +#define COMMON_H + +#define TS 8 +#define WIDTH 4 + +#endif // COMMON_H \ No newline at end of file diff --git a/tests/opencl/kernel4/kernel.cl b/tests/opencl/kernel4/kernel.cl new file mode 100644 index 000000000..179789d97 --- /dev/null +++ b/tests/opencl/kernel4/kernel.cl @@ -0,0 +1,110 @@ +#include "common.h" + +#if WIDTH == 1 + typedef float floatX; +#elif WIDTH == 2 + typedef float2 floatX; +#elif WIDTH == 4 + typedef float4 floatX; +#elif WIDTH == 8 + typedef float8 floatX; +#endif + +__kernel void myGEMM4(const int M, const int N, const int K, + const __global floatX* A, + const __global floatX* B, + __global floatX* C) { + + // Thread identifiers + const int row = get_local_id(0); // Local row ID (max: TS/WIDTH) + const int col = get_local_id(1); // Local col ID (max: TS) + const int globalRow = (TS/WIDTH)*get_group_id(0) + row; // Row ID of C (0..M/WIDTH) + const int globalCol = TS*get_group_id(1) + col; // Col ID of C (0..N) + + // Local memory to fit a tile of TS*TS elements of A and B + __local floatX Asub[TS][TS/WIDTH]; + __local floatX Bsub[TS][TS/WIDTH]; + + // Initialise the accumulation registers + #if WIDTH == 1 + floatX acc = 0.0f; + #elif WIDTH == 2 + floatX acc = { 0.0f, 0.0f }; + #elif WIDTH == 4 + floatX acc = { 0.0f, 0.0f, 0.0f, 0.0f }; + #elif WIDTH == 8 + floatX acc = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; + #endif + + // Loop over all tiles + const int numTiles = K/TS; + for (int tile=0; tile +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +int M = 256, N = 256, K = 256; + +static void show_usage() { + printf("Usage: [-M number of rows in first matrix] [-N number of columns in first matrix] [-K number of columns in first matrix and rows in second matrix] [-h: help]\n"); +} + +#define CL_CHECK(_expr) \ + do { \ + cl_int _err = _expr; \ + if (_err == CL_SUCCESS) \ + break; \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } while (0) + +#define CL_CHECK2(_expr) \ + ({ \ + cl_int _err = CL_INVALID_VALUE; \ + decltype(_expr) _ret = _expr; \ + if (_err != CL_SUCCESS) { \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } \ + _ret; \ + }) + +static int read_kernel_file(const char *filename, uint8_t **data, + size_t *size) { + if (NULL == filename || NULL == data || 0 == size) + return -1; + + FILE *fp = fopen(filename, "r"); + if (NULL == fp) { + fprintf(stderr, "Failed to load kernel."); + return -1; + } + + fseek(fp, 0, SEEK_END); + long fsize = ftell(fp); + rewind(fp); + + *data = (uint8_t *)malloc(fsize); + *size = fread(*data, 1, fsize, fp); + + fclose(fp); + + return 0; +} + +static void sgemm_cpu(float *C, const float *A, const float *B, int M, int N, + int K) { + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + float acc = 0; + for (int k = 0; k < K; ++k) { + acc += A[k * M + m] * B[n * K + k]; + } + C[n * M + m] = acc; + } + } +} + +cl_platform_id platform_id = NULL; +cl_device_id device_id = NULL; +cl_context context = NULL; +cl_command_queue command_queue = NULL; +cl_program program = NULL; +cl_kernel kernel = NULL; +cl_mem a_memobj = NULL; +cl_mem b_memobj = NULL; +cl_mem c_memobj = NULL; +uint8_t *kernel_bin = NULL; + +static void cleanup() { + if (command_queue) + clReleaseCommandQueue(command_queue); + if (kernel) + clReleaseKernel(kernel); + if (program) + clReleaseProgram(program); + if (a_memobj) + clReleaseMemObject(a_memobj); + if (b_memobj) + clReleaseMemObject(b_memobj); + if (c_memobj) + clReleaseMemObject(c_memobj); + if (context) + clReleaseContext(context); + if (device_id) + clReleaseDevice(device_id); + if (kernel_bin) + free(kernel_bin); +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:M:N:K:h?")) != -1) { + switch (c) { + case 'n': + M = N = K = atoi(optarg); + break; + case 'M': + M = atoi(optarg); + break; + case 'N': + N = atoi(optarg); + break; + case 'K': + K = atoi(optarg); + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } + + if (M < 2 || N < 2 || K < 2) { + printf("Error: invalid size!\n"); + exit(-1); + } +} + +int main(int argc, char **argv) { + // parse command arguments + parse_args(argc, argv); + + CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); + CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL)); + + // create context + cl_context_properties context_properties[]{ + CL_CONTEXT_PLATFORM, cl_context_properties(platform_id), 0}; + cl_device_id devices[]{device_id}; + context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err)); + + char device_string[1024]; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL); + printf("Using device: %s\n", device_string); + + // create command queue + command_queue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err)); + + // generate input data + float *A, *B, *C; + A = (float *)(malloc(M * K * sizeof(float))); + B = (float *)(malloc(N * K * sizeof(float))); + C = (float *)(malloc(M * N * sizeof(float))); + if (A == NULL || B == NULL || C == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + srand(time(NULL)); + for (int i = 0; i < M * K; i++) + A[i] = (int)((float)rand() / (float)RAND_MAX); + for (int i = 0; i < N * K; i++) + B[i] = (int)((float)rand() / (float)RAND_MAX); + + // create buffers + a_memobj = + CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + M * K * sizeof(float), A, &_err)); + b_memobj = + CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + N * K * sizeof(float), B, &_err)); + c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, + M * N * sizeof(float), NULL, &_err)); + + // load kernel text + size_t kernel_size; + if (read_kernel_file("kernel.cl", &kernel_bin, &kernel_size) != 0) { + cleanup(); + return -1; + } + program = CL_CHECK2(clCreateProgramWithSource(context, 1, (const char **)&kernel_bin, + &kernel_size, &_err)); + if (program == NULL) { + cleanup(); + return -1; + } + + // build program + cl_int build_status = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); + + // create kernel + kernel = CL_CHECK2(clCreateKernel(program, "myGEMM4", &_err)); + + // check building info + size_t log_size = 0; + CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, + NULL, &log_size)); + char *log = (char *)malloc(log_size * sizeof(char)); + if (log == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + CL_CHECK(clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, + log_size, log, NULL)); + if (log_size > 1) { + printf("Log:\n"); + printf("%s", log); + printf("\n"); + } + CL_CHECK(build_status); + + + // set kernel arguments + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &M)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &N)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &K)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&a_memobj)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&b_memobj)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&c_memobj)); + + // run kernel + const size_t local[2] = {TS / WIDTH, TS}; + const size_t global[2] = {M / WIDTH, N}; + printf("Execute the kernel\n"); + auto time_start = std::chrono::high_resolution_clock::now(); + CL_CHECK(clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global, local, + 0, NULL, NULL)); + CL_CHECK(clFinish(command_queue)); + //CL_CHECK(clWaitForEvents(1, &event)); + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast( + time_end - time_start) + .count(); + printf("Elapsed time: %lg ms\n", elapsed); + + // get results from VRAM + CL_CHECK(clEnqueueReadBuffer(command_queue, c_memobj, CL_TRUE, 0, + M * N * sizeof(float), C, 0, NULL, NULL)); + CL_CHECK(clFinish(command_queue)); + + // verify results + printf("Verify result\n"); + float *C_cpu = (float *)malloc(M * N * sizeof(float)); + if (C_cpu == NULL) { + printf("Not enough memory"); + cleanup(); + return -1; + } + sgemm_cpu(C_cpu, A, B, M, N, K); + int errors = 0; + + for (size_t i = 0; i < size_t(M * N); i++) + if (C_cpu[i] != C[i]) + errors++; + if (errors != 0) + printf("FAILED! - %d errors\n", errors); + else + printf("PASSED!\n"); + + // free resureses + cleanup(); + free(A); + free(B); + free(C); + free(log); + free(C_cpu); + return errors; +}