diff --git a/.flake8 b/.flake8
index af332dd..0d726f9 100755
--- a/.flake8
+++ b/.flake8
@@ -8,3 +8,4 @@ max-line-length = 119
 exclude =
     .venv/**
     build/**
+    benchmarks/**
\ No newline at end of file
diff --git a/benchmarks/src/README.md b/benchmarks/src/README.md
new file mode 100644
index 0000000..ae84c4e
--- /dev/null
+++ b/benchmarks/src/README.md
@@ -0,0 +1,36 @@
+Benchmarks
+===========
+
+## Setting up the environment
+
+```sh
+    python -m pip install -r requirements.txt
+```
+
+## Benchmark parameters
+
+The benchmark packages, rounds, array sizes, and numeric type may be specified via the constants at the top of [pytest_benchmark/common.py](pytest_benchmark/common.py).
+
+Alternatively, they may be specified individually at the top of each test file.
+
+
+## Running
+
+These are the steps to run the benchmarks and produce the graphs.
+
+Run the benchmarks and store the results in `results.json`:
+```sh
+    pytest ./pytest_benchmark --benchmark-json=results.json
+```
+
+After `results.json` has been created, generate the graphs and store the timing results by running:
+```sh
+    python graphs.py
+```
+
+To modify the tests being shown, modify the `TESTS` list at the top of the `graphs.py` file.
+To modify the labels shown, modify `PKG_LABELS`.
+To modify the hardware string shown in the graph title, modify `HARDWARE`.
+
+## Notes
+When running with `dpnp`, set the environment variable `DPNP_RAISE_EXCEPION_ON_NUMPY_FALLBACK` to `0`.
\ No newline at end of file
diff --git a/benchmarks/src/graphs.py b/benchmarks/src/graphs.py
new file mode 100644
index 0000000..1b2e4e4
--- /dev/null
+++ b/benchmarks/src/graphs.py
@@ -0,0 +1,223 @@
+import json
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+
+BENCHMARKS_JSON = "results.json"
+
+# Hardware details shown in the graph title
+HARDWARE = "AMD Ryzen 9 9900X 12-Core Processor 63032 MB (fp64 fp16)\noneAPI 2025.1.3 Intel(R) OpenCL Graphics: Intel(R) Arc(TM) B580 Graphics, 11873 MB (fp64 fp16)"
+
+# Show speedup numbers on the bars
+SHOW_NUMBERS = True
+
+# Round speedups to this many digits after the decimal point
+ROUND_NUMBERS = 1
+
+# package list in graph order; arrayfire packages are added later
+PKG_NAMES = ["numpy", "dpnp", "cupy"]
+
+# colors used in graphs
+PKG_COLOR = {
+    "numpy": "tab:blue",
+    "cupy": "tab:green",
+    "dpnp": "tab:red",
+    "afcpu": "tab:orange",
+    "afopencl": "tab:orange",
+    "afcuda": "tab:orange",
+    "afoneapi": "tab:orange",
+}
+
+# labels displayed in the graph
+PKG_LABELS = {
+    "numpy": "numpy[cpu]",
+    "dpnp": "dpnp[level_zero:gpu]",
+    "cupy": "cupy",
+    "afcpu": "afcpu",
+    "afcuda": "afcuda",
+    "afopencl": "afopencl[opencl:gpu]",
+    "afoneapi": "afoneapi[opencl:gpu]",
+}
+
+AFBACKENDS = ["afcpu", "afcuda", "afopencl", "afoneapi"]
+
+# Tests to be shown in graphs
+TESTS = [
+    "qr",
+    "neural_network",
+    "gemm",
+    "mandelbrot",
+    "nbody",
+    "pi",
+    "black_scholes",
+    "fft",
+    "normal",
+    "group_elementwise",
+    # Other tests
+    # 'svd',
+    # 'cholesky',
+    # 'det',
+    # 'norm',
+    # 'uniform',
+    # 'inv'
+]
+
+
+def get_benchmark_data():
+    results = {}
+    descriptions = {}
+    with open(BENCHMARKS_JSON) as f:
+        js = json.load(f)
+        for bench in js["benchmarks"]:
+            test_name = bench["name"]
+            test_name = test_name[test_name.find("_") + 1 : test_name.find("[")]
+
+            key = bench["param"]
+            val = bench["stats"]["ops"]
+
+            if len(bench["extra_info"]) != 0 and test_name not in descriptions:
+                descriptions[test_name] = bench["extra_info"]["description"]
+
+            if test_name not in results:
+                results[test_name] = {key: val}
+            else:
+                results[test_name][key] = val
+
+    return results, descriptions
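+
+
+# Illustrative sketch (not part of the original script): each bar in the grouped
+# graph is the speedup of a package over numpy for one test, derived from the
+# "ops" (operations per second) values collected above. A hypothetical helper:
+#
+#   def speedup(results, test, pkg):
+#       return results[test][pkg] / results[test]["numpy"]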
+
+
+def create_graph(test_name, test_results):
+    names = []
+    values = []
+    for name in test_results:
+        names.append(name)
+        values.append(test_results[name])
+
+    plt.bar(names, values)
+    plt.title(test_name)
+
+    plt.savefig("img/" + test_name + ".png")
+    plt.close()
+
+
+def generate_individual_graphs():
+    results, _ = get_benchmark_data()
+
+    for test in results:
+        create_graph(test, results[test])
+
+
+# Store the mean timing results (seconds) in a CSV file
+def store_csv():
+    data_dict = {}
+    data_dict["Test(seconds)"] = []
+    results = {}
+    for pkg in PKG_LABELS.keys():
+        data_dict[pkg] = []
+        results[pkg] = {}
+
+    with open(BENCHMARKS_JSON) as f:
+        js = json.load(f)
+        for bench in js["benchmarks"]:
+            test_name = bench["name"]
+            test_name = test_name[test_name.find("_") + 1 : test_name.find("[")]
+
+            pkg = bench["param"]
+            time = bench["stats"]["mean"]
+
+            if test_name not in data_dict["Test(seconds)"]:
+                data_dict["Test(seconds)"].append(test_name)
+
+            results[pkg][test_name] = time
+
+    for test in data_dict["Test(seconds)"]:
+        for pkg in PKG_LABELS.keys():
+            if test in results[pkg]:
+                data_dict[pkg].append(results[pkg][test])
+            else:
+                data_dict[pkg].append(np.nan)
+
+    df = pd.DataFrame(data_dict)
+    df.to_csv("summary.csv")
+
+
+def generate_group_graph(test_list=None, show_numbers=False, filename="comparison"):
+    results, descriptions = get_benchmark_data()
+
+    width = 1 / (1 + len(PKG_NAMES))
+    multiplier = 0
+
+    tests = test_list if test_list else list(results.keys())
+
+    tests_values = {}
+    x = np.arange(len(tests))
+
+    for name in PKG_NAMES:
+        tests_values[name] = []
+
+    max_val = 1
+    for test in tests:
+        base_value = results[test]["numpy"]
+        for name in PKG_NAMES:
+            if name in results[test]:
+                val = results[test][name] / base_value
+
+                if ROUND_NUMBERS:
+                    val = round(val, ROUND_NUMBERS)
+
+                if max_val < val:
+                    max_val = val
+
+                tests_values[name].append(val)
+            else:
+                tests_values[name].append(np.nan)
+
+    fig, ax = plt.subplots(layout="constrained")
+
+    for name in PKG_NAMES:
+        offset = width * multiplier
+        rects = ax.barh(x + offset, tests_values[name], width, label=PKG_LABELS[name], color=PKG_COLOR[name])
+
+        if show_numbers:
+            ax.bar_label(rects, padding=3, rotation=0)
+        multiplier += 1
+
+    # the bars are horizontal, so the test names label the y axis
+    ylabels = []
+    for test in tests:
+        ylabels.append(test + "\n" + descriptions[test])
+
+    ax.set_xlabel("Speedup")
+    ax.set_xscale("log")
+    ax.set_title(f"Runtime Comparison\n{HARDWARE}")
+    ax.set_yticks(x + width, ylabels, rotation=0)
+    xmin, xmax = ax.get_xlim()
+    ax.set_xlim(xmin, xmax * 2)
+
+    ax.legend(loc="lower right", ncols=len(PKG_NAMES))
+    fig.set_figheight(8)
+    fig.set_figwidth(13)
+    fig.savefig(f"img/{filename}.png")
+    plt.show()
+
+
+def main():
+    store_csv()
+    for backend in AFBACKENDS:
+        try:
+            filename = f"comparison_{backend}"
+            if backend not in PKG_NAMES:
+                PKG_NAMES.insert(1, backend)
+            generate_group_graph(TESTS, SHOW_NUMBERS, filename)
+            PKG_NAMES.remove(backend)
+        except Exception as e:
+            print(e)
+            print("No data for", backend)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/src/pytest_benchmark/common.py b/benchmarks/src/pytest_benchmark/common.py
new file mode 100644
index 0000000..9e82a47
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/common.py
@@ -0,0 +1,105 @@
+# cython: language_level=3
+# -*- coding: utf-8 -*-
+# *****************************************************************************
+# Copyright (c) 2016-2024, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import gc
+import math
+
+import cupy
+import dpctl
+import dpnp
+import numpy as np
+import pytest
+
+import arrayfire as af
+
+# modify parameters for most benchmarks
+ROUNDS = 30
+NSIZE = 2**13
+NNSIZE = NSIZE**2
+DTYPE = "float32"
+
+# Comment out a line to remove that package from testing
+PKGDICT = {
+    "dpnp": dpnp,
+    "numpy": np,
+    "cupy": cupy,
+    # "afcpu": af,
+    "afopencl": af,
+    "afcuda": af,
+    "afoneapi": af,
+}
+
+PKGS = []
+IDS = []
+
+for key, value in PKGDICT.items():
+    IDS.append(key)
+    PKGS.append(value)
+
+
+# Initialize packages and clean up memory before each round
+def initialize_package(PKG_ID):
+    try:
+        af.device_gc()
+        mempool = cupy.get_default_memory_pool()
+        mempool.free_all_blocks()
+    except Exception:
+        # cleanup is best-effort; it may fail when a device or backend is unavailable
+        pass
+
+    if PKG_ID == "afcpu":
+        af.set_backend(af.BackendType.cpu)
+        af.device_gc()
+        af.info()
+    elif PKG_ID == "afopencl":
+        af.set_backend(af.BackendType.opencl)
+        af.device_gc()
+        af.info()
+    elif PKG_ID == "afcuda":
+        af.set_backend(af.BackendType.cuda)
+        af.device_gc()
+        af.info()
+    elif PKG_ID == "afoneapi":
+        af.set_backend(af.BackendType.oneapi)
+        af.device_gc()
+        af.info()
+    elif PKG_ID == "numpy":
+        np.random.seed(0)
+    elif PKG_ID == "dpnp":
+        dpnp.random.seed(0)
+        print(dpctl.get_devices()[0])
+    elif PKG_ID == "cupy":
+        cupy.random.seed(0)
+        print(cupy.cuda.Device())
+        mempool = cupy.get_default_memory_pool()
+        mempool.free_all_blocks()
+    else:
+        raise NotImplementedError()
+
+    # Free all unused memory
+    gc.collect()
diff --git a/benchmarks/src/pytest_benchmark/sgemm.cu b/benchmarks/src/pytest_benchmark/sgemm.cu
new file mode 100644
index 0000000..9e4791f
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/sgemm.cu
@@ -0,0 +1,200 @@
+/*
+Original works by:
+--------------------------------------------------------
+MAGMA
+Copyright (c) 2017 The University of Tennessee. All rights reserved.
+Licensed under modified BSD license +*/ + + +// These parameters will be determined by utils.read_code +//#define DIM_X ${DIM_X} +//#define DIM_Y ${DIM_Y} +//#define BLK_M ${BLK_M} +//#define BLK_N ${BLK_N} +//#define BLK_K ${BLK_K} +//#define DIM_XA ${DIM_XA} +//#define DIM_YA ${DIM_YA} +//#define DIM_XB ${DIM_XB} +//#define DIM_YB ${DIM_YB} +//#define THR_N ${THR_N} +//#define THR_M ${THR_M} + +#define fetch(arr, col, m, n, bound) arr[min(n*col + m, bound)] + + +extern "C" __global__ +void sgemm( + int M, int N, int K, + const float* A, + const float* B, + float * C) +{ + int idx = threadIdx.x; + int idy = threadIdx.y; + + int idt = DIM_X * idy + idx; + + int idxA = idt % DIM_XA; + int idyA = idt / DIM_XA; + + int idxB = idt % DIM_XB; + int idyB = idt / DIM_XB; + + int blx = blockIdx.x; + int bly = blockIdx.y; + + __shared__ float sA[BLK_K][BLK_M + 1]; + __shared__ float sB[BLK_N][BLK_K + 1]; + + // registers for the innermost loop + float rC[THR_N][THR_M]; + float rA[THR_M]; + float rB[THR_N]; + + float ra[BLK_K / DIM_YA][BLK_M / DIM_XA]; + float rb[BLK_N / DIM_YB][BLK_K / DIM_XB]; + + const float* offs_dA = A + blx * BLK_M + idyA * M + idxA; + int boundA = (M * (K - 1) + M) - (blx * BLK_M + idyA * M + idxA) - 1; + const float* offs_dB = B + bly * BLK_N * K + idyB * K + idxB; + int boundB = (K * (N - 1) + K) - (bly * BLK_N * K + idyB * K + idxB) - 1; + + int m, n, k, kk; + + #pragma unroll + for (n = 0; n < THR_N; n++) { + #pragma unroll + for (m = 0 ; m < THR_M; m++) { + rC[n][m] = 0; + } + } + + // blockwise transpose to transpose load + #pragma unroll + for (n = 0; n < BLK_K; n += DIM_YA) { + #pragma unroll + for (m = 0; m < BLK_M; m += DIM_XA) { + sA[n + idyA][m + idxA] = fetch(offs_dA, M, m, n, boundA); + } + } + // blockwise transpose to transpose load + #pragma unroll + for (n = 0; n < BLK_N; n += DIM_YB) { + #pragma unroll + for (m = 0; m < BLK_K; m += DIM_XB) { + sB[n + idyB][m + idxB] = fetch(offs_dB, K, m, n, boundB); + } + } + __syncthreads(); + + for (kk = 0; kk < K - BLK_K; kk += BLK_K) + { + offs_dA += BLK_K * M; + boundA -= BLK_K * M; + offs_dB += BLK_K; + boundB -= BLK_K; + + #pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) { + #pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) { + ra[n][m] = fetch(offs_dA, M, m * DIM_XA, n * DIM_YA, boundA); + } + } + + #pragma unroll + for (n = 0; n < BLK_N / DIM_YB; n++) { + #pragma unroll + for (m = 0; m < BLK_K / DIM_XB; m++) { + rb[n][m] = fetch(offs_dB, K, m * DIM_XB, n * DIM_YB, boundB); + } + } + + // multiply + #pragma unroll + for (k = 0; k < BLK_K; k++) + { + #pragma unroll + for (m = 0; m < THR_M; m++) { + rA[m] = sA[k][m * DIM_X + idx]; + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + rB[n] = sB[n * DIM_Y + idy][k]; + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + #pragma unroll + for (m = 0; m < THR_M; m++) { + rC[n][m] += rA[m] * rB[n]; + } + } + } + __syncthreads(); + + // store A regs->smem + #pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) + { + #pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) + { + sA[n * DIM_YA + idyA][m * DIM_XA + idxA] = ra[n][m]; + } + } + + #pragma unroll + for (n = 0; n < BLK_N / DIM_YB; n++) + { + #pragma unroll + for (m = 0; m < BLK_K / DIM_XB; m++) + { + sB[n * DIM_YB + idyB][m * DIM_XB + idxB] = rb[n][m]; + } + } + __syncthreads(); + } + + // Multiply last full (BLK_K) or partial block of columns of A and + // rows of B. 
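+    // (Note: after the main loop above, kk = K - kk below is the number of
+    // remaining columns to process: K % BLK_K, or BLK_K when BLK_K divides
+    // K evenly.)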
+ // It's okay that m,n exceed matrix bounds as all work is in registers + // or shared memory, and out-of-bounds rC[n][m] will not be saved later. + + kk = K - kk; + #pragma unroll + for (k = 0; k < kk; k++) + { + #pragma unroll + for (m = 0; m < THR_M; m++) { + rA[m] = sA[k][m * DIM_X + idx]; + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + rB[n] = sB[n * DIM_Y + idy][k]; + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + #pragma unroll + for (m = 0; m < THR_M; m++) { + rC[n][m] += rA[m] * rB[n]; + } + } + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + int coord_dCn = bly * BLK_N + n * DIM_Y + idy; + #pragma unroll + for (m = 0; m < THR_M; m++) { + int coord_dCm = blx * BLK_M + m * DIM_X + idx; + if (coord_dCm < M && coord_dCn < N) { + C[coord_dCn * M + coord_dCm] = rC[n][m]; + } + } + } +} \ No newline at end of file diff --git a/benchmarks/src/pytest_benchmark/test_blackscholes.py b/benchmarks/src/pytest_benchmark/test_blackscholes.py new file mode 100644 index 0000000..fae2fc5 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_blackscholes.py @@ -0,0 +1,149 @@ +#!/usr/bin/python + +####################################################### +# Copyright (c) 2015, ArrayFire +# All rights reserved. +# +# This file is distributed under 3-clause BSD license. +# The complete license agreement can be obtained at: +# http://arrayfire.com/licenses/BSD-3-Clause +######################################################## + +from common import * + +ITERATIONS = 1 + +sqrt2 = math.sqrt(2.0) + + +@pytest.mark.parametrize("pkgid", IDS, ids=IDS) +class TestBlackScholes: + def test_black_scholes(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 5), {}) + pkg = PKGDICT[pkgid] + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + result = benchmark.pedantic(target=FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS) + + +def black_scholes_numpy(S, X, R, V, T): + # S = Underlying stock price + # X = Strike Price + # R = Risk free rate of interest + # V = Volatility + # T = Time to maturity + def cnd(x): + temp = x > 0 + erf = lambda arr: np.exp(-arr * arr) + return temp * (0.5 + erf(x / sqrt2) / 2) + (1 - temp) * (0.5 - erf((-x) / sqrt2) / 2) + + d1 = np.log(S / X) + d1 = d1 + (R + (V * V) * 0.5) * T + d1 = d1 / (V * np.sqrt(T)) + + d2 = d1 - (V * np.sqrt(T)) + cnd_d1 = cnd(d1) + cnd_d2 = cnd(d2) + + C = S * cnd_d1 - (X * np.exp((-R) * T) * cnd_d2) + P = X * np.exp((-R) * T) * (1 - cnd_d2) - (S * (1 - cnd_d1)) + + return (C, P) + + +def black_scholes_dpnp(S, X, R, V, T): + def cnd(x): + temp = x > 0 + return temp * (0.5 + dpnp.erf(x / sqrt2) / 2) + (1 - temp) * (0.5 - dpnp.erf((-x) / sqrt2) / 2) + + d1 = dpnp.log(S / X) + d1 = d1 + (R + (V * V) * 0.5) * T + d1 = d1 / (V * dpnp.sqrt(T)) + + d2 = d1 - (V * dpnp.sqrt(T)) + cnd_d1 = cnd(d1) + cnd_d2 = cnd(d2) + + C = S * cnd_d1 - (X * dpnp.exp((-R) * T) * cnd_d2) + P = X * dpnp.exp((-R) * T) * (1 - cnd_d2) - (S * (1 - cnd_d1)) + + return (C, P) + + +def black_scholes_cupy(S, X, R, V, T): + def cnd(x): + temp = x > 0 + erf = lambda arr: cupy.exp(-arr * arr) + return temp * (0.5 + erf(x / sqrt2) / 2) + (1 - temp) * (0.5 - erf((-x) / sqrt2) / 2) + + d1 = cupy.log(S / X) + d1 = d1 + (R + (V * V) * 0.5) * T + d1 = d1 / (V * cupy.sqrt(T)) + + d2 = d1 - (V * cupy.sqrt(T)) + cnd_d1 = cnd(d1) + cnd_d2 = cnd(d2) + + C = S * cnd_d1 - (X * cupy.exp((-R) * T) * cnd_d2) + P = X * cupy.exp((-R) * T) * (1 - cnd_d2) - (S * (1 - cnd_d1)) + + 
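+    # Note: like the NumPy version above, cnd() here uses exp(-x*x) as a cheap
+    # stand-in for erf, presumably so packages without a vectorized erf still
+    # do comparable work. CuPy launches kernels asynchronously, so synchronize
+    # before returning to make sure the timed work has actually finished.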
cupy.cuda.runtime.deviceSynchronize() + + return (C, P) + + +def black_scholes_arrayfire(S, X, R, V, T): + def cnd(x): + temp = x > 0 + return temp * (0.5 + af.erf(x / sqrt2) / 2) + (1 - temp) * (0.5 - af.erf((-x) / sqrt2) / 2) + + d1 = af.log(S / X) + d1 = d1 + (R + (V * V) * 0.5) * T + d1 = d1 / (V * af.sqrt(T)) + + d2 = d1 - (V * af.sqrt(T)) + cnd_d1 = cnd(d1) + cnd_d2 = cnd(d2) + + C = S * cnd_d1 - (X * af.exp((-R) * T) * cnd_d2) + P = X * af.exp((-R) * T) * (1 - cnd_d2) - (S * (1 - cnd_d1)) + + af.eval(C) + af.eval(P) + af.sync() + + return (C, P) + + +def generate_arrays(pkgid, count): + arr_list = [] + pkg = PKGDICT[pkgid] + pkg = pkg.__name__ + if "cupy" == pkg: + for i in range(count): + arr_list.append(cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE)) + cupy.cuda.runtime.deviceSynchronize() + elif "arrayfire" == pkg: + af.device_gc() + for i in range(count): + x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE)) + af.eval(x) + arr_list.append(x) + af.sync() + elif "dpnp" == pkg: + for i in range(count): + arr_list.append(dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE)) + elif "numpy" == pkg: + for i in range(count): + arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE)) + + return arr_list + + +FUNCS = { + "dpnp": black_scholes_dpnp, + "numpy": black_scholes_numpy, + "cupy": black_scholes_cupy, + "arrayfire": black_scholes_arrayfire, +} diff --git a/benchmarks/src/pytest_benchmark/test_elementwise.py b/benchmarks/src/pytest_benchmark/test_elementwise.py new file mode 100644 index 0000000..0933ce6 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_elementwise.py @@ -0,0 +1,316 @@ +# cython: language_level=3 +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2016-2024, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from common import * + +ITERATIONS = 1 + + +@pytest.mark.parametrize("pkgid", IDS, ids=IDS) +class TestElementwise: + def test_group_elementwise(self, benchmark, pkgid): + initialize_package(pkgid) + + setup = lambda: ([generate_arrays(pkgid, 1)[0] / (NSIZE * NSIZE)], {}) + pkg = PKGDICT[pkgid] + + def func(arr): + return pkg.exp(pkg.cos(pkg.sinh(arr))) + pkg.cbrt(pkg.log(arr) * pkg.expm1(-pkg.sqrt(arr))) + + def func_af(arr): + x = af.exp(af.cos(af.sinh(arr))) + af.cbrt(af.log(arr) * af.expm1(-af.sqrt(arr))) + af.eval(x) + af.sync() + return x + + def func_cupy(arr): + x = pkg.exp(pkg.cos(pkg.sinh(arr))) + pkg.cbrt(pkg.log(arr) * pkg.expm1(-pkg.sqrt(arr))) + cupy.cuda.runtime.deviceSynchronize() + return x + + GROUP_FUNCS = {"numpy": func, "cupy": func_cupy, "arrayfire": func_af, "dpnp": func} + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + result = benchmark.pedantic( + target=GROUP_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS + ) + + """ + def test_arccos(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.acos, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arccosh(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.acosh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arcsin(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.asin, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arcsinh(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.asinh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arctan(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.atan, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arctanh(self, benchmark, pkg): + setup = lambda: ([(generate_arrays(pkg, 1)[0] - 1) / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.atanh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_cbrt(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.cbrt, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_cos(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.cos, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_cosh(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.cosh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_sin(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.sin, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_sinh(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.sinh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_degrees(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = 
benchmark.pedantic( + target=pkg.degrees, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_exp(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.exp, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_exp2(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.exp2, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_expm1(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.expm1, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_log(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.log, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_log10(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.log10, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_log1p(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.log1p, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_sqrt(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.sqrt, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_square(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.square, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_tan(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.tan, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_tanh(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.tanh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_reciprocal(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.reciprocal, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + """ + + +def generate_arrays(pkgid, count): + arr_list = [] + pkg = PKGDICT[pkgid] + pkg = pkg.__name__ + if "cupy" == pkg: + for i in range(count): + arr_list.append(cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE)) + cupy.cuda.runtime.deviceSynchronize() + elif "arrayfire" == pkg: + for i in range(count): + x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE)) + af.eval(x) + arr_list.append(x) + af.sync() + elif "dpnp" == pkg: + for i in range(count): + arr_list.append(dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE)) + elif "numpy" == pkg: + for i in range(count): + arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE)) + + return arr_list diff --git a/benchmarks/src/pytest_benchmark/test_fft.py b/benchmarks/src/pytest_benchmark/test_fft.py new file mode 100644 index 0000000..096573d --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_fft.py @@ -0,0 +1,90 @@ +# cython: language_level=3 +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2016-2024, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from common import * + +ITERATIONS = 1 + + +def generate_arrays(pkgid, count): + arr_list = [] + pkg = PKGDICT[pkgid] + pkg = pkg.__name__ + if "cupy" == pkg: + for i in range(count): + arr_list.append(cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE)) + cupy.cuda.runtime.deviceSynchronize() + elif "arrayfire" == pkg: + for i in range(count): + x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE)) + af.eval(x) + arr_list.append(x) + af.sync() + elif "dpnp" == pkg: + for i in range(count): + arr_list.append(dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE)) + elif "numpy" == pkg: + for i in range(count): + arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE)) + + return arr_list + + +@pytest.mark.parametrize("pkgid", IDS, ids=IDS) +class TestFFT: + def test_fft(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + pkg = PKGDICT[pkgid] + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + result = benchmark.pedantic(target=FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS) + + +def fft_af(arr): + res = af.fft(arr) + af.eval(res) + af.sync() + + return res + + +def fft_np(arr): + return np.fft.fft(arr) + + +def fft_dpnp(arr): + return dpnp.fft.fft(arr) + + +def fft_cupy(arr): + res = cupy.fft.fft(arr) + cupy.cuda.runtime.deviceSynchronize() + return res + + +FUNCS = {"dpnp": fft_dpnp, "numpy": fft_np, "cupy": fft_cupy, "arrayfire": fft_af} diff --git a/benchmarks/src/pytest_benchmark/test_gemm.py b/benchmarks/src/pytest_benchmark/test_gemm.py new file mode 100644 index 0000000..e62d5a1 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_gemm.py @@ -0,0 +1,121 @@ +import os + +from common import * + +ITERATIONS = 1 + +alpha = 0.4 +beta = 0.6 + +dim_x = 16 +dim_y = 16 +blk_m = 64 +blk_n = 64 +blk_k = 4 +dim_xa = 64 +dim_ya = 4 +dim_xb = 4 +dim_yb = 64 +assert dim_x * dim_y == dim_xa * dim_ya == dim_xb * dim_yb +config = { + "DIM_X": dim_x, + "DIM_Y": dim_y, + "BLK_M": blk_m, + "BLK_N": blk_n, + "BLK_K": blk_k, + "DIM_XA": dim_xa, + "DIM_YA": dim_ya, + "DIM_XB": dim_xb, + "DIM_YB": dim_yb, + "THR_M": blk_m // dim_x, + "THR_N": blk_n // dim_y, +} + + +def 
create_cupy_kernel(params):
+    sgemm_file = os.path.join(os.path.dirname(__file__), "sgemm.cu")
+    with open(sgemm_file, "r") as f:
+        code = f.read()
+    # prepend the tuning parameters as #defines expected by the .cu source
+    for k, v in params.items():
+        code = "#define " + k + " " + str(v) + "\n" + code
+
+    return cupy.RawKernel(code, "sgemm")
+
+
+kern = create_cupy_kernel(config)
+
+
+@pytest.mark.parametrize("pkgid", IDS, ids=IDS)
+class TestGemm:
+    def test_gemm(self, benchmark, pkgid):
+        pkg = PKGDICT[pkgid]
+        initialize_package(pkgid)
+
+        setup = lambda: (generate_arrays(pkgid, 3), {})
+
+        benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix"
+        result = benchmark.pedantic(target=FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS)
+
+
+def generate_arrays(pkgid, count):
+    arr_list = []
+    pkg = PKGDICT[pkgid]
+    pkg = pkg.__name__
+    if "cupy" == pkg:
+        cupy.random.seed(1)
+        for i in range(count):
+            arr_list.append(cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE))
+        cupy.cuda.runtime.deviceSynchronize()
+    elif "arrayfire" == pkg:
+        for i in range(count):
+            x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE))
+            af.eval(x)
+            arr_list.append(x)
+        af.sync()
+    elif "dpnp" == pkg:
+        dpnp.random.seed(1)
+        for i in range(count):
+            arr_list.append(dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE))
+    elif "numpy" == pkg:
+        np.random.seed(1)
+        for i in range(count):
+            arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE))
+
+    return arr_list
+
+
+def gemm_np(A, B, C):
+    return alpha * np.matmul(A, B) + beta * C
+
+
+def gemm_af(A, B, C):
+    x = af.gemm(A, B, alpha=alpha, beta=beta, accum=C)
+    af.eval(x)
+    af.sync()
+    return x
+
+
+def gemm_dpnp(A, B, C):
+    return alpha * dpnp.matmul(A, B) + beta * C
+
+
+def gemm_cupy(A, B, C):
+    m, k = A.shape
+    k, n = B.shape
+
+    # Input matrices need to be in Fortran order.
+    # A = cupy.asfortranarray(A)
+    # B = cupy.asfortranarray(B)
+    # C = cupy.asfortranarray(C)
+
+    grid = (int(math.ceil(m / blk_m)), int(math.ceil(n / blk_n)), 1)
+    block = (dim_x, dim_y, 1)
+    args = (m, n, k, A, B, C)
+    shared_mem = blk_k * (blk_m + 1) * 4 + blk_n * (blk_k + 1) * 4
+    kern(grid, block, args=args, shared_mem=shared_mem)
+    cupy.cuda.runtime.deviceSynchronize()
+    return C
+
+
+FUNCS = {"numpy": gemm_np, "cupy": gemm_cupy, "arrayfire": gemm_af, "dpnp": gemm_dpnp}
diff --git a/benchmarks/src/pytest_benchmark/test_kmeans.py b/benchmarks/src/pytest_benchmark/test_kmeans.py
new file mode 100644
index 0000000..53c5183
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/test_kmeans.py
@@ -0,0 +1,287 @@
+from common import *
+
+ITERATIONS = 10
+TOLERANCE = 1e-4
+NSAMPLES = NSIZE
+NFEATURES = 256
+K = 20
+
+
+@pytest.mark.parametrize("pkgid", IDS, ids=IDS)
+class TestKmeans:
+    def test_kmeans(self, benchmark, pkgid):
+        initialize_package(pkgid)
+        pkg = PKGDICT[pkgid]
+        kmeans_class = {"dpnp": kmeans_dpnp, "numpy": kmeans_numpy, "cupy": kmeans_cupy, "arrayfire": kmeans_af}
+        obj = kmeans_class[pkg.__name__]()
+
+        benchmark.extra_info["description"] = f"{NSAMPLES}x{NFEATURES} over {K} centers"
+        result = benchmark.pedantic(target=obj.kmeans, rounds=ROUNDS, iterations=1)
+
+
+class kmeans_numpy:
+    def __init__(self):
+        self.data = np.random.random((NSAMPLES, NFEATURES))
+        self.centroid_indices = np.random.choice(self.data.shape[0], K, replace=False)
+
+    def initialize_centroids(self):
+        """
+        Randomly initializes k centroids from the data points.
+
+        Args:
+            data (np.ndarray): The input data points (n_samples, n_features).
+            k (int): The number of clusters.
+
+        Returns:
+            np.ndarray: Initial centroids (k, n_features).
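+
+        Note: the centroid indices are drawn once in __init__ (via
+        np.random.choice), so every benchmark round starts from the same
+        initial centroids.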
+ """ + + return self.data[self.centroid_indices, :] + + def assign_to_clusters(self, centroids): + """ + Assigns each data point to the closest centroid. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + centroids (np.ndarray): The current centroids (k, n_features). + + Returns: + np.ndarray: An array of cluster assignments for each data point (n_samples,). + """ + distances = np.sqrt(((self.data[:, np.newaxis, :] - centroids[np.newaxis, :, :]) ** 2).sum(axis=2)) + cluster_assignments = np.argmin(distances, axis=1) + return cluster_assignments + + def update_centroids(self, cluster_assignments): + """ + Recalculates the centroids based on the mean of the assigned data points. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + cluster_assignments (np.ndarray): An array of cluster assignments. + k (int): The number of clusters. + + Returns: + np.ndarray: Updated centroids (k, n_features). + """ + new_centroids = np.zeros((K, self.data.shape[1])) + for i in range(K): + points_in_cluster = self.data[cluster_assignments == i] + if len(points_in_cluster) > 0: + new_centroids[i] = np.mean(points_in_cluster, axis=0) + return new_centroids + + def kmeans(self): + """ + Performs the K-Means clustering algorithm. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + k (int): The number of clusters. + max_iterations (int): Maximum number of iterations to run the algorithm. + tolerance (float): The tolerance for convergence (change in centroids). + + Returns: + tuple: A tuple containing: + - np.ndarray: Final centroids (k, n_features). + - np.ndarray: Final cluster assignments for each data point (n_samples,). + """ + centroids = self.initialize_centroids() + cluster_assignments = None + + for i in range(ITERATIONS): + old_centroids = np.copy(centroids) + + # E-step: Assign points to clusters + cluster_assignments = self.assign_to_clusters(centroids) + + # M-step: Update centroids + centroids = self.update_centroids(cluster_assignments) + + # Check for convergence + if np.linalg.norm(centroids - old_centroids) < TOLERANCE: + break + + return centroids, cluster_assignments + + +class kmeans_dpnp: + def __init__(self): + self.data = dpnp.random.random((NSAMPLES, NFEATURES)) + self.centroid_indices = dpnp.array(np.random.choice(self.data.shape[0], K, replace=False).tolist()) + + def initialize_centroids(self): + return self.data[self.centroid_indices, :] + + def assign_to_clusters(self, centroids): + distances = dpnp.sqrt(((self.data[:, dpnp.newaxis, :] - centroids[dpnp.newaxis, :, :]) ** 2).sum(axis=2)) + cluster_assignments = dpnp.argmin(distances, axis=1) + return cluster_assignments + + def update_centroids(self, cluster_assignments): + new_centroids = dpnp.zeros((K, self.data.shape[1])) + for i in range(K): + points_in_cluster = self.data[cluster_assignments == i] + if len(points_in_cluster) > 0: + new_centroids[i] = dpnp.mean(points_in_cluster, axis=0) + return new_centroids + + def kmeans(self): + centroids = self.initialize_centroids() + cluster_assignments = None + for i in range(ITERATIONS): + old_centroids = dpnp.copy(centroids) + + # E-step: Assign points to clusters + cluster_assignments = self.assign_to_clusters(centroids) + + # M-step: Update centroids + centroids = self.update_centroids(cluster_assignments) + + # Check for convergence + if dpnp.linalg.norm(centroids - old_centroids) < TOLERANCE: + break + + return centroids, cluster_assignments + + +class kmeans_cupy: + def __init__(self): + self.data = 
cupy.random.random((NSAMPLES, NFEATURES)) + self.centroid_indices = cupy.random.choice(self.data.shape[0], K, replace=False) + cupy.cuda.runtime.deviceSynchronize() + + def initialize_centroids(self): + return self.data[self.centroid_indices, :] + + def assign_to_clusters(self, centroids): + distances = cupy.sqrt(((self.data[:, cupy.newaxis, :] - centroids[cupy.newaxis, :, :]) ** 2).sum(axis=2)) + cluster_assignments = cupy.argmin(distances, axis=1) + return cluster_assignments + + def update_centroids(self, cluster_assignments): + new_centroids = cupy.zeros((K, self.data.shape[1])) + for i in range(K): + points_in_cluster = self.data[cluster_assignments == i] + if len(points_in_cluster) > 0: + new_centroids[i] = cupy.mean(points_in_cluster, axis=0) + return new_centroids + + def kmeans(self): + centroids = self.initialize_centroids() + cluster_assignments = None + + for i in range(ITERATIONS): + old_centroids = cupy.copy(centroids) + + # E-step: Assign points to clusters + cluster_assignments = self.assign_to_clusters(centroids) + + # M-step: Update centroids + centroids = self.update_centroids(cluster_assignments) + + # Check for convergence + if cupy.linalg.norm(centroids - old_centroids) < TOLERANCE: + break + + cupy.cuda.runtime.deviceSynchronize() + return centroids, cluster_assignments + + +class kmeans_af: + def __init__(self): + self.data = af.Array(np.random.random((NSAMPLES, NFEATURES)).flatten().tolist(), shape=(NSAMPLES, NFEATURES)) + self.centroid_indices = af.Array(np.random.choice(self.data.shape[0], K, replace=False).tolist()) + + af.eval(self.data) + af.eval(self.centroid_indices) + af.sync() + + def initialize_centroids(self): + """ + Randomly initializes k centroids from the data points. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + k (int): The number of clusters. + + Returns: + np.ndarray: Initial centroids (k, n_features). + """ + + return self.data[self.centroid_indices, :] + # return af.lookup(self.data, self.centroid_indices, axis=0) + + def assign_to_clusters(self, centroids): + """ + Assigns each data point to the closest centroid. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + centroids (np.ndarray): The current centroids (k, n_features). + + Returns: + np.ndarray: An array of cluster assignments for each data point (n_samples,). + """ + dist = (af.moddims(self.data, (NSAMPLES, 1, NFEATURES)) - af.moddims(centroids, (1, K, NFEATURES))) ** 2 + distances = af.sqrt(af.sum(dist, axis=2)) + cluster_assignments = af.where(distances == af.tile(af.min(distances, axis=1), (1, K))) + # cluster_assignments = af.range((NSAMPLES, K))[distances == af.min(distances, axis=1)] + + return cluster_assignments + + def update_centroids(self, cluster_assignments): + """ + Recalculates the centroids based on the mean of the assigned data points. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + cluster_assignments (np.ndarray): An array of cluster assignments. + k (int): The number of clusters. + + Returns: + np.ndarray: Updated centroids (k, n_features). + """ + new_centroids = af.constant(0, (K, self.data.shape[1])) + for i in range(K): + points_in_cluster = self.data[:, af.where(cluster_assignments == i)] + if len(points_in_cluster) > 0: + new_centroids[i] = af.mean(points_in_cluster, axis=0) + return new_centroids + + def kmeans(self): + """ + Performs the K-Means clustering algorithm. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). 
+ k (int): The number of clusters. + max_iterations (int): Maximum number of iterations to run the algorithm. + tolerance (float): The tolerance for convergence (change in centroids). + + Returns: + tuple: A tuple containing: + - np.ndarray: Final centroids (k, n_features). + - np.ndarray: Final cluster assignments for each data point (n_samples,). + """ + centroids = self.initialize_centroids() + cluster_assignments = None + + for i in range(ITERATIONS): + old_centroids = af.copy_array(centroids) + + # E-step: Assign points to clusters + cluster_assignments = self.assign_to_clusters(centroids) + + # M-step: Update centroids + centroids = self.update_centroids(cluster_assignments) + + # Check for convergence + if af.norm(centroids - old_centroids) < TOLERANCE: + break + + af.eval(centroids) + af.eval(cluster_assignments) + af.sync() + return centroids, cluster_assignments diff --git a/benchmarks/src/pytest_benchmark/test_linalg.py b/benchmarks/src/pytest_benchmark/test_linalg.py new file mode 100644 index 0000000..2b75601 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_linalg.py @@ -0,0 +1,264 @@ +# cython: language_level=3 +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2016-2024, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# *****************************************************************************
+
+from common import *
+
+ITERATIONS = 1
+
+# small offset used when constructing positive-definite inputs
+eps = 1e-3
+
+
+def generate_arrays(pkgid, count, posdef=False):
+    arr_list = []
+    pkg = PKGDICT[pkgid]
+    pkg = pkg.__name__
+    if "cupy" == pkg:
+        for i in range(count):
+            x = cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE)
+            if posdef:
+                x = x @ x.T + x.T @ x + eps
+            arr_list.append(x)
+        cupy.cuda.runtime.deviceSynchronize()
+    elif "arrayfire" == pkg:
+        for i in range(count):
+            x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE))
+            if posdef:
+                x = af.matmul(x, x.T) + af.matmul(x.T, x) + eps
+            af.eval(x)
+            arr_list.append(x)
+        af.sync()
+    elif "dpnp" == pkg:
+        for i in range(count):
+            x = dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE)
+            if posdef:
+                x = x @ x.T + x.T @ x + eps
+            arr_list.append(x)
+    elif "numpy" == pkg:
+        for i in range(count):
+            x = np.random.rand(NSIZE, NSIZE).astype(DTYPE)
+            if posdef:
+                x = x @ x.T + x.T @ x + eps
+            arr_list.append(x)
+
+    return arr_list
+
+
+def svd_np(arr):
+    return np.linalg.svd(arr)
+
+
+def svd_dpnp(arr):
+    return dpnp.linalg.svd(arr)
+
+
+def svd_af(arr):
+    x = af.svd(arr)
+    for r in x:
+        af.eval(r)
+    af.sync()
+    return x
+
+
+def svd_cupy(arr):
+    x = cupy.linalg.svd(arr)
+    cupy.cuda.runtime.deviceSynchronize()
+    return x
+
+
+def qr_np(arr):
+    return np.linalg.qr(arr)
+
+
+def qr_dpnp(arr):
+    return dpnp.linalg.qr(arr)
+
+
+def qr_af(arr):
+    x = af.qr(arr)
+    for r in x:
+        af.eval(r)
+    af.sync()
+    return x
+
+
+def qr_cupy(arr):
+    x = cupy.linalg.qr(arr)
+    cupy.cuda.runtime.deviceSynchronize()
+    return x
+
+
+def cholesky_np(arr):
+    return np.linalg.cholesky(arr)
+
+
+def cholesky_dpnp(arr):
+    return dpnp.linalg.cholesky(arr)
+
+
+def cholesky_af(arr):
+    x, info = af.cholesky(arr)
+    af.eval(x)
+    af.sync()
+    return x
+
+
+def cholesky_cupy(arr):
+    x = cupy.linalg.cholesky(arr)
+    cupy.cuda.runtime.deviceSynchronize()
+    return x
+
+
+def inv_np(arr):
+    return np.linalg.inv(arr)
+
+
+def inv_dpnp(arr):
+    return dpnp.linalg.inv(arr)
+
+
+def inv_af(arr):
+    x, info = af.inverse(arr)
+    af.eval(x)
+    af.sync()
+    return x
+
+
+def inv_cupy(arr):
+    x = cupy.linalg.inv(arr)
+    cupy.cuda.runtime.deviceSynchronize()
+    return x
+
+
+def det_np(arr):
+    return np.linalg.det(arr)
+
+
+def det_dpnp(arr):
+    return dpnp.linalg.det(arr)
+
+
+def det_af(arr):
+    x = af.det(arr)
+    af.sync()
+    return x
+
+
+def det_cupy(arr):
+    x = cupy.linalg.det(arr)
+    cupy.cuda.runtime.deviceSynchronize()
+    return x
+
+
+def norm_np(arr):
+    return np.linalg.norm(arr)
+
+
+def norm_dpnp(arr):
+    return dpnp.linalg.norm(arr)
+
+
+def norm_af(arr):
+    x = af.norm(arr)
+    af.sync()
+    return x
+
+
+def norm_cupy(arr):
+    x = cupy.linalg.norm(arr)
+    cupy.cuda.runtime.deviceSynchronize()
+    return x
+
+
+@pytest.mark.parametrize("pkgid", IDS, ids=IDS)
+class TestLinalg:
+    def test_cholesky(self, benchmark, pkgid):
+        initialize_package(pkgid)
+        setup = lambda: (generate_arrays(pkgid, 1, True), {})
+
+        benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix"
+        pkg = PKGDICT[pkgid]
+
+        CHOLESKY_FUNCS = {"numpy": cholesky_np, "cupy": cholesky_cupy, "arrayfire": cholesky_af, "dpnp": cholesky_dpnp}
+        result = benchmark.pedantic(
+            target=CHOLESKY_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS
+        )
+
+    def test_svd(self, benchmark, pkgid):
+        initialize_package(pkgid)
+        setup = lambda: (generate_arrays(pkgid, 1), {})
+
+        benchmark.extra_info["description"] = 
f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + SVD_FUNCS = {"numpy": svd_np, "cupy": svd_cupy, "arrayfire": svd_af, "dpnp": svd_dpnp} + result = benchmark.pedantic(target=SVD_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS) + + def test_qr(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + QR_FUNCS = {"numpy": qr_np, "cupy": qr_cupy, "arrayfire": qr_af, "dpnp": qr_dpnp} + result = benchmark.pedantic(target=QR_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS) + + def test_inv(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + INV_FUNCS = {"numpy": inv_np, "cupy": inv_cupy, "arrayfire": inv_af, "dpnp": inv_dpnp} + result = benchmark.pedantic(target=INV_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS) + + def test_det(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + DET_FUNCS = {"numpy": det_np, "cupy": det_cupy, "arrayfire": det_af, "dpnp": det_dpnp} + result = benchmark.pedantic(target=DET_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS) + + def test_norm(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + NORM_FUNCS = {"numpy": norm_np, "cupy": norm_cupy, "arrayfire": norm_af, "dpnp": norm_dpnp} + result = benchmark.pedantic(target=NORM_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS) diff --git a/benchmarks/src/pytest_benchmark/test_mandelbrot.py b/benchmarks/src/pytest_benchmark/test_mandelbrot.py new file mode 100644 index 0000000..a9cd73b --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_mandelbrot.py @@ -0,0 +1,184 @@ +# SPDX-FileCopyrightText: 2017 Nicolas P. 
Rougier
+# SPDX-FileCopyrightText: 2021 ETH Zurich and the NPBench authors
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+# more information at https://github.com/rougier/numpy-book
+
+from common import *
+
+xmin = -2
+xmax = 2
+ymin = -2
+ymax = 2
+xn = NSIZE
+yn = NSIZE
+itermax = 20
+horizon = 2.0
+
+
+@pytest.mark.parametrize("pkgid", IDS, ids=IDS)
+class TestMandelbrot:
+    def test_mandelbrot(self, benchmark, pkgid):
+        initialize_package(pkgid)
+        pkg = PKGDICT[pkgid]
+
+        benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} grid iterated {itermax}x"
+        result = benchmark.pedantic(target=FUNCS[pkg.__name__], rounds=ROUNDS, iterations=1)
+
+
+def mandelbrot_np():
+    # Adapted from
+    # https://thesamovar.wordpress.com/2009/03/22/fast-fractals-with-python-and-numpy/
+    Xi, Yi = np.mgrid[0:xn, 0:yn]
+    X = np.linspace(xmin, xmax, xn, dtype=np.float64)[Xi]
+    Y = np.linspace(ymin, ymax, yn, dtype=np.float64)[Yi]
+    C = X + Y * 1j
+
+    N_ = np.zeros(C.shape, dtype=np.int64)
+    Z_ = np.zeros(C.shape, dtype=np.complex128)
+    Xi.shape = Yi.shape = C.shape = xn * yn
+
+    Z = np.zeros(C.shape, np.complex128)
+    for i in range(itermax):
+        if not len(Z):
+            break
+
+        # Compute for relevant points only
+        np.multiply(Z, Z, Z)
+        np.add(Z, C, Z)
+
+        # Failed convergence
+        I = abs(Z) > horizon  # noqa: E741 math variable
+        N_[Xi[I], Yi[I]] = i + 1
+        Z_[Xi[I], Yi[I]] = Z[I]
+
+        # Keep going with those who have not diverged yet
+        np.logical_not(I, I)  # np.negative(I, I) not working any longer
+        Z = Z[I]
+        Xi, Yi = Xi[I], Yi[I]
+        C = C[I]
+    return Z_.T, N_.T
+
+
+def mandelbrot_dpnp():
+    # Adapted from
+    # https://thesamovar.wordpress.com/2009/03/22/fast-fractals-with-python-and-numpy/
+    Xi, Yi = dpnp.mgrid[0:xn, 0:yn]
+    X = dpnp.linspace(xmin, xmax, xn, dtype=dpnp.float64)[Xi]
+    Y = dpnp.linspace(ymin, ymax, yn, dtype=dpnp.float64)[Yi]
+    C = X + Y * 1j
+    N_ = dpnp.zeros(C.shape, dtype=dpnp.int64)
+    Z_ = dpnp.zeros(C.shape, dtype=dpnp.complex128)
+    # reshape returns a new array, so rebind the flattened views explicitly
+    Xi = Xi.reshape(xn * yn)
+    Yi = Yi.reshape(xn * yn)
+    C = C.reshape(xn * yn)
+
+    Z = dpnp.zeros(C.shape, dtype=dpnp.complex128)
+    for i in range(itermax):
+        if not len(Z):
+            break
+
+        # Compute for relevant points only
+        dpnp.multiply(Z, Z, Z)
+        dpnp.add(Z, C, Z)
+
+        # Failed convergence
+        I = abs(Z) > horizon  # noqa: E741 math variable
+        N_[Xi[I], Yi[I]] = i + 1
+        Z_[Xi[I], Yi[I]] = Z[I]
+
+        # Keep going with those who have not diverged yet
+        dpnp.logical_not(I, I)  # dpnp.negative(I, I) not working any longer
+        Z = Z[I]
+        Xi, Yi = Xi[I], Yi[I]
+        C = C[I]
+
+    Z_ = Z_.T
+    N_ = N_.T
+
+    return Z_, N_
+
+
+def mandelbrot_cupy():
+    # Adapted from
+    # https://thesamovar.wordpress.com/2009/03/22/fast-fractals-with-python-and-numpy/
+    Xi, Yi = cupy.mgrid[0:xn, 0:yn]
+    X = cupy.linspace(xmin, xmax, xn, dtype=cupy.float64)[Xi]
+    Y = cupy.linspace(ymin, ymax, yn, dtype=cupy.float64)[Yi]
+    C = X + Y * 1j
+    N_ = cupy.zeros(C.shape, dtype=cupy.int64)
+    Z_ = cupy.zeros(C.shape, dtype=cupy.complex128)
+    Xi.shape = Yi.shape = C.shape = xn * yn
+
+    Z = cupy.zeros(C.shape, cupy.complex128)
+    for i in range(itermax):
+        if not len(Z):
+            break
+
+        # Compute for relevant points only
+        cupy.multiply(Z, Z, Z)
+        cupy.add(Z, C, Z)
+
+        # Failed convergence
+        I = abs(Z) > horizon  # noqa: E741 math variable
+        N_[Xi[I], Yi[I]] = i + 1
+        Z_[Xi[I], Yi[I]] = Z[I]
+
+        # Keep going with those who have not diverged yet
+        cupy.logical_not(I, I)  # cupy.negative(I, I) not working any longer
+        Z = Z[I]
+        Xi, Yi = Xi[I], Yi[I]
+        C = C[I]
+
+    Z_ = Z_.T
+    N_ = N_.T
+
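+    # CuPy work is asynchronous; block here so the benchmark measures the
+    # completed device computation rather than just the kernel launches.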
cupy.cuda.runtime.deviceSynchronize() + return Z_, N_ + + +def mandelbrot_af(): + Xi = af.flat(af.range((xn, yn), axis=0, dtype=af.int64)) + Yi = af.flat(af.range((xn, yn), axis=1, dtype=af.int64)) + X = af.iota((xn, 1), tile_shape=(1, yn), dtype=af.float64) * (xmax - xmin) / (xn - 1) + xmin + Y = af.iota((1, yn), tile_shape=(xn, 1), dtype=af.float64) * (ymax - ymin) / (yn - 1) + ymin + + C = af.cplx(X, Y) + N_ = af.constant(0, (xn, yn)) + Z_ = af.constant(0, (xn, yn), dtype=af.complex64) + Z = af.constant(0, (xn, yn), dtype=af.complex64) + for i in range(itermax): + if not len(Z): + break + + # Compute for relevant points only + Z = Z * Z + Z = Z + C + + # Failed convergence + I = af.abs(Z) > horizon # noqa: E741 math variable + + if not af.any_true(I): + break + + N_[Xi[I] * yn + Yi[I]] = i + 1 + Z_[Xi[I] * yn + Yi[I]] = Z[I] + + # Keep going with those who have not diverged yet + I = af.logical_not(I) + Z = Z[I] + Xi = Xi[I] + Yi = Yi[I] + C = C[I] + + Z_ = Z_.T + N_ = N_.T + af.eval(Z_) + af.eval(N_) + af.sync() + return Z_, N_ + + +FUNCS = {"dpnp": mandelbrot_dpnp, "numpy": mandelbrot_np, "cupy": mandelbrot_cupy, "arrayfire": mandelbrot_af} diff --git a/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py b/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py new file mode 100644 index 0000000..752b701 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py @@ -0,0 +1,51 @@ +from common import * + +ITERATIONS = 1 + + +@pytest.mark.parametrize("pkgid", IDS, ids=IDS) +class TestPi: + def test_pi(self, benchmark, pkgid): + initialize_package(pkgid) + pkg = PKGDICT[pkgid] + + benchmark.extra_info["description"] = f"{NNSIZE:.2e} Samples" + result = benchmark.pedantic(target=FUNCS[pkg.__name__], rounds=ROUNDS, iterations=ITERATIONS, args=[NNSIZE]) + + +# Having the function outside is faster than the lambda inside +def in_circle(x, y): + return (x * x + y * y) < 1 + + +def calc_pi_af(samples): + x = af.randu(samples) + y = af.randu(samples) + result = 4 * af.sum(in_circle(x, y)) / samples + + af.sync() + + return result + + +def calc_pi_numpy(samples): + x = np.random.rand(samples).astype(np.float32) + y = np.random.rand(samples).astype(np.float32) + return 4.0 * np.sum(in_circle(x, y)) / samples + + +def calc_pi_cupy(samples): + x = cupy.random.rand(samples, dtype=np.float32) + y = cupy.random.rand(samples, dtype=np.float32) + res = 4.0 * cupy.sum(in_circle(x, y)) / samples + cupy.cuda.runtime.deviceSynchronize() + return res + + +def calc_pi_dpnp(samples): + x = dpnp.random.rand(samples).astype(dpnp.float32) + y = dpnp.random.rand(samples).astype(dpnp.float32) + return 4.0 * dpnp.sum(in_circle(x, y)) / samples + + +FUNCS = {"dpnp": calc_pi_dpnp, "numpy": calc_pi_numpy, "cupy": calc_pi_cupy, "arrayfire": calc_pi_af} diff --git a/benchmarks/src/pytest_benchmark/test_nbody.py b/benchmarks/src/pytest_benchmark/test_nbody.py new file mode 100644 index 0000000..8d35aa9 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_nbody.py @@ -0,0 +1,111 @@ +from common import * + +ITERATIONS = 1 +G = 1 +dt = 1e-3 +softening = 1e-4 +M = 1 + + +@pytest.mark.parametrize("pkgid", IDS, ids=IDS) +class TestNbody: + def test_nbody(self, benchmark, pkgid): + pkg = PKGDICT[pkgid] + setup = lambda: (generate_arrays(pkgid), {}) + + benchmark.extra_info["description"] = f"{NSIZE:.2e} Bodies" + result = benchmark.pedantic(target=nbody, setup=setup, rounds=ROUNDS, iterations=ITERATIONS) + + +def acceleration(pkg, mass, pos): + x = pos[:, 0:1] + y = pos[:, 1:2] + z = pos[:, 2:3] + + # matrix 
+    # matrix that stores all pairwise particle separations: r_j - r_i
+    dx = x.T - x
+    dy = y.T - y
+    dz = z.T - z
+
+    # matrix that stores 1/r^3 for all pairwise particle separations
+    inv_r3 = dx**2 + dy**2 + dz**2 + softening**2
+    inv_r3[inv_r3 > 0] = inv_r3[inv_r3 > 0] ** (-1.5)
+
+    ax = G * (dx * inv_r3) @ mass
+    ay = G * (dy * inv_r3) @ mass
+    az = G * (dz * inv_r3) @ mass
+
+    return pkg.hstack((ax, ay, az))
+
+
+def acceleration_af(mass, pos):
+    x = pos[:, 0:1]
+    y = pos[:, 1:2]
+    z = pos[:, 2:3]
+
+    # matrix that stores all pairwise particle separations: r_j - r_i
+    dx = af.moddims(x, (1, x.shape[0])) - x
+    dy = af.moddims(y, (1, x.shape[0])) - y
+    dz = af.moddims(z, (1, x.shape[0])) - z
+
+    # matrix that stores 1/r^3 for all pairwise particle separations
+    # inv_r3 = dx**2 + dy**2 + dz**2 + softening**2
+    inv_r3 = dx * dx + dy * dy + dz * dz + softening * softening
+    inv_r3 = af.pow(inv_r3, -1.5)
+
+    ax = G * af.matmul((dx * inv_r3), mass)
+    ay = G * af.matmul((dy * inv_r3), mass)
+    az = G * af.matmul((dz * inv_r3), mass)
+
+    return af.join(1, ax, ay, az)
+
+
+def nbody(pkg, mass, pos, vel):
+    # Shift velocities into the center-of-mass frame
+    vel -= pkg.mean(mass * vel, axis=0) / pkg.mean(mass)
+
+    acc = None
+    if pkg.__name__ == "arrayfire":
+        acc = acceleration_af(mass, pos)
+    else:
+        acc = acceleration(pkg, mass, pos)
+
+    # Leapfrog (kick-drift-kick) update; the acceleration is reused across steps
+    for i in range(ITERATIONS):
+        vel += acc * dt / 2.0
+        pos += vel * dt
+        vel += acc * dt / 2.0
+
+    # Reduce to a scalar (kinetic energy) so every backend materializes a result
+    energy = 0.5 * pkg.sum(mass * vel**2)
+
+    return float(energy)
+
+
+def generate_arrays(pkgid):
+    arr_list = []
+    pkg = PKGDICT[pkgid]
+
+    initialize_package(pkgid)
+    pkgname = pkg.__name__
+    count = 2
+    if "cupy" == pkgname:
+        arr_list.append(M * cupy.ones((NSIZE, 1), dtype=DTYPE))
+        for i in range(count):
+            arr_list.append(cupy.random.rand(NSIZE, 3, dtype=DTYPE))
+        cupy.cuda.runtime.deviceSynchronize()
+    elif "arrayfire" == pkgname:
+        af.device_gc()
+        arr_list.append(M * af.constant(1, (NSIZE, 1), dtype=getattr(af, DTYPE)))
+        for i in range(count):
+            arr_list.append(af.randu((NSIZE, 3), dtype=getattr(af, DTYPE)))
+        for arr in arr_list:
+            af.eval(arr)
+        af.sync()
+    elif "dpnp" == pkgname:
+        arr_list.append(M * dpnp.ones((NSIZE, 1), dtype=DTYPE))
+        for i in range(count):
+            arr_list.append(dpnp.random.rand(NSIZE, 3).astype(DTYPE))
+    elif "numpy" == pkgname:
+        arr_list.append(M * np.ones((NSIZE, 1), dtype=DTYPE))
+        for i in range(count):
+            arr_list.append(np.random.rand(NSIZE, 3).astype(DTYPE))
+
+    return (pkg, arr_list[0], arr_list[1], arr_list[2])
diff --git a/benchmarks/src/pytest_benchmark/test_nn.py b/benchmarks/src/pytest_benchmark/test_nn.py
new file mode 100644
index 0000000..7940cb0
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/test_nn.py
@@ -0,0 +1,379 @@
+from common import *
+
+INPUT_SIZE = 28 * 28
+HIDDEN_SIZE = 1000
+OUTPUT_SIZE = 10
+LEARNING_RATE = 0.01
+ITERATIONS = 10
+BATCH_SIZE = 2560
+SAMPLES = 25600
+
+
+@pytest.mark.parametrize("pkgid", IDS, ids=IDS)
+class TestNeuralNetwork:
+    def test_neural_network(self, benchmark, pkgid):
+        initialize_package(pkgid)
+        pkg = PKGDICT[pkgid]
+        nn = {
+            "dpnp": NeuralNetwork_dpnp,
+            "numpy": NeuralNetwork_numpy,
+            "cupy": NeuralNetwork_cupy,
+            "arrayfire": NeuralNetwork_af,
+        }
+
+        obj = nn[pkg.__name__]()
+
+        benchmark.extra_info["description"] = (
+            f"{INPUT_SIZE}x{HIDDEN_SIZE}x{OUTPUT_SIZE} trained with {SAMPLES:.2e} samples"
+        )
+        result = benchmark.pedantic(target=obj.train, rounds=ROUNDS, iterations=1)
+
+
+class NeuralNetwork_numpy:
+    def __init__(self):
+        self.input_size = INPUT_SIZE
+        self.hidden_size = HIDDEN_SIZE
+        self.output_size = OUTPUT_SIZE
+        self.learning_rate = LEARNING_RATE
+
+        # Initialize weights and biases
+        # He initialization (for ReLU) is often a good choice
+        self.W1 = np.random.randn(self.input_size, self.hidden_size) * np.sqrt(2.0 / self.input_size)
+        self.b1 = np.zeros((1, self.hidden_size))
+        self.W2 = np.random.randn(self.hidden_size, self.output_size) * np.sqrt(2.0 / self.hidden_size)
+        self.b2 = np.zeros((1, self.output_size))
+
+        self.X_train = np.random.rand(SAMPLES, INPUT_SIZE)
+        self.y_train = np.zeros((SAMPLES * OUTPUT_SIZE))
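+        # Set one randomly chosen class slot per sample to 1: random one-hot
+        # labels (the benchmark measures training throughput, not accuracy)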
+        self.y_train[
+            np.arange(SAMPLES) * OUTPUT_SIZE + np.floor(np.random.rand(SAMPLES) * OUTPUT_SIZE).astype(int)
+        ] = 1
+        self.y_train = self.y_train.reshape((SAMPLES, OUTPUT_SIZE))
+
+    def relu(self, x):
+        return np.maximum(0, x)
+
+    def relu_derivative(self, x):
+        return (x > 0).astype(float)
+
+    def softmax(self, x):
+        exp_scores = np.exp(x - np.max(x, axis=1, keepdims=True))  # Subtract max for numerical stability
+        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
+
+    def forward(self, X):
+        # Hidden layer
+        self.z1 = np.dot(X, self.W1) + self.b1
+        self.a1 = self.relu(self.z1)
+
+        # Output layer
+        self.z2 = np.dot(self.a1, self.W2) + self.b2
+        self.a2 = self.softmax(self.z2)
+        return self.a2
+
+    def backward(self, X, y, output):
+        # Calculate gradients for the output layer
+        error_output = output - y  # Derivative of cross-entropy loss w.r.t. softmax input
+        dW2 = np.dot(self.a1.T, error_output)
+        db2 = np.sum(error_output, axis=0, keepdims=True)
+
+        # Calculate gradients for the hidden layer
+        error_hidden = np.dot(error_output, self.W2.T) * self.relu_derivative(self.z1)
+        dW1 = np.dot(X.T, error_hidden)
+        db1 = np.sum(error_hidden, axis=0, keepdims=True)
+
+        # Update weights and biases
+        self.W2 -= self.learning_rate * dW2
+        self.b2 -= self.learning_rate * db2
+        self.W1 -= self.learning_rate * dW1
+        self.b1 -= self.learning_rate * db1
+
+    def train(self):
+        X_train = self.X_train
+        y_train = self.y_train
+
+        num_samples = X_train.shape[0]
+
+        for epoch in range(ITERATIONS):
+            # Shuffling is deliberately skipped so every package runs an identical workload
+            X_shuffled = X_train
+            y_shuffled = y_train
+
+            for i in range(0, num_samples, BATCH_SIZE):
+                X_batch = X_shuffled[i : i + BATCH_SIZE, :]
+                y_batch = y_shuffled[i : i + BATCH_SIZE, :]
+
+                # Forward pass
+                output = self.forward(X_batch)
+
+                # Backward pass and update weights
+                self.backward(X_batch, y_batch, output)
+
+    def predict(self, X):
+        return np.argmax(self.forward(X), axis=1)
+
+
+class NeuralNetwork_dpnp:
+    def __init__(self):
+        self.input_size = INPUT_SIZE
+        self.hidden_size = HIDDEN_SIZE
+        self.output_size = OUTPUT_SIZE
+        self.learning_rate = LEARNING_RATE
+
+        # Initialize weights and biases
+        # He initialization (for ReLU) is often a good choice
+        self.W1 = dpnp.random.randn(self.input_size, self.hidden_size) * np.sqrt(2.0 / self.input_size)
+        self.b1 = dpnp.zeros((1, self.hidden_size))
+        self.W2 = dpnp.random.randn(self.hidden_size, self.output_size) * np.sqrt(2.0 / self.hidden_size)
+        self.b2 = dpnp.zeros((1, self.output_size))
+
+        self.X_train = dpnp.random.rand(SAMPLES, INPUT_SIZE)
+        self.y_train = dpnp.zeros((SAMPLES * OUTPUT_SIZE))
+        self.y_train[
+            dpnp.arange(SAMPLES) * OUTPUT_SIZE + dpnp.floor(dpnp.random.rand(SAMPLES) * OUTPUT_SIZE).astype(int)
+        ] = 1
+        self.y_train = self.y_train.reshape((SAMPLES, OUTPUT_SIZE))
+
+    def relu(self, x):
+        return dpnp.maximum(0, x)
+
+    def relu_derivative(self, x):
+        return (x > 0).astype(float)
+
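+    # Every op below stays on dpnp arrays: a stray NumPy call on a dpnp array
+    # can trigger the host NumPy fallback mentioned in the README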
+    def softmax(self, x):
+        exp_scores = dpnp.exp(x - dpnp.max(x, axis=1, keepdims=True))  # Subtract max for numerical stability
+        return exp_scores / dpnp.sum(exp_scores, axis=1, keepdims=True)
+
+    def forward(self, X):
+        # Hidden layer
+        self.z1 = dpnp.dot(X, self.W1) + self.b1
+        self.a1 = self.relu(self.z1)
+
+        # Output layer
+        self.z2 = dpnp.dot(self.a1, self.W2) + self.b2
+        self.a2 = self.softmax(self.z2)
+        return self.a2
+
+    def backward(self, X, y, output):
+        # Calculate gradients for the output layer
+        error_output = output - y  # Derivative of cross-entropy loss w.r.t. softmax input
+        dW2 = dpnp.dot(self.a1.T, error_output)
+        db2 = dpnp.sum(error_output, axis=0, keepdims=True)
+
+        # Calculate gradients for the hidden layer
+        error_hidden = dpnp.dot(error_output, self.W2.T) * self.relu_derivative(self.z1)
+        dW1 = dpnp.dot(X.T, error_hidden)
+        db1 = dpnp.sum(error_hidden, axis=0, keepdims=True)
+
+        # Update weights and biases
+        self.W2 -= self.learning_rate * dW2
+        self.b2 -= self.learning_rate * db2
+        self.W1 -= self.learning_rate * dW1
+        self.b1 -= self.learning_rate * db1
+
+    def train(self):
+        X_train = self.X_train
+        y_train = self.y_train
+
+        num_samples = X_train.shape[0]
+
+        for epoch in range(ITERATIONS):
+            # Shuffling is deliberately skipped so every package runs an identical workload
+            X_shuffled = X_train
+            y_shuffled = y_train
+
+            for i in range(0, num_samples, BATCH_SIZE):
+                X_batch = X_shuffled[i : i + BATCH_SIZE, :]
+                y_batch = y_shuffled[i : i + BATCH_SIZE, :]
+
+                # Forward pass
+                output = self.forward(X_batch)
+
+                # Backward pass and update weights
+                self.backward(X_batch, y_batch, output)
+
+    def predict(self, X):
+        return dpnp.argmax(self.forward(X), axis=1)
+
+
+class NeuralNetwork_cupy:
+    def __init__(self):
+        self.input_size = INPUT_SIZE
+        self.hidden_size = HIDDEN_SIZE
+        self.output_size = OUTPUT_SIZE
+        self.learning_rate = LEARNING_RATE
+
+        # Initialize weights and biases
+        # He initialization (for ReLU) is often a good choice
+        self.W1 = cupy.random.randn(self.input_size, self.hidden_size) * np.sqrt(2.0 / self.input_size)
+        self.b1 = cupy.zeros((1, self.hidden_size))
+        self.W2 = cupy.random.randn(self.hidden_size, self.output_size) * np.sqrt(2.0 / self.hidden_size)
+        self.b2 = cupy.zeros((1, self.output_size))
+
+        self.X_train = cupy.random.rand(SAMPLES, INPUT_SIZE)
+        self.y_train = cupy.zeros((SAMPLES * OUTPUT_SIZE))
+        self.y_train[
+            cupy.arange(SAMPLES) * OUTPUT_SIZE + cupy.floor(cupy.random.rand(SAMPLES) * OUTPUT_SIZE).astype(int)
+        ] = 1
+        self.y_train = self.y_train.reshape((SAMPLES, OUTPUT_SIZE))
+
+        cupy.cuda.runtime.deviceSynchronize()
+
+    def relu(self, x):
+        return cupy.maximum(0, x)
+
+    def relu_derivative(self, x):
+        return (x > 0).astype(float)
+
+    def softmax(self, x):
+        exp_scores = cupy.exp(x - cupy.max(x, axis=1, keepdims=True))  # Subtract max for numerical stability
+        return exp_scores / cupy.sum(exp_scores, axis=1, keepdims=True)
+
+    def forward(self, X):
+        # Hidden layer
+        self.z1 = cupy.dot(X, self.W1) + self.b1
+        self.a1 = self.relu(self.z1)
+
+        # Output layer
+        self.z2 = cupy.dot(self.a1, self.W2) + self.b2
+        self.a2 = self.softmax(self.z2)
+        return self.a2
+
+    def backward(self, X, y, output):
+        # Calculate gradients for the output layer
+        error_output = output - y  # Derivative of cross-entropy loss w.r.t. softmax input
+        dW2 = cupy.dot(self.a1.T, error_output)
+        db2 = cupy.sum(error_output, axis=0, keepdims=True)
+
+        # Calculate gradients for the hidden layer
+        error_hidden = cupy.dot(error_output, self.W2.T) * self.relu_derivative(self.z1)
+        dW1 = cupy.dot(X.T, error_hidden)
+        db1 = cupy.sum(error_hidden, axis=0, keepdims=True)
+
+        # Update weights and biases
+        self.W2 -= self.learning_rate * dW2
+        self.b2 -= self.learning_rate * db2
+        self.W1 -= self.learning_rate * dW1
+        self.b1 -= self.learning_rate * db1
+
+    def train(self):
+        X_train = self.X_train
+        y_train = self.y_train
+
+        num_samples = X_train.shape[0]
+
+        for epoch in range(ITERATIONS):
+            # Shuffling is deliberately skipped so every package runs an identical workload
+            X_shuffled = X_train
+            y_shuffled = y_train
+
+            for i in range(0, num_samples, BATCH_SIZE):
+                X_batch = X_shuffled[i : i + BATCH_SIZE, :]
+                y_batch = y_shuffled[i : i + BATCH_SIZE, :]
+
+                # Forward pass
+                output = self.forward(X_batch)
+
+                # Backward pass and update weights
+                self.backward(X_batch, y_batch, output)
+
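+        # Synchronize once after all epochs so the queued asynchronous GPU work
+        # is included in the measured time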
+        cupy.cuda.runtime.deviceSynchronize()
+
+    def predict(self, X):
+        return cupy.argmax(self.forward(X), axis=1)
+
+
+class NeuralNetwork_af:
+    def __init__(self):
+        self.input_size = INPUT_SIZE
+        self.hidden_size = HIDDEN_SIZE
+        self.output_size = OUTPUT_SIZE
+        self.learning_rate = LEARNING_RATE
+
+        # Initialize weights and biases
+        # He initialization (for ReLU) is often a good choice
+        self.W1 = af.randn((self.input_size, self.hidden_size)) * np.sqrt(2.0 / self.input_size)
+        self.b1 = af.constant(0, (1, self.hidden_size))
+        self.W2 = af.randn((self.hidden_size, self.output_size)) * np.sqrt(2.0 / self.hidden_size)
+        self.b2 = af.constant(0, (1, self.output_size))
+
+        self.X_train = af.randu((SAMPLES, INPUT_SIZE))
+
+        self.y_train = af.constant(0, (SAMPLES, OUTPUT_SIZE))
+        self.y_train[af.iota(SAMPLES), af.floor(af.randu(SAMPLES) * OUTPUT_SIZE)] = 1
+
+        af.eval(self.X_train)
+        af.eval(self.y_train)
+        af.eval(self.W1)
+        af.eval(self.W2)
+        af.eval(self.b1)
+        af.eval(self.b2)
+        af.sync()
+
+    def relu(self, x):
+        selection = x > 0
+        return af.select(x, 0, selection)
+
+    def relu_derivative(self, x):
+        return af.cast(x > 0, getattr(af, DTYPE))
+
+    def softmax(self, x):
+        exp_scores = af.exp(x - af.max(x, axis=1))  # Subtract max for numerical stability
+        return exp_scores / af.sum(exp_scores, axis=1)
+
+    def forward(self, X):
+        # Hidden layer
+        self.z1 = af.matmul(X, self.W1) + self.b1
+        self.a1 = self.relu(self.z1)
+
+        # Output layer
+        self.z2 = af.matmul(self.a1, self.W2) + self.b2
+        self.a2 = self.softmax(self.z2)
+        return self.a2
+
+    def backward(self, X, y, output):
+        # Calculate gradients for the output layer
+        error_output = output - y  # Derivative of cross-entropy loss w.r.t. softmax input
+        dW2 = af.matmul(self.a1.T, error_output)
+        db2 = af.sum(error_output, axis=0)
+
+        # Calculate gradients for the hidden layer
+        error_hidden = af.matmul(error_output, self.W2.T) * self.relu_derivative(self.z1)
+        dW1 = af.matmul(X.T, error_hidden)
+        db1 = af.sum(error_hidden, axis=0)
+
+        # Update weights and biases
+        self.W2 -= self.learning_rate * dW2
+        self.b2 -= self.learning_rate * db2
+        self.W1 -= self.learning_rate * dW1
+        self.b1 -= self.learning_rate * db1
+
+    def train(self):
+        X_train = self.X_train
+        y_train = self.y_train
+
+        num_samples = X_train.shape[0]
+
+        for epoch in range(ITERATIONS):
+            # Shuffling is deliberately skipped so every package runs an identical workload
+            X_shuffled = X_train
+            y_shuffled = y_train
+
+            for i in range(0, num_samples, BATCH_SIZE):
+                X_batch = X_shuffled[i : i + BATCH_SIZE, :]
+                y_batch = y_shuffled[i : i + BATCH_SIZE, :]
+
+                # Forward pass
+                output = self.forward(X_batch)
+
+                # Backward pass and update weights
+                self.backward(X_batch, y_batch, output)
+
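+        # ArrayFire evaluates lazily; force evaluation of the updated weights and
+        # block so the timed region covers the actual training work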
+        af.eval(self.W2)
+        af.eval(self.b2)
+        af.eval(self.W1)
+        af.eval(self.b1)
+        af.sync()
+
+    def predict(self, X):
+        # Emulate argmax: positions where an output equals its row-wise maximum
+        out = self.forward(X)
+        return af.where(out == af.tile(af.max(out, axis=1), (1, out.shape[1])))
diff --git a/benchmarks/src/pytest_benchmark/test_random.py b/benchmarks/src/pytest_benchmark/test_random.py
new file mode 100644
index 0000000..19caebb
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/test_random.py
@@ -0,0 +1,61 @@
+from common import *
+
+ITERATIONS = 20
+
+
+def randn_np():
+    arr = np.random.normal(size=(NNSIZE))
+
+
+def randn_dpnp():
+    arr = dpnp.random.normal(size=(NNSIZE))
+
+
+def randn_cupy():
+    arr = cupy.random.normal(size=(NNSIZE))
+    cupy.cuda.runtime.deviceSynchronize()
+
+
+def randn_af():
+    arr = af.randn((NNSIZE))
+    af.eval(arr)
+    af.sync()
+
+
+def randu_np():
+    arr = np.random.uniform(size=(NNSIZE))
+
+
+def randu_dpnp():
+    arr = dpnp.random.uniform(size=(NNSIZE))
+
+
+def randu_cupy():
+    arr = cupy.random.uniform(size=(NNSIZE))
+    cupy.cuda.runtime.deviceSynchronize()
+
+
+def randu_af():
+    arr = af.randu((NNSIZE))
+    af.eval(arr)
+    af.sync()
+
+
+@pytest.mark.parametrize("pkgid", IDS, ids=IDS)
+class TestRandom:
+    def test_normal(self, benchmark, pkgid):
+        initialize_package(pkgid)
+
+        pkg = PKGDICT[pkgid]
+        FUNCS = {"dpnp": randn_dpnp, "numpy": randn_np, "cupy": randn_cupy, "arrayfire": randn_af}
+
+        benchmark.extra_info["description"] = f"{NNSIZE:.2e} Samples"
+        result = benchmark.pedantic(target=FUNCS[pkg.__name__], rounds=ROUNDS, iterations=ITERATIONS)
+
+    def test_uniform(self, benchmark, pkgid):
+        initialize_package(pkgid)
+
+        pkg = PKGDICT[pkgid]
+        FUNCS = {"dpnp": randu_dpnp, "numpy": randu_np, "cupy": randu_cupy, "arrayfire": randu_af}
+
+        benchmark.extra_info["description"] = f"{NNSIZE:.2e} Samples"
+        result = benchmark.pedantic(target=FUNCS[pkg.__name__], rounds=ROUNDS, iterations=ITERATIONS)
diff --git a/benchmarks/src/requirements.txt b/benchmarks/src/requirements.txt
new file mode 100644
index 0000000..30b908b
--- /dev/null
+++ b/benchmarks/src/requirements.txt
@@ -0,0 +1,10 @@
+--extra-index-url https://software.repos.intel.com/python/pypi
+dpnp
+numpy
+cupy-cuda12x
+pytest
+pytest-benchmark
+matplotlib
+gdown
+pandas
+# arrayfire