diff --git a/.flake8 b/.flake8
index af332dd..0d726f9 100755
--- a/.flake8
+++ b/.flake8
@@ -8,3 +8,4 @@ max-line-length = 119
 exclude =
     .venv/**
     build/**
+    benchmarks/**
\ No newline at end of file
diff --git a/benchmarks/src/README.md b/benchmarks/src/README.md
new file mode 100644
index 0000000..ae84c4e
--- /dev/null
+++ b/benchmarks/src/README.md
@@ -0,0 +1,36 @@
+Benchmarks
+===========
+
+## Setting up the environment
+
+```sh
+    python -m pip install -r requirements.txt
+```
+
+## Benchmark parameters
+
+The benchmark packages, rounds, array sizes, and numeric type may be specified via the constants at the top of [pytest_benchmark/common.py](pytest_benchmark/common.py).
+
+Alternatively, they may be specified individually at the top of each test file.
+
+
+## Running
+
+These are the steps to run the benchmarks and produce the graphs.
+
+Run the benchmarks and store the results in `results.json`:
+```sh
+    pytest ./pytest_benchmark --benchmark-json=results.json
+```
+
+After `results.json` has been created, generate the graphs and store the timing results by running:
+```sh
+    python graphs.py
+```
+
+To modify the tests being shown, modify the `TESTS` list at the top of the `graphs.py` file.
+To modify the labels shown, modify `PKG_LABELS`.
+To modify the hardware string shown in the graph title, modify `HARDWARE`.
+
+## Notes
+When running with `dpnp`, set the environment variable `DPNP_RAISE_EXCEPION_ON_NUMPY_FALLBACK` to `0`.
\ No newline at end of file
diff --git a/benchmarks/src/graphs.py b/benchmarks/src/graphs.py
new file mode 100644
index 0000000..1b2e4e4
--- /dev/null
+++ b/benchmarks/src/graphs.py
@@ -0,0 +1,223 @@
+import json
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+
+BENCHMARKS_JSON = "results.json"
+
+# Hardware details shown in the graph title
+HARDWARE = "AMD Ryzen 9 9900X 12-Core Processor 63032 MB (fp64 fp16)\noneAPI 2025.1.3 Intel(R) OpenCL Graphics: Intel(R) Arc(TM) B580 Graphics, 11873 MB (fp64 fp16)"
+
+# Show speedup numbers on the bars
+SHOW_NUMBERS = True
+
+# Round speedups to this many digits after the decimal point
+ROUND_NUMBERS = 1
+
+# package list in graph order; arrayfire packages are added later
+PKG_NAMES = ["numpy", "dpnp", "cupy"]
+
+# colors used in graphs
+PKG_COLOR = {
+    "numpy": "tab:blue",
+    "cupy": "tab:green",
+    "dpnp": "tab:red",
+    "afcpu": "tab:orange",
+    "afopencl": "tab:orange",
+    "afcuda": "tab:orange",
+    "afoneapi": "tab:orange",
+}
+
+# labels displayed in the graph
+PKG_LABELS = {
+    "numpy": "numpy[cpu]",
+    "dpnp": "dpnp[level_zero:gpu]",
+    "cupy": "cupy",
+    "afcpu": "afcpu",
+    "afcuda": "afcuda",
+    "afopencl": "afopencl[opencl:gpu]",
+    "afoneapi": "afoneapi[opencl:gpu]",
+}
+
+AFBACKENDS = ["afcpu", "afcuda", "afopencl", "afoneapi"]
+
+# Tests to be shown in graphs
+TESTS = [
+    "qr",
+    "neural_network",
+    "gemm",
+    "mandelbrot",
+    "nbody",
+    "pi",
+    "black_scholes",
+    "fft",
+    "normal",
+    "group_elementwise",
+    # Other tests
+    # 'svd',
+    # 'cholesky',
+    # 'det',
+    # 'norm',
+    # 'uniform',
+    # 'inv'
+]
+
+
+def get_benchmark_data():
+    results = {}
+    descriptions = {}
+    with open(BENCHMARKS_JSON) as f:
+        js = json.load(f)
+        for bench in js["benchmarks"]:
+            test_name = bench["name"]
+            test_name = test_name[test_name.find("_") + 1 : test_name.find("[")]
+
+            key = bench["param"]
+            val = bench["stats"]["ops"]
+
+            if len(bench["extra_info"]) != 0 and test_name not in descriptions:
+                descriptions[test_name] = bench["extra_info"]["description"]
+
+            if test_name not in results:
+                results[test_name] = {key: val}
+            else:
+                results[test_name][key] = val
+
+    return results, descriptions
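+
+
+# Illustrative sketch (not part of the original script): each bar in the grouped
+# graph is the speedup of a package over numpy for one test, derived from the
+# "ops" (operations per second) values collected above. A hypothetical helper:
+#
+#   def speedup(results, test, pkg):
+#       return results[test][pkg] / results[test]["numpy"]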
+
+
+def create_graph(test_name, test_results):
+    names = []
+    values = []
+    for name in test_results:
+        names.append(name)
+        values.append(test_results[name])
+
+    plt.bar(names, values)
+    plt.title(test_name)
+
+    plt.savefig("img/" + test_name + ".png")
+    plt.close()
+
+
+def generate_individual_graphs():
+    results, _ = get_benchmark_data()
+
+    for test in results:
+        create_graph(test, results[test])
+
+
+# Store the mean timing results (seconds) in a CSV file
+def store_csv():
+    data_dict = {}
+    data_dict["Test(seconds)"] = []
+    results = {}
+    for pkg in PKG_LABELS.keys():
+        data_dict[pkg] = []
+        results[pkg] = {}
+
+    with open(BENCHMARKS_JSON) as f:
+        js = json.load(f)
+        for bench in js["benchmarks"]:
+            test_name = bench["name"]
+            test_name = test_name[test_name.find("_") + 1 : test_name.find("[")]
+
+            pkg = bench["param"]
+            time = bench["stats"]["mean"]
+
+            if test_name not in data_dict["Test(seconds)"]:
+                data_dict["Test(seconds)"].append(test_name)
+
+            results[pkg][test_name] = time
+
+    for test in data_dict["Test(seconds)"]:
+        for pkg in PKG_LABELS.keys():
+            if test in results[pkg]:
+                data_dict[pkg].append(results[pkg][test])
+            else:
+                data_dict[pkg].append(np.nan)
+
+    df = pd.DataFrame(data_dict)
+    df.to_csv("summary.csv")
+
+
+def generate_group_graph(test_list=None, show_numbers=False, filename="comparison"):
+    results, descriptions = get_benchmark_data()
+
+    width = 1 / (1 + len(PKG_NAMES))
+    multiplier = 0
+
+    tests = test_list if test_list else list(results.keys())
+
+    tests_values = {}
+    x = np.arange(len(tests))
+
+    for name in PKG_NAMES:
+        tests_values[name] = []
+
+    max_val = 1
+    for test in tests:
+        base_value = results[test]["numpy"]
+        for name in PKG_NAMES:
+            if name in results[test]:
+                val = results[test][name] / base_value
+
+                if ROUND_NUMBERS:
+                    val = round(val, ROUND_NUMBERS)
+
+                if max_val < val:
+                    max_val = val
+
+                tests_values[name].append(val)
+            else:
+                tests_values[name].append(np.nan)
+
+    fig, ax = plt.subplots(layout="constrained")
+
+    for name in PKG_NAMES:
+        offset = width * multiplier
+        rects = ax.barh(x + offset, tests_values[name], width, label=PKG_LABELS[name], color=PKG_COLOR[name])
+
+        if show_numbers:
+            ax.bar_label(rects, padding=3, rotation=0)
+        multiplier += 1
+
+    # the bars are horizontal, so the test names label the y axis
+    ylabels = []
+    for test in tests:
+        ylabels.append(test + "\n" + descriptions[test])
+
+    ax.set_xlabel("Speedup")
+    ax.set_xscale("log")
+    ax.set_title(f"Runtime Comparison\n{HARDWARE}")
+    ax.set_yticks(x + width, ylabels, rotation=0)
+    xmin, xmax = ax.get_xlim()
+    ax.set_xlim(xmin, xmax * 2)
+
+    ax.legend(loc="lower right", ncols=len(PKG_NAMES))
+    fig.set_figheight(8)
+    fig.set_figwidth(13)
+    fig.savefig(f"img/{filename}.png")
+    plt.show()
+
+
+def main():
+    store_csv()
+    for backend in AFBACKENDS:
+        try:
+            filename = f"comparison_{backend}"
+            if backend not in PKG_NAMES:
+                PKG_NAMES.insert(1, backend)
+            generate_group_graph(TESTS, SHOW_NUMBERS, filename)
+            PKG_NAMES.remove(backend)
+        except Exception as e:
+            print(e)
+            print("No data for", backend)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/src/pytest_benchmark/common.py b/benchmarks/src/pytest_benchmark/common.py
new file mode 100644
index 0000000..9e82a47
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/common.py
@@ -0,0 +1,105 @@
+# cython: language_level=3
+# -*- coding: utf-8 -*-
+# *****************************************************************************
+# Copyright (c) 2016-2024, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import gc
+import math
+
+import cupy
+import dpctl
+import dpnp
+import numpy as np
+import pytest
+
+import arrayfire as af
+
+# modify parameters for most benchmarks
+ROUNDS = 30
+NSIZE = 2**13
+NNSIZE = NSIZE**2
+DTYPE = "float32"
+
+# Comment out a line to remove that package from testing
+PKGDICT = {
+    "dpnp": dpnp,
+    "numpy": np,
+    "cupy": cupy,
+    # "afcpu": af,
+    "afopencl": af,
+    "afcuda": af,
+    "afoneapi": af,
+}
+
+PKGS = []
+IDS = []
+
+for key, value in PKGDICT.items():
+    IDS.append(key)
+    PKGS.append(value)
+
+
+# Initialize packages and clean up memory before each round
+def initialize_package(PKG_ID):
+    try:
+        af.device_gc()
+        mempool = cupy.get_default_memory_pool()
+        mempool.free_all_blocks()
+    except Exception:
+        # cleanup is best-effort; it may fail when a device or backend is unavailable
+        pass
+
+    if PKG_ID == "afcpu":
+        af.set_backend(af.BackendType.cpu)
+        af.device_gc()
+        af.info()
+    elif PKG_ID == "afopencl":
+        af.set_backend(af.BackendType.opencl)
+        af.device_gc()
+        af.info()
+    elif PKG_ID == "afcuda":
+        af.set_backend(af.BackendType.cuda)
+        af.device_gc()
+        af.info()
+    elif PKG_ID == "afoneapi":
+        af.set_backend(af.BackendType.oneapi)
+        af.device_gc()
+        af.info()
+    elif PKG_ID == "numpy":
+        np.random.seed(0)
+    elif PKG_ID == "dpnp":
+        dpnp.random.seed(0)
+        print(dpctl.get_devices()[0])
+    elif PKG_ID == "cupy":
+        cupy.random.seed(0)
+        print(cupy.cuda.Device())
+        mempool = cupy.get_default_memory_pool()
+        mempool.free_all_blocks()
+    else:
+        raise NotImplementedError()
+
+    # Free all unused memory
+    gc.collect()
diff --git a/benchmarks/src/pytest_benchmark/sgemm.cu b/benchmarks/src/pytest_benchmark/sgemm.cu
new file mode 100644
index 0000000..9e4791f
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/sgemm.cu
@@ -0,0 +1,200 @@
+/*
+Original works by:
+--------------------------------------------------------
+MAGMA
+Copyright (c) 2017 The University of Tennessee. All rights reserved.
+Licensed under modified BSD license +*/ + + +// These parameters will be determined by utils.read_code +//#define DIM_X ${DIM_X} +//#define DIM_Y ${DIM_Y} +//#define BLK_M ${BLK_M} +//#define BLK_N ${BLK_N} +//#define BLK_K ${BLK_K} +//#define DIM_XA ${DIM_XA} +//#define DIM_YA ${DIM_YA} +//#define DIM_XB ${DIM_XB} +//#define DIM_YB ${DIM_YB} +//#define THR_N ${THR_N} +//#define THR_M ${THR_M} + +#define fetch(arr, col, m, n, bound) arr[min(n*col + m, bound)] + + +extern "C" __global__ +void sgemm( + int M, int N, int K, + const float* A, + const float* B, + float * C) +{ + int idx = threadIdx.x; + int idy = threadIdx.y; + + int idt = DIM_X * idy + idx; + + int idxA = idt % DIM_XA; + int idyA = idt / DIM_XA; + + int idxB = idt % DIM_XB; + int idyB = idt / DIM_XB; + + int blx = blockIdx.x; + int bly = blockIdx.y; + + __shared__ float sA[BLK_K][BLK_M + 1]; + __shared__ float sB[BLK_N][BLK_K + 1]; + + // registers for the innermost loop + float rC[THR_N][THR_M]; + float rA[THR_M]; + float rB[THR_N]; + + float ra[BLK_K / DIM_YA][BLK_M / DIM_XA]; + float rb[BLK_N / DIM_YB][BLK_K / DIM_XB]; + + const float* offs_dA = A + blx * BLK_M + idyA * M + idxA; + int boundA = (M * (K - 1) + M) - (blx * BLK_M + idyA * M + idxA) - 1; + const float* offs_dB = B + bly * BLK_N * K + idyB * K + idxB; + int boundB = (K * (N - 1) + K) - (bly * BLK_N * K + idyB * K + idxB) - 1; + + int m, n, k, kk; + + #pragma unroll + for (n = 0; n < THR_N; n++) { + #pragma unroll + for (m = 0 ; m < THR_M; m++) { + rC[n][m] = 0; + } + } + + // blockwise transpose to transpose load + #pragma unroll + for (n = 0; n < BLK_K; n += DIM_YA) { + #pragma unroll + for (m = 0; m < BLK_M; m += DIM_XA) { + sA[n + idyA][m + idxA] = fetch(offs_dA, M, m, n, boundA); + } + } + // blockwise transpose to transpose load + #pragma unroll + for (n = 0; n < BLK_N; n += DIM_YB) { + #pragma unroll + for (m = 0; m < BLK_K; m += DIM_XB) { + sB[n + idyB][m + idxB] = fetch(offs_dB, K, m, n, boundB); + } + } + __syncthreads(); + + for (kk = 0; kk < K - BLK_K; kk += BLK_K) + { + offs_dA += BLK_K * M; + boundA -= BLK_K * M; + offs_dB += BLK_K; + boundB -= BLK_K; + + #pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) { + #pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) { + ra[n][m] = fetch(offs_dA, M, m * DIM_XA, n * DIM_YA, boundA); + } + } + + #pragma unroll + for (n = 0; n < BLK_N / DIM_YB; n++) { + #pragma unroll + for (m = 0; m < BLK_K / DIM_XB; m++) { + rb[n][m] = fetch(offs_dB, K, m * DIM_XB, n * DIM_YB, boundB); + } + } + + // multiply + #pragma unroll + for (k = 0; k < BLK_K; k++) + { + #pragma unroll + for (m = 0; m < THR_M; m++) { + rA[m] = sA[k][m * DIM_X + idx]; + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + rB[n] = sB[n * DIM_Y + idy][k]; + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + #pragma unroll + for (m = 0; m < THR_M; m++) { + rC[n][m] += rA[m] * rB[n]; + } + } + } + __syncthreads(); + + // store A regs->smem + #pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) + { + #pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) + { + sA[n * DIM_YA + idyA][m * DIM_XA + idxA] = ra[n][m]; + } + } + + #pragma unroll + for (n = 0; n < BLK_N / DIM_YB; n++) + { + #pragma unroll + for (m = 0; m < BLK_K / DIM_XB; m++) + { + sB[n * DIM_YB + idyB][m * DIM_XB + idxB] = rb[n][m]; + } + } + __syncthreads(); + } + + // Multiply last full (BLK_K) or partial block of columns of A and + // rows of B. 
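+    // (Note: after the main loop above, kk = K - kk below is the number of
+    // remaining columns to process: K % BLK_K, or BLK_K when BLK_K divides
+    // K evenly.)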
+ // It's okay that m,n exceed matrix bounds as all work is in registers + // or shared memory, and out-of-bounds rC[n][m] will not be saved later. + + kk = K - kk; + #pragma unroll + for (k = 0; k < kk; k++) + { + #pragma unroll + for (m = 0; m < THR_M; m++) { + rA[m] = sA[k][m * DIM_X + idx]; + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + rB[n] = sB[n * DIM_Y + idy][k]; + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + #pragma unroll + for (m = 0; m < THR_M; m++) { + rC[n][m] += rA[m] * rB[n]; + } + } + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + int coord_dCn = bly * BLK_N + n * DIM_Y + idy; + #pragma unroll + for (m = 0; m < THR_M; m++) { + int coord_dCm = blx * BLK_M + m * DIM_X + idx; + if (coord_dCm < M && coord_dCn < N) { + C[coord_dCn * M + coord_dCm] = rC[n][m]; + } + } + } +} \ No newline at end of file diff --git a/benchmarks/src/pytest_benchmark/test_blackscholes.py b/benchmarks/src/pytest_benchmark/test_blackscholes.py new file mode 100644 index 0000000..fae2fc5 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_blackscholes.py @@ -0,0 +1,149 @@ +#!/usr/bin/python + +####################################################### +# Copyright (c) 2015, ArrayFire +# All rights reserved. +# +# This file is distributed under 3-clause BSD license. +# The complete license agreement can be obtained at: +# http://arrayfire.com/licenses/BSD-3-Clause +######################################################## + +from common import * + +ITERATIONS = 1 + +sqrt2 = math.sqrt(2.0) + + +@pytest.mark.parametrize("pkgid", IDS, ids=IDS) +class TestBlackScholes: + def test_black_scholes(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 5), {}) + pkg = PKGDICT[pkgid] + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + result = benchmark.pedantic(target=FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS) + + +def black_scholes_numpy(S, X, R, V, T): + # S = Underlying stock price + # X = Strike Price + # R = Risk free rate of interest + # V = Volatility + # T = Time to maturity + def cnd(x): + temp = x > 0 + erf = lambda arr: np.exp(-arr * arr) + return temp * (0.5 + erf(x / sqrt2) / 2) + (1 - temp) * (0.5 - erf((-x) / sqrt2) / 2) + + d1 = np.log(S / X) + d1 = d1 + (R + (V * V) * 0.5) * T + d1 = d1 / (V * np.sqrt(T)) + + d2 = d1 - (V * np.sqrt(T)) + cnd_d1 = cnd(d1) + cnd_d2 = cnd(d2) + + C = S * cnd_d1 - (X * np.exp((-R) * T) * cnd_d2) + P = X * np.exp((-R) * T) * (1 - cnd_d2) - (S * (1 - cnd_d1)) + + return (C, P) + + +def black_scholes_dpnp(S, X, R, V, T): + def cnd(x): + temp = x > 0 + return temp * (0.5 + dpnp.erf(x / sqrt2) / 2) + (1 - temp) * (0.5 - dpnp.erf((-x) / sqrt2) / 2) + + d1 = dpnp.log(S / X) + d1 = d1 + (R + (V * V) * 0.5) * T + d1 = d1 / (V * dpnp.sqrt(T)) + + d2 = d1 - (V * dpnp.sqrt(T)) + cnd_d1 = cnd(d1) + cnd_d2 = cnd(d2) + + C = S * cnd_d1 - (X * dpnp.exp((-R) * T) * cnd_d2) + P = X * dpnp.exp((-R) * T) * (1 - cnd_d2) - (S * (1 - cnd_d1)) + + return (C, P) + + +def black_scholes_cupy(S, X, R, V, T): + def cnd(x): + temp = x > 0 + erf = lambda arr: cupy.exp(-arr * arr) + return temp * (0.5 + erf(x / sqrt2) / 2) + (1 - temp) * (0.5 - erf((-x) / sqrt2) / 2) + + d1 = cupy.log(S / X) + d1 = d1 + (R + (V * V) * 0.5) * T + d1 = d1 / (V * cupy.sqrt(T)) + + d2 = d1 - (V * cupy.sqrt(T)) + cnd_d1 = cnd(d1) + cnd_d2 = cnd(d2) + + C = S * cnd_d1 - (X * cupy.exp((-R) * T) * cnd_d2) + P = X * cupy.exp((-R) * T) * (1 - cnd_d2) - (S * (1 - cnd_d1)) + + 
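+    # Note: like the NumPy version above, cnd() here uses exp(-x*x) as a cheap
+    # stand-in for erf, presumably so packages without a vectorized erf still
+    # do comparable work. CuPy launches kernels asynchronously, so synchronize
+    # before returning to make sure the timed work has actually finished.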
cupy.cuda.runtime.deviceSynchronize() + + return (C, P) + + +def black_scholes_arrayfire(S, X, R, V, T): + def cnd(x): + temp = x > 0 + return temp * (0.5 + af.erf(x / sqrt2) / 2) + (1 - temp) * (0.5 - af.erf((-x) / sqrt2) / 2) + + d1 = af.log(S / X) + d1 = d1 + (R + (V * V) * 0.5) * T + d1 = d1 / (V * af.sqrt(T)) + + d2 = d1 - (V * af.sqrt(T)) + cnd_d1 = cnd(d1) + cnd_d2 = cnd(d2) + + C = S * cnd_d1 - (X * af.exp((-R) * T) * cnd_d2) + P = X * af.exp((-R) * T) * (1 - cnd_d2) - (S * (1 - cnd_d1)) + + af.eval(C) + af.eval(P) + af.sync() + + return (C, P) + + +def generate_arrays(pkgid, count): + arr_list = [] + pkg = PKGDICT[pkgid] + pkg = pkg.__name__ + if "cupy" == pkg: + for i in range(count): + arr_list.append(cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE)) + cupy.cuda.runtime.deviceSynchronize() + elif "arrayfire" == pkg: + af.device_gc() + for i in range(count): + x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE)) + af.eval(x) + arr_list.append(x) + af.sync() + elif "dpnp" == pkg: + for i in range(count): + arr_list.append(dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE)) + elif "numpy" == pkg: + for i in range(count): + arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE)) + + return arr_list + + +FUNCS = { + "dpnp": black_scholes_dpnp, + "numpy": black_scholes_numpy, + "cupy": black_scholes_cupy, + "arrayfire": black_scholes_arrayfire, +} diff --git a/benchmarks/src/pytest_benchmark/test_elementwise.py b/benchmarks/src/pytest_benchmark/test_elementwise.py new file mode 100644 index 0000000..0933ce6 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_elementwise.py @@ -0,0 +1,316 @@ +# cython: language_level=3 +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2016-2024, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from common import * + +ITERATIONS = 1 + + +@pytest.mark.parametrize("pkgid", IDS, ids=IDS) +class TestElementwise: + def test_group_elementwise(self, benchmark, pkgid): + initialize_package(pkgid) + + setup = lambda: ([generate_arrays(pkgid, 1)[0] / (NSIZE * NSIZE)], {}) + pkg = PKGDICT[pkgid] + + def func(arr): + return pkg.exp(pkg.cos(pkg.sinh(arr))) + pkg.cbrt(pkg.log(arr) * pkg.expm1(-pkg.sqrt(arr))) + + def func_af(arr): + x = af.exp(af.cos(af.sinh(arr))) + af.cbrt(af.log(arr) * af.expm1(-af.sqrt(arr))) + af.eval(x) + af.sync() + return x + + def func_cupy(arr): + x = pkg.exp(pkg.cos(pkg.sinh(arr))) + pkg.cbrt(pkg.log(arr) * pkg.expm1(-pkg.sqrt(arr))) + cupy.cuda.runtime.deviceSynchronize() + return x + + GROUP_FUNCS = {"numpy": func, "cupy": func_cupy, "arrayfire": func_af, "dpnp": func} + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + result = benchmark.pedantic( + target=GROUP_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS + ) + + """ + def test_arccos(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.acos, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arccosh(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.acosh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arcsin(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.asin, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arcsinh(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.asinh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arctan(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.atan, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_arctanh(self, benchmark, pkg): + setup = lambda: ([(generate_arrays(pkg, 1)[0] - 1) / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.atanh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_cbrt(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.cbrt, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_cos(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.cos, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_cosh(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.cosh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_sin(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.sin, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_sinh(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.sinh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_degrees(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = 
benchmark.pedantic( + target=pkg.degrees, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_exp(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.exp, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_exp2(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.exp2, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_expm1(self, benchmark, pkg): + setup = lambda: ([generate_arrays(pkg, 1)[0] / (NSIZE * NSIZE)], {}) + + result = benchmark.pedantic( + target=pkg.expm1, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_log(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.log, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_log10(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.log10, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_log1p(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.log1p, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_sqrt(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.sqrt, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_square(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.square, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_tan(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.tan, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_tanh(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.tanh, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + + def test_reciprocal(self, benchmark, pkg): + setup = lambda: (generate_arrays(pkg, 1), {}) + + result = benchmark.pedantic( + target=pkg.reciprocal, + setup=setup, + rounds=ROUNDS, + iterations=ITERATIONS + ) + """ + + +def generate_arrays(pkgid, count): + arr_list = [] + pkg = PKGDICT[pkgid] + pkg = pkg.__name__ + if "cupy" == pkg: + for i in range(count): + arr_list.append(cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE)) + cupy.cuda.runtime.deviceSynchronize() + elif "arrayfire" == pkg: + for i in range(count): + x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE)) + af.eval(x) + arr_list.append(x) + af.sync() + elif "dpnp" == pkg: + for i in range(count): + arr_list.append(dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE)) + elif "numpy" == pkg: + for i in range(count): + arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE)) + + return arr_list diff --git a/benchmarks/src/pytest_benchmark/test_fft.py b/benchmarks/src/pytest_benchmark/test_fft.py new file mode 100644 index 0000000..096573d --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_fft.py @@ -0,0 +1,90 @@ +# cython: language_level=3 +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2016-2024, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from common import * + +ITERATIONS = 1 + + +def generate_arrays(pkgid, count): + arr_list = [] + pkg = PKGDICT[pkgid] + pkg = pkg.__name__ + if "cupy" == pkg: + for i in range(count): + arr_list.append(cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE)) + cupy.cuda.runtime.deviceSynchronize() + elif "arrayfire" == pkg: + for i in range(count): + x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE)) + af.eval(x) + arr_list.append(x) + af.sync() + elif "dpnp" == pkg: + for i in range(count): + arr_list.append(dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE)) + elif "numpy" == pkg: + for i in range(count): + arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE)) + + return arr_list + + +@pytest.mark.parametrize("pkgid", IDS, ids=IDS) +class TestFFT: + def test_fft(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + pkg = PKGDICT[pkgid] + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + result = benchmark.pedantic(target=FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS) + + +def fft_af(arr): + res = af.fft(arr) + af.eval(res) + af.sync() + + return res + + +def fft_np(arr): + return np.fft.fft(arr) + + +def fft_dpnp(arr): + return dpnp.fft.fft(arr) + + +def fft_cupy(arr): + res = cupy.fft.fft(arr) + cupy.cuda.runtime.deviceSynchronize() + return res + + +FUNCS = {"dpnp": fft_dpnp, "numpy": fft_np, "cupy": fft_cupy, "arrayfire": fft_af} diff --git a/benchmarks/src/pytest_benchmark/test_gemm.py b/benchmarks/src/pytest_benchmark/test_gemm.py new file mode 100644 index 0000000..e62d5a1 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_gemm.py @@ -0,0 +1,121 @@ +import os + +from common import * + +ITERATIONS = 1 + +alpha = 0.4 +beta = 0.6 + +dim_x = 16 +dim_y = 16 +blk_m = 64 +blk_n = 64 +blk_k = 4 +dim_xa = 64 +dim_ya = 4 +dim_xb = 4 +dim_yb = 64 +assert dim_x * dim_y == dim_xa * dim_ya == dim_xb * dim_yb +config = { + "DIM_X": dim_x, + "DIM_Y": dim_y, + "BLK_M": blk_m, + "BLK_N": blk_n, + "BLK_K": blk_k, + "DIM_XA": dim_xa, + "DIM_YA": dim_ya, + "DIM_XB": dim_xb, + "DIM_YB": dim_yb, + "THR_M": blk_m // dim_x, + "THR_N": blk_n // dim_y, +} + + +def 
create_cupy_kernel(params):
+    sgemm_file = os.path.join(os.path.dirname(__file__), "sgemm.cu")
+    with open(sgemm_file, "r") as f:
+        code = f.read()
+    # prepend the tuning parameters as #defines expected by the .cu source
+    for k, v in params.items():
+        code = "#define " + k + " " + str(v) + "\n" + code
+
+    return cupy.RawKernel(code, "sgemm")
+
+
+kern = create_cupy_kernel(config)
+
+
+@pytest.mark.parametrize("pkgid", IDS, ids=IDS)
+class TestGemm:
+    def test_gemm(self, benchmark, pkgid):
+        pkg = PKGDICT[pkgid]
+        initialize_package(pkgid)
+
+        setup = lambda: (generate_arrays(pkgid, 3), {})
+
+        benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix"
+        result = benchmark.pedantic(target=FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS)
+
+
+def generate_arrays(pkgid, count):
+    arr_list = []
+    pkg = PKGDICT[pkgid]
+    pkg = pkg.__name__
+    if "cupy" == pkg:
+        cupy.random.seed(1)
+        for i in range(count):
+            arr_list.append(cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE))
+        cupy.cuda.runtime.deviceSynchronize()
+    elif "arrayfire" == pkg:
+        for i in range(count):
+            x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE))
+            af.eval(x)
+            arr_list.append(x)
+        af.sync()
+    elif "dpnp" == pkg:
+        dpnp.random.seed(1)
+        for i in range(count):
+            arr_list.append(dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE))
+    elif "numpy" == pkg:
+        np.random.seed(1)
+        for i in range(count):
+            arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE))
+
+    return arr_list
+
+
+def gemm_np(A, B, C):
+    return alpha * np.matmul(A, B) + beta * C
+
+
+def gemm_af(A, B, C):
+    x = af.gemm(A, B, alpha=alpha, beta=beta, accum=C)
+    af.eval(x)
+    af.sync()
+    return x
+
+
+def gemm_dpnp(A, B, C):
+    return alpha * dpnp.matmul(A, B) + beta * C
+
+
+def gemm_cupy(A, B, C):
+    m, k = A.shape
+    k, n = B.shape
+
+    # Input matrices need to be in Fortran order.
+    # A = cupy.asfortranarray(A)
+    # B = cupy.asfortranarray(B)
+    # C = cupy.asfortranarray(C)
+
+    grid = (int(math.ceil(m / blk_m)), int(math.ceil(n / blk_n)), 1)
+    block = (dim_x, dim_y, 1)
+    args = (m, n, k, A, B, C)
+    shared_mem = blk_k * (blk_m + 1) * 4 + blk_n * (blk_k + 1) * 4
+    kern(grid, block, args=args, shared_mem=shared_mem)
+    cupy.cuda.runtime.deviceSynchronize()
+    return C
+
+
+FUNCS = {"numpy": gemm_np, "cupy": gemm_cupy, "arrayfire": gemm_af, "dpnp": gemm_dpnp}
diff --git a/benchmarks/src/pytest_benchmark/test_kmeans.py b/benchmarks/src/pytest_benchmark/test_kmeans.py
new file mode 100644
index 0000000..53c5183
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/test_kmeans.py
@@ -0,0 +1,287 @@
+from common import *
+
+ITERATIONS = 10
+TOLERANCE = 1e-4
+NSAMPLES = NSIZE
+NFEATURES = 256
+K = 20
+
+
+@pytest.mark.parametrize("pkgid", IDS, ids=IDS)
+class TestKmeans:
+    def test_kmeans(self, benchmark, pkgid):
+        initialize_package(pkgid)
+        pkg = PKGDICT[pkgid]
+        kmeans_class = {"dpnp": kmeans_dpnp, "numpy": kmeans_numpy, "cupy": kmeans_cupy, "arrayfire": kmeans_af}
+        obj = kmeans_class[pkg.__name__]()
+
+        benchmark.extra_info["description"] = f"{NSAMPLES}x{NFEATURES} over {K} centers"
+        result = benchmark.pedantic(target=obj.kmeans, rounds=ROUNDS, iterations=1)
+
+
+class kmeans_numpy:
+    def __init__(self):
+        self.data = np.random.random((NSAMPLES, NFEATURES))
+        self.centroid_indices = np.random.choice(self.data.shape[0], K, replace=False)
+
+    def initialize_centroids(self):
+        """
+        Randomly initializes k centroids from the data points.
+
+        Args:
+            data (np.ndarray): The input data points (n_samples, n_features).
+            k (int): The number of clusters.
+
+        Returns:
+            np.ndarray: Initial centroids (k, n_features).
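+
+        Note: the centroid indices are drawn once in __init__ (via
+        np.random.choice), so every benchmark round starts from the same
+        initial centroids.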
+ """ + + return self.data[self.centroid_indices, :] + + def assign_to_clusters(self, centroids): + """ + Assigns each data point to the closest centroid. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + centroids (np.ndarray): The current centroids (k, n_features). + + Returns: + np.ndarray: An array of cluster assignments for each data point (n_samples,). + """ + distances = np.sqrt(((self.data[:, np.newaxis, :] - centroids[np.newaxis, :, :]) ** 2).sum(axis=2)) + cluster_assignments = np.argmin(distances, axis=1) + return cluster_assignments + + def update_centroids(self, cluster_assignments): + """ + Recalculates the centroids based on the mean of the assigned data points. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + cluster_assignments (np.ndarray): An array of cluster assignments. + k (int): The number of clusters. + + Returns: + np.ndarray: Updated centroids (k, n_features). + """ + new_centroids = np.zeros((K, self.data.shape[1])) + for i in range(K): + points_in_cluster = self.data[cluster_assignments == i] + if len(points_in_cluster) > 0: + new_centroids[i] = np.mean(points_in_cluster, axis=0) + return new_centroids + + def kmeans(self): + """ + Performs the K-Means clustering algorithm. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + k (int): The number of clusters. + max_iterations (int): Maximum number of iterations to run the algorithm. + tolerance (float): The tolerance for convergence (change in centroids). + + Returns: + tuple: A tuple containing: + - np.ndarray: Final centroids (k, n_features). + - np.ndarray: Final cluster assignments for each data point (n_samples,). + """ + centroids = self.initialize_centroids() + cluster_assignments = None + + for i in range(ITERATIONS): + old_centroids = np.copy(centroids) + + # E-step: Assign points to clusters + cluster_assignments = self.assign_to_clusters(centroids) + + # M-step: Update centroids + centroids = self.update_centroids(cluster_assignments) + + # Check for convergence + if np.linalg.norm(centroids - old_centroids) < TOLERANCE: + break + + return centroids, cluster_assignments + + +class kmeans_dpnp: + def __init__(self): + self.data = dpnp.random.random((NSAMPLES, NFEATURES)) + self.centroid_indices = dpnp.array(np.random.choice(self.data.shape[0], K, replace=False).tolist()) + + def initialize_centroids(self): + return self.data[self.centroid_indices, :] + + def assign_to_clusters(self, centroids): + distances = dpnp.sqrt(((self.data[:, dpnp.newaxis, :] - centroids[dpnp.newaxis, :, :]) ** 2).sum(axis=2)) + cluster_assignments = dpnp.argmin(distances, axis=1) + return cluster_assignments + + def update_centroids(self, cluster_assignments): + new_centroids = dpnp.zeros((K, self.data.shape[1])) + for i in range(K): + points_in_cluster = self.data[cluster_assignments == i] + if len(points_in_cluster) > 0: + new_centroids[i] = dpnp.mean(points_in_cluster, axis=0) + return new_centroids + + def kmeans(self): + centroids = self.initialize_centroids() + cluster_assignments = None + for i in range(ITERATIONS): + old_centroids = dpnp.copy(centroids) + + # E-step: Assign points to clusters + cluster_assignments = self.assign_to_clusters(centroids) + + # M-step: Update centroids + centroids = self.update_centroids(cluster_assignments) + + # Check for convergence + if dpnp.linalg.norm(centroids - old_centroids) < TOLERANCE: + break + + return centroids, cluster_assignments + + +class kmeans_cupy: + def __init__(self): + self.data = 
cupy.random.random((NSAMPLES, NFEATURES)) + self.centroid_indices = cupy.random.choice(self.data.shape[0], K, replace=False) + cupy.cuda.runtime.deviceSynchronize() + + def initialize_centroids(self): + return self.data[self.centroid_indices, :] + + def assign_to_clusters(self, centroids): + distances = cupy.sqrt(((self.data[:, cupy.newaxis, :] - centroids[cupy.newaxis, :, :]) ** 2).sum(axis=2)) + cluster_assignments = cupy.argmin(distances, axis=1) + return cluster_assignments + + def update_centroids(self, cluster_assignments): + new_centroids = cupy.zeros((K, self.data.shape[1])) + for i in range(K): + points_in_cluster = self.data[cluster_assignments == i] + if len(points_in_cluster) > 0: + new_centroids[i] = cupy.mean(points_in_cluster, axis=0) + return new_centroids + + def kmeans(self): + centroids = self.initialize_centroids() + cluster_assignments = None + + for i in range(ITERATIONS): + old_centroids = cupy.copy(centroids) + + # E-step: Assign points to clusters + cluster_assignments = self.assign_to_clusters(centroids) + + # M-step: Update centroids + centroids = self.update_centroids(cluster_assignments) + + # Check for convergence + if cupy.linalg.norm(centroids - old_centroids) < TOLERANCE: + break + + cupy.cuda.runtime.deviceSynchronize() + return centroids, cluster_assignments + + +class kmeans_af: + def __init__(self): + self.data = af.Array(np.random.random((NSAMPLES, NFEATURES)).flatten().tolist(), shape=(NSAMPLES, NFEATURES)) + self.centroid_indices = af.Array(np.random.choice(self.data.shape[0], K, replace=False).tolist()) + + af.eval(self.data) + af.eval(self.centroid_indices) + af.sync() + + def initialize_centroids(self): + """ + Randomly initializes k centroids from the data points. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + k (int): The number of clusters. + + Returns: + np.ndarray: Initial centroids (k, n_features). + """ + + return self.data[self.centroid_indices, :] + # return af.lookup(self.data, self.centroid_indices, axis=0) + + def assign_to_clusters(self, centroids): + """ + Assigns each data point to the closest centroid. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + centroids (np.ndarray): The current centroids (k, n_features). + + Returns: + np.ndarray: An array of cluster assignments for each data point (n_samples,). + """ + dist = (af.moddims(self.data, (NSAMPLES, 1, NFEATURES)) - af.moddims(centroids, (1, K, NFEATURES))) ** 2 + distances = af.sqrt(af.sum(dist, axis=2)) + cluster_assignments = af.where(distances == af.tile(af.min(distances, axis=1), (1, K))) + # cluster_assignments = af.range((NSAMPLES, K))[distances == af.min(distances, axis=1)] + + return cluster_assignments + + def update_centroids(self, cluster_assignments): + """ + Recalculates the centroids based on the mean of the assigned data points. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + cluster_assignments (np.ndarray): An array of cluster assignments. + k (int): The number of clusters. + + Returns: + np.ndarray: Updated centroids (k, n_features). + """ + new_centroids = af.constant(0, (K, self.data.shape[1])) + for i in range(K): + points_in_cluster = self.data[:, af.where(cluster_assignments == i)] + if len(points_in_cluster) > 0: + new_centroids[i] = af.mean(points_in_cluster, axis=0) + return new_centroids + + def kmeans(self): + """ + Performs the K-Means clustering algorithm. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). 
+ k (int): The number of clusters. + max_iterations (int): Maximum number of iterations to run the algorithm. + tolerance (float): The tolerance for convergence (change in centroids). + + Returns: + tuple: A tuple containing: + - np.ndarray: Final centroids (k, n_features). + - np.ndarray: Final cluster assignments for each data point (n_samples,). + """ + centroids = self.initialize_centroids() + cluster_assignments = None + + for i in range(ITERATIONS): + old_centroids = af.copy_array(centroids) + + # E-step: Assign points to clusters + cluster_assignments = self.assign_to_clusters(centroids) + + # M-step: Update centroids + centroids = self.update_centroids(cluster_assignments) + + # Check for convergence + if af.norm(centroids - old_centroids) < TOLERANCE: + break + + af.eval(centroids) + af.eval(cluster_assignments) + af.sync() + return centroids, cluster_assignments diff --git a/benchmarks/src/pytest_benchmark/test_linalg.py b/benchmarks/src/pytest_benchmark/test_linalg.py new file mode 100644 index 0000000..2b75601 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_linalg.py @@ -0,0 +1,264 @@ +# cython: language_level=3 +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2016-2024, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# *****************************************************************************
+
+from common import *
+
+ITERATIONS = 1
+
+# small offset used when constructing positive-definite inputs
+eps = 1e-3
+
+
+def generate_arrays(pkgid, count, posdef=False):
+    arr_list = []
+    pkg = PKGDICT[pkgid]
+    pkg = pkg.__name__
+    if "cupy" == pkg:
+        for i in range(count):
+            x = cupy.random.rand(NSIZE, NSIZE, dtype=DTYPE)
+            if posdef:
+                x = x @ x.T + x.T @ x + eps
+            arr_list.append(x)
+        cupy.cuda.runtime.deviceSynchronize()
+    elif "arrayfire" == pkg:
+        for i in range(count):
+            x = af.randu((NSIZE, NSIZE), dtype=getattr(af, DTYPE))
+            if posdef:
+                x = af.matmul(x, x.T) + af.matmul(x.T, x) + eps
+            af.eval(x)
+            arr_list.append(x)
+        af.sync()
+    elif "dpnp" == pkg:
+        for i in range(count):
+            x = dpnp.random.rand(NSIZE, NSIZE).astype(DTYPE)
+            if posdef:
+                x = x @ x.T + x.T @ x + eps
+            arr_list.append(x)
+    elif "numpy" == pkg:
+        for i in range(count):
+            x = np.random.rand(NSIZE, NSIZE).astype(DTYPE)
+            if posdef:
+                x = x @ x.T + x.T @ x + eps
+            arr_list.append(x)
+
+    return arr_list
+
+
+def svd_np(arr):
+    return np.linalg.svd(arr)
+
+
+def svd_dpnp(arr):
+    return dpnp.linalg.svd(arr)
+
+
+def svd_af(arr):
+    x = af.svd(arr)
+    for r in x:
+        af.eval(r)
+    af.sync()
+    return x
+
+
+def svd_cupy(arr):
+    x = cupy.linalg.svd(arr)
+    cupy.cuda.runtime.deviceSynchronize()
+    return x
+
+
+def qr_np(arr):
+    return np.linalg.qr(arr)
+
+
+def qr_dpnp(arr):
+    return dpnp.linalg.qr(arr)
+
+
+def qr_af(arr):
+    x = af.qr(arr)
+    for r in x:
+        af.eval(r)
+    af.sync()
+    return x
+
+
+def qr_cupy(arr):
+    x = cupy.linalg.qr(arr)
+    cupy.cuda.runtime.deviceSynchronize()
+    return x
+
+
+def cholesky_np(arr):
+    return np.linalg.cholesky(arr)
+
+
+def cholesky_dpnp(arr):
+    return dpnp.linalg.cholesky(arr)
+
+
+def cholesky_af(arr):
+    x, info = af.cholesky(arr)
+    af.eval(x)
+    af.sync()
+    return x
+
+
+def cholesky_cupy(arr):
+    x = cupy.linalg.cholesky(arr)
+    cupy.cuda.runtime.deviceSynchronize()
+    return x
+
+
+def inv_np(arr):
+    return np.linalg.inv(arr)
+
+
+def inv_dpnp(arr):
+    return dpnp.linalg.inv(arr)
+
+
+def inv_af(arr):
+    x, info = af.inverse(arr)
+    af.eval(x)
+    af.sync()
+    return x
+
+
+def inv_cupy(arr):
+    x = cupy.linalg.inv(arr)
+    cupy.cuda.runtime.deviceSynchronize()
+    return x
+
+
+def det_np(arr):
+    return np.linalg.det(arr)
+
+
+def det_dpnp(arr):
+    return dpnp.linalg.det(arr)
+
+
+def det_af(arr):
+    x = af.det(arr)
+    af.sync()
+    return x
+
+
+def det_cupy(arr):
+    x = cupy.linalg.det(arr)
+    cupy.cuda.runtime.deviceSynchronize()
+    return x
+
+
+def norm_np(arr):
+    return np.linalg.norm(arr)
+
+
+def norm_dpnp(arr):
+    return dpnp.linalg.norm(arr)
+
+
+def norm_af(arr):
+    x = af.norm(arr)
+    af.sync()
+    return x
+
+
+def norm_cupy(arr):
+    x = cupy.linalg.norm(arr)
+    cupy.cuda.runtime.deviceSynchronize()
+    return x
+
+
+@pytest.mark.parametrize("pkgid", IDS, ids=IDS)
+class TestLinalg:
+    def test_cholesky(self, benchmark, pkgid):
+        initialize_package(pkgid)
+        setup = lambda: (generate_arrays(pkgid, 1, True), {})
+
+        benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix"
+        pkg = PKGDICT[pkgid]
+
+        CHOLESKY_FUNCS = {"numpy": cholesky_np, "cupy": cholesky_cupy, "arrayfire": cholesky_af, "dpnp": cholesky_dpnp}
+        result = benchmark.pedantic(
+            target=CHOLESKY_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS
+        )
+
+    def test_svd(self, benchmark, pkgid):
+        initialize_package(pkgid)
+        setup = lambda: (generate_arrays(pkgid, 1), {})
+
+        benchmark.extra_info["description"] = 
f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + SVD_FUNCS = {"numpy": svd_np, "cupy": svd_cupy, "arrayfire": svd_af, "dpnp": svd_dpnp} + result = benchmark.pedantic(target=SVD_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS) + + def test_qr(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + QR_FUNCS = {"numpy": qr_np, "cupy": qr_cupy, "arrayfire": qr_af, "dpnp": qr_dpnp} + result = benchmark.pedantic(target=QR_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS) + + def test_inv(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + INV_FUNCS = {"numpy": inv_np, "cupy": inv_cupy, "arrayfire": inv_af, "dpnp": inv_dpnp} + result = benchmark.pedantic(target=INV_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS) + + def test_det(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + DET_FUNCS = {"numpy": det_np, "cupy": det_cupy, "arrayfire": det_af, "dpnp": det_dpnp} + result = benchmark.pedantic(target=DET_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS) + + def test_norm(self, benchmark, pkgid): + initialize_package(pkgid) + setup = lambda: (generate_arrays(pkgid, 1), {}) + + benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" + pkg = PKGDICT[pkgid] + + NORM_FUNCS = {"numpy": norm_np, "cupy": norm_cupy, "arrayfire": norm_af, "dpnp": norm_dpnp} + result = benchmark.pedantic(target=NORM_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS) diff --git a/benchmarks/src/pytest_benchmark/test_mandelbrot.py b/benchmarks/src/pytest_benchmark/test_mandelbrot.py new file mode 100644 index 0000000..a9cd73b --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_mandelbrot.py @@ -0,0 +1,184 @@ +# SPDX-FileCopyrightText: 2017 Nicolas P. 
Rougier
+# SPDX-FileCopyrightText: 2021 ETH Zurich and the NPBench authors
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+# more information at https://github.com/rougier/numpy-book
+
+from common import *
+
+xmin = -2
+xmax = 2
+ymin = -2
+ymax = 2
+xn = NSIZE
+yn = NSIZE
+itermax = 20
+horizon = 2.0
+
+
+@pytest.mark.parametrize("pkgid", IDS, ids=IDS)
+class TestMandelbrot:
+    def test_mandelbrot(self, benchmark, pkgid):
+        initialize_package(pkgid)
+        pkg = PKGDICT[pkgid]
+
+        benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} grid iterated {itermax}x"
+        result = benchmark.pedantic(target=FUNCS[pkg.__name__], rounds=ROUNDS, iterations=1)
+
+
+def mandelbrot_np():
+    # Adapted from
+    # https://thesamovar.wordpress.com/2009/03/22/fast-fractals-with-python-and-numpy/
+    Xi, Yi = np.mgrid[0:xn, 0:yn]
+    X = np.linspace(xmin, xmax, xn, dtype=np.float64)[Xi]
+    Y = np.linspace(ymin, ymax, yn, dtype=np.float64)[Yi]
+    C = X + Y * 1j
+
+    N_ = np.zeros(C.shape, dtype=np.int64)
+    Z_ = np.zeros(C.shape, dtype=np.complex128)
+    Xi.shape = Yi.shape = C.shape = xn * yn
+
+    Z = np.zeros(C.shape, np.complex128)
+    for i in range(itermax):
+        if not len(Z):
+            break
+
+        # Compute for relevant points only
+        np.multiply(Z, Z, Z)
+        np.add(Z, C, Z)
+
+        # Failed convergence
+        I = abs(Z) > horizon  # noqa: E741 math variable
+        N_[Xi[I], Yi[I]] = i + 1
+        Z_[Xi[I], Yi[I]] = Z[I]
+
+        # Keep going with those who have not diverged yet
+        np.logical_not(I, I)  # np.negative(I, I) not working any longer
+        Z = Z[I]
+        Xi, Yi = Xi[I], Yi[I]
+        C = C[I]
+    return Z_.T, N_.T
+
+
+def mandelbrot_dpnp():
+    # Adapted from
+    # https://thesamovar.wordpress.com/2009/03/22/fast-fractals-with-python-and-numpy/
+    Xi, Yi = dpnp.mgrid[0:xn, 0:yn]
+    X = dpnp.linspace(xmin, xmax, xn, dtype=dpnp.float64)[Xi]
+    Y = dpnp.linspace(ymin, ymax, yn, dtype=dpnp.float64)[Yi]
+    C = X + Y * 1j
+    N_ = dpnp.zeros(C.shape, dtype=dpnp.int64)
+    Z_ = dpnp.zeros(C.shape, dtype=dpnp.complex128)
+    # reshape returns a new array, so rebind the flattened views explicitly
+    Xi = Xi.reshape(xn * yn)
+    Yi = Yi.reshape(xn * yn)
+    C = C.reshape(xn * yn)
+
+    Z = dpnp.zeros(C.shape, dtype=dpnp.complex128)
+    for i in range(itermax):
+        if not len(Z):
+            break
+
+        # Compute for relevant points only
+        dpnp.multiply(Z, Z, Z)
+        dpnp.add(Z, C, Z)
+
+        # Failed convergence
+        I = abs(Z) > horizon  # noqa: E741 math variable
+        N_[Xi[I], Yi[I]] = i + 1
+        Z_[Xi[I], Yi[I]] = Z[I]
+
+        # Keep going with those who have not diverged yet
+        dpnp.logical_not(I, I)  # dpnp.negative(I, I) not working any longer
+        Z = Z[I]
+        Xi, Yi = Xi[I], Yi[I]
+        C = C[I]
+
+    Z_ = Z_.T
+    N_ = N_.T
+
+    return Z_, N_
+
+
+def mandelbrot_cupy():
+    # Adapted from
+    # https://thesamovar.wordpress.com/2009/03/22/fast-fractals-with-python-and-numpy/
+    Xi, Yi = cupy.mgrid[0:xn, 0:yn]
+    X = cupy.linspace(xmin, xmax, xn, dtype=cupy.float64)[Xi]
+    Y = cupy.linspace(ymin, ymax, yn, dtype=cupy.float64)[Yi]
+    C = X + Y * 1j
+    N_ = cupy.zeros(C.shape, dtype=cupy.int64)
+    Z_ = cupy.zeros(C.shape, dtype=cupy.complex128)
+    Xi.shape = Yi.shape = C.shape = xn * yn
+
+    Z = cupy.zeros(C.shape, cupy.complex128)
+    for i in range(itermax):
+        if not len(Z):
+            break
+
+        # Compute for relevant points only
+        cupy.multiply(Z, Z, Z)
+        cupy.add(Z, C, Z)
+
+        # Failed convergence
+        I = abs(Z) > horizon  # noqa: E741 math variable
+        N_[Xi[I], Yi[I]] = i + 1
+        Z_[Xi[I], Yi[I]] = Z[I]
+
+        # Keep going with those who have not diverged yet
+        cupy.logical_not(I, I)  # cupy.negative(I, I) not working any longer
+        Z = Z[I]
+        Xi, Yi = Xi[I], Yi[I]
+        C = C[I]
+
+    Z_ = Z_.T
+    N_ = N_.T
+
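+    # CuPy work is asynchronous; block here so the benchmark measures the
+    # completed device computation rather than just the kernel launches.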
cupy.cuda.runtime.deviceSynchronize() + return Z_, N_ + + +def mandelbrot_af(): + Xi = af.flat(af.range((xn, yn), axis=0, dtype=af.int64)) + Yi = af.flat(af.range((xn, yn), axis=1, dtype=af.int64)) + X = af.iota((xn, 1), tile_shape=(1, yn), dtype=af.float64) * (xmax - xmin) / (xn - 1) + xmin + Y = af.iota((1, yn), tile_shape=(xn, 1), dtype=af.float64) * (ymax - ymin) / (yn - 1) + ymin + + C = af.cplx(X, Y) + N_ = af.constant(0, (xn, yn)) + Z_ = af.constant(0, (xn, yn), dtype=af.complex64) + Z = af.constant(0, (xn, yn), dtype=af.complex64) + for i in range(itermax): + if not len(Z): + break + + # Compute for relevant points only + Z = Z * Z + Z = Z + C + + # Failed convergence + I = af.abs(Z) > horizon # noqa: E741 math variable + + if not af.any_true(I): + break + + N_[Xi[I] * yn + Yi[I]] = i + 1 + Z_[Xi[I] * yn + Yi[I]] = Z[I] + + # Keep going with those who have not diverged yet + I = af.logical_not(I) + Z = Z[I] + Xi = Xi[I] + Yi = Yi[I] + C = C[I] + + Z_ = Z_.T + N_ = N_.T + af.eval(Z_) + af.eval(N_) + af.sync() + return Z_, N_ + + +FUNCS = {"dpnp": mandelbrot_dpnp, "numpy": mandelbrot_np, "cupy": mandelbrot_cupy, "arrayfire": mandelbrot_af} diff --git a/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py b/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py new file mode 100644 index 0000000..752b701 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py @@ -0,0 +1,51 @@ +from common import * + +ITERATIONS = 1 + + +@pytest.mark.parametrize("pkgid", IDS, ids=IDS) +class TestPi: + def test_pi(self, benchmark, pkgid): + initialize_package(pkgid) + pkg = PKGDICT[pkgid] + + benchmark.extra_info["description"] = f"{NNSIZE:.2e} Samples" + result = benchmark.pedantic(target=FUNCS[pkg.__name__], rounds=ROUNDS, iterations=ITERATIONS, args=[NNSIZE]) + + +# Having the function outside is faster than the lambda inside +def in_circle(x, y): + return (x * x + y * y) < 1 + + +def calc_pi_af(samples): + x = af.randu(samples) + y = af.randu(samples) + result = 4 * af.sum(in_circle(x, y)) / samples + + af.sync() + + return result + + +def calc_pi_numpy(samples): + x = np.random.rand(samples).astype(np.float32) + y = np.random.rand(samples).astype(np.float32) + return 4.0 * np.sum(in_circle(x, y)) / samples + + +def calc_pi_cupy(samples): + x = cupy.random.rand(samples, dtype=np.float32) + y = cupy.random.rand(samples, dtype=np.float32) + res = 4.0 * cupy.sum(in_circle(x, y)) / samples + cupy.cuda.runtime.deviceSynchronize() + return res + + +def calc_pi_dpnp(samples): + x = dpnp.random.rand(samples).astype(dpnp.float32) + y = dpnp.random.rand(samples).astype(dpnp.float32) + return 4.0 * dpnp.sum(in_circle(x, y)) / samples + + +FUNCS = {"dpnp": calc_pi_dpnp, "numpy": calc_pi_numpy, "cupy": calc_pi_cupy, "arrayfire": calc_pi_af} diff --git a/benchmarks/src/pytest_benchmark/test_nbody.py b/benchmarks/src/pytest_benchmark/test_nbody.py new file mode 100644 index 0000000..8d35aa9 --- /dev/null +++ b/benchmarks/src/pytest_benchmark/test_nbody.py @@ -0,0 +1,111 @@ +from common import * + +ITERATIONS = 1 +G = 1 +dt = 1e-3 +softening = 1e-4 +M = 1 + + +@pytest.mark.parametrize("pkgid", IDS, ids=IDS) +class TestNbody: + def test_nbody(self, benchmark, pkgid): + pkg = PKGDICT[pkgid] + setup = lambda: (generate_arrays(pkgid), {}) + + benchmark.extra_info["description"] = f"{NSIZE:.2e} Bodies" + result = benchmark.pedantic(target=nbody, setup=setup, rounds=ROUNDS, iterations=ITERATIONS) + + +def acceleration(pkg, mass, pos): + x = pos[:, 0:1] + y = pos[:, 1:2] + z = pos[:, 2:3] + + # matrix 
+    # matrix that stores all pairwise particle separations: r_j - r_i
+    dx = x.T - x
+    dy = y.T - y
+    dz = z.T - z
+
+    # matrix that stores 1/r^3 for all pairwise particle separations
+    inv_r3 = dx**2 + dy**2 + dz**2 + softening**2
+    inv_r3[inv_r3 > 0] = inv_r3[inv_r3 > 0] ** (-1.5)
+
+    ax = G * (dx * inv_r3) @ mass
+    ay = G * (dy * inv_r3) @ mass
+    az = G * (dz * inv_r3) @ mass
+
+    return pkg.hstack((ax, ay, az))
+
+
+def acceleration_af(mass, pos):
+    x = pos[:, 0:1]
+    y = pos[:, 1:2]
+    z = pos[:, 2:3]
+
+    # matrix that stores all pairwise particle separations: r_j - r_i
+    dx = af.moddims(x, (1, x.shape[0])) - x
+    dy = af.moddims(y, (1, x.shape[0])) - y
+    dz = af.moddims(z, (1, x.shape[0])) - z
+
+    # matrix that stores 1/r^3 for all pairwise particle separations
+    # inv_r3 = dx**2 + dy**2 + dz**2 + softening**2
+    inv_r3 = dx * dx + dy * dy + dz * dz + softening * softening
+    inv_r3 = af.pow(inv_r3, -1.5)
+
+    ax = G * af.matmul((dx * inv_r3), mass)
+    ay = G * af.matmul((dy * inv_r3), mass)
+    az = G * af.matmul((dz * inv_r3), mass)
+
+    return af.join(1, ax, ay, az)
+
+
+def nbody(pkg, mass, pos, vel):
+    # Shift velocities into the center-of-mass frame
+    vel -= pkg.mean(mass * vel, axis=0) / pkg.mean(mass)
+
+    acc = None
+    if pkg.__name__ == "arrayfire":
+        acc = acceleration_af(mass, pos)
+    else:
+        acc = acceleration(pkg, mass, pos)
+
+    # Leapfrog (kick-drift-kick) update; the acceleration is reused across steps
+    for i in range(ITERATIONS):
+        vel += acc * dt / 2.0
+        pos += vel * dt
+        vel += acc * dt / 2.0
+
+    # Reduce to a scalar (kinetic energy) so every backend materializes a result
+    energy = 0.5 * pkg.sum(mass * vel**2)
+
+    return float(energy)
+
+
+def generate_arrays(pkgid):
+    arr_list = []
+    pkg = PKGDICT[pkgid]
+
+    initialize_package(pkgid)
+    pkgname = pkg.__name__
+    count = 2
+    if "cupy" == pkgname:
+        arr_list.append(M * cupy.ones((NSIZE, 1), dtype=DTYPE))
+        for i in range(count):
+            arr_list.append(cupy.random.rand(NSIZE, 3, dtype=DTYPE))
+        cupy.cuda.runtime.deviceSynchronize()
+    elif "arrayfire" == pkgname:
+        af.device_gc()
+        arr_list.append(M * af.constant(1, (NSIZE, 1), dtype=getattr(af, DTYPE)))
+        for i in range(count):
+            arr_list.append(af.randu((NSIZE, 3), dtype=getattr(af, DTYPE)))
+        for arr in arr_list:
+            af.eval(arr)
+        af.sync()
+    elif "dpnp" == pkgname:
+        arr_list.append(M * dpnp.ones((NSIZE, 1), dtype=DTYPE))
+        for i in range(count):
+            arr_list.append(dpnp.random.rand(NSIZE, 3).astype(DTYPE))
+    elif "numpy" == pkgname:
+        arr_list.append(M * np.ones((NSIZE, 1), dtype=DTYPE))
+        for i in range(count):
+            arr_list.append(np.random.rand(NSIZE, 3).astype(DTYPE))
+
+    return (pkg, arr_list[0], arr_list[1], arr_list[2])
diff --git a/benchmarks/src/pytest_benchmark/test_nn.py b/benchmarks/src/pytest_benchmark/test_nn.py
new file mode 100644
index 0000000..7940cb0
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/test_nn.py
@@ -0,0 +1,379 @@
+from common import *
+
+INPUT_SIZE = 28 * 28
+HIDDEN_SIZE = 1000
+OUTPUT_SIZE = 10
+LEARNING_RATE = 0.01
+ITERATIONS = 10
+BATCH_SIZE = 2560
+SAMPLES = 25600
+
+
+@pytest.mark.parametrize("pkgid", IDS, ids=IDS)
+class TestNeuralNetwork:
+    def test_neural_network(self, benchmark, pkgid):
+        initialize_package(pkgid)
+        pkg = PKGDICT[pkgid]
+        nn = {
+            "dpnp": NeuralNetwork_dpnp,
+            "numpy": NeuralNetwork_numpy,
+            "cupy": NeuralNetwork_cupy,
+            "arrayfire": NeuralNetwork_af,
+        }
+
+        obj = nn[pkg.__name__]()
+
+        benchmark.extra_info["description"] = (
+            f"{INPUT_SIZE}x{HIDDEN_SIZE}x{OUTPUT_SIZE} trained with {SAMPLES:.2e} samples"
+        )
+        result = benchmark.pedantic(target=obj.train, rounds=ROUNDS, iterations=1)
+
+
+class NeuralNetwork_numpy:
+    def __init__(self):
+        self.input_size = INPUT_SIZE
+        self.hidden_size = HIDDEN_SIZE
+        self.output_size = OUTPUT_SIZE
+        self.learning_rate = LEARNING_RATE
+
+        # Initialize weights and biases
+        # He initialization (for ReLU) is often a good choice
+        self.W1 = np.random.randn(self.input_size, self.hidden_size) * np.sqrt(2.0 / self.input_size)
+        self.b1 = np.zeros((1, self.hidden_size))
+        self.W2 = np.random.randn(self.hidden_size, self.output_size) * np.sqrt(2.0 / self.hidden_size)
+        self.b2 = np.zeros((1, self.output_size))
+
+        self.X_train = np.random.rand(SAMPLES, INPUT_SIZE)
+        self.y_train = np.zeros((SAMPLES * OUTPUT_SIZE))
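+        # Set one randomly chosen class slot per sample to 1: random one-hot
+        # labels (the benchmark measures training throughput, not accuracy)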
+        self.y_train[
+            np.arange(SAMPLES) * OUTPUT_SIZE + np.floor(np.random.rand(SAMPLES) * OUTPUT_SIZE).astype(int)
+        ] = 1
+        self.y_train = self.y_train.reshape((SAMPLES, OUTPUT_SIZE))
+
+    def relu(self, x):
+        return np.maximum(0, x)
+
+    def relu_derivative(self, x):
+        return (x > 0).astype(float)
+
+    def softmax(self, x):
+        exp_scores = np.exp(x - np.max(x, axis=1, keepdims=True))  # Subtract max for numerical stability
+        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
+
+    def forward(self, X):
+        # Hidden layer
+        self.z1 = np.dot(X, self.W1) + self.b1
+        self.a1 = self.relu(self.z1)
+
+        # Output layer
+        self.z2 = np.dot(self.a1, self.W2) + self.b2
+        self.a2 = self.softmax(self.z2)
+        return self.a2
+
+    def backward(self, X, y, output):
+        # Calculate gradients for the output layer
+        error_output = output - y  # Derivative of cross-entropy loss w.r.t. softmax input
+        dW2 = np.dot(self.a1.T, error_output)
+        db2 = np.sum(error_output, axis=0, keepdims=True)
+
+        # Calculate gradients for the hidden layer
+        error_hidden = np.dot(error_output, self.W2.T) * self.relu_derivative(self.z1)
+        dW1 = np.dot(X.T, error_hidden)
+        db1 = np.sum(error_hidden, axis=0, keepdims=True)
+
+        # Update weights and biases
+        self.W2 -= self.learning_rate * dW2
+        self.b2 -= self.learning_rate * db2
+        self.W1 -= self.learning_rate * dW1
+        self.b1 -= self.learning_rate * db1
+
+    def train(self):
+        X_train = self.X_train
+        y_train = self.y_train
+
+        num_samples = X_train.shape[0]
+
+        for epoch in range(ITERATIONS):
+            # Shuffling is deliberately skipped so every package runs an identical workload
+            X_shuffled = X_train
+            y_shuffled = y_train
+
+            for i in range(0, num_samples, BATCH_SIZE):
+                X_batch = X_shuffled[i : i + BATCH_SIZE, :]
+                y_batch = y_shuffled[i : i + BATCH_SIZE, :]
+
+                # Forward pass
+                output = self.forward(X_batch)
+
+                # Backward pass and update weights
+                self.backward(X_batch, y_batch, output)
+
+    def predict(self, X):
+        return np.argmax(self.forward(X), axis=1)
+
+
+class NeuralNetwork_dpnp:
+    def __init__(self):
+        self.input_size = INPUT_SIZE
+        self.hidden_size = HIDDEN_SIZE
+        self.output_size = OUTPUT_SIZE
+        self.learning_rate = LEARNING_RATE
+
+        # Initialize weights and biases
+        # He initialization (for ReLU) is often a good choice
+        self.W1 = dpnp.random.randn(self.input_size, self.hidden_size) * np.sqrt(2.0 / self.input_size)
+        self.b1 = dpnp.zeros((1, self.hidden_size))
+        self.W2 = dpnp.random.randn(self.hidden_size, self.output_size) * np.sqrt(2.0 / self.hidden_size)
+        self.b2 = dpnp.zeros((1, self.output_size))
+
+        self.X_train = dpnp.random.rand(SAMPLES, INPUT_SIZE)
+        self.y_train = dpnp.zeros((SAMPLES * OUTPUT_SIZE))
+        self.y_train[
+            dpnp.arange(SAMPLES) * OUTPUT_SIZE + dpnp.floor(dpnp.random.rand(SAMPLES) * OUTPUT_SIZE).astype(int)
+        ] = 1
+        self.y_train = self.y_train.reshape((SAMPLES, OUTPUT_SIZE))
+
+    def relu(self, x):
+        return dpnp.maximum(0, x)
+
+    def relu_derivative(self, x):
+        return (x > 0).astype(float)
+
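+    # Every op below stays on dpnp arrays: a stray NumPy call on a dpnp array
+    # can trigger the host NumPy fallback mentioned in the README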
+    def softmax(self, x):
+        exp_scores = dpnp.exp(x - dpnp.max(x, axis=1, keepdims=True))  # Subtract max for numerical stability
+        return exp_scores / dpnp.sum(exp_scores, axis=1, keepdims=True)
+
+    def forward(self, X):
+        # Hidden layer
+        self.z1 = dpnp.dot(X, self.W1) + self.b1
+        self.a1 = self.relu(self.z1)
+
+        # Output layer
+        self.z2 = dpnp.dot(self.a1, self.W2) + self.b2
+        self.a2 = self.softmax(self.z2)
+        return self.a2
+
+    def backward(self, X, y, output):
+        # Calculate gradients for the output layer
+        error_output = output - y  # Derivative of cross-entropy loss w.r.t. softmax input
+        dW2 = dpnp.dot(self.a1.T, error_output)
+        db2 = dpnp.sum(error_output, axis=0, keepdims=True)
+
+        # Calculate gradients for the hidden layer
+        error_hidden = dpnp.dot(error_output, self.W2.T) * self.relu_derivative(self.z1)
+        dW1 = dpnp.dot(X.T, error_hidden)
+        db1 = dpnp.sum(error_hidden, axis=0, keepdims=True)
+
+        # Update weights and biases
+        self.W2 -= self.learning_rate * dW2
+        self.b2 -= self.learning_rate * db2
+        self.W1 -= self.learning_rate * dW1
+        self.b1 -= self.learning_rate * db1
+
+    def train(self):
+        X_train = self.X_train
+        y_train = self.y_train
+
+        num_samples = X_train.shape[0]
+
+        for epoch in range(ITERATIONS):
+            # Shuffling is deliberately skipped so every package runs an identical workload
+            X_shuffled = X_train
+            y_shuffled = y_train
+
+            for i in range(0, num_samples, BATCH_SIZE):
+                X_batch = X_shuffled[i : i + BATCH_SIZE, :]
+                y_batch = y_shuffled[i : i + BATCH_SIZE, :]
+
+                # Forward pass
+                output = self.forward(X_batch)
+
+                # Backward pass and update weights
+                self.backward(X_batch, y_batch, output)
+
+    def predict(self, X):
+        return dpnp.argmax(self.forward(X), axis=1)
+
+
+class NeuralNetwork_cupy:
+    def __init__(self):
+        self.input_size = INPUT_SIZE
+        self.hidden_size = HIDDEN_SIZE
+        self.output_size = OUTPUT_SIZE
+        self.learning_rate = LEARNING_RATE
+
+        # Initialize weights and biases
+        # He initialization (for ReLU) is often a good choice
+        self.W1 = cupy.random.randn(self.input_size, self.hidden_size) * np.sqrt(2.0 / self.input_size)
+        self.b1 = cupy.zeros((1, self.hidden_size))
+        self.W2 = cupy.random.randn(self.hidden_size, self.output_size) * np.sqrt(2.0 / self.hidden_size)
+        self.b2 = cupy.zeros((1, self.output_size))
+
+        self.X_train = cupy.random.rand(SAMPLES, INPUT_SIZE)
+        self.y_train = cupy.zeros((SAMPLES * OUTPUT_SIZE))
+        self.y_train[
+            cupy.arange(SAMPLES) * OUTPUT_SIZE + cupy.floor(cupy.random.rand(SAMPLES) * OUTPUT_SIZE).astype(int)
+        ] = 1
+        self.y_train = self.y_train.reshape((SAMPLES, OUTPUT_SIZE))
+
+        cupy.cuda.runtime.deviceSynchronize()
+
+    def relu(self, x):
+        return cupy.maximum(0, x)
+
+    def relu_derivative(self, x):
+        return (x > 0).astype(float)
+
+    def softmax(self, x):
+        exp_scores = cupy.exp(x - cupy.max(x, axis=1, keepdims=True))  # Subtract max for numerical stability
+        return exp_scores / cupy.sum(exp_scores, axis=1, keepdims=True)
+
+    def forward(self, X):
+        # Hidden layer
+        self.z1 = cupy.dot(X, self.W1) + self.b1
+        self.a1 = self.relu(self.z1)
+
+        # Output layer
+        self.z2 = cupy.dot(self.a1, self.W2) + self.b2
+        self.a2 = self.softmax(self.z2)
+        return self.a2
+
+    def backward(self, X, y, output):
+        # Calculate gradients for the output layer
+        error_output = output - y  # Derivative of cross-entropy loss w.r.t. softmax input
+        dW2 = cupy.dot(self.a1.T, error_output)
+        db2 = cupy.sum(error_output, axis=0, keepdims=True)
+
+        # Calculate gradients for the hidden layer
+        error_hidden = cupy.dot(error_output, self.W2.T) * self.relu_derivative(self.z1)
+        dW1 = cupy.dot(X.T, error_hidden)
+        db1 = cupy.sum(error_hidden, axis=0, keepdims=True)
+
+        # Update weights and biases
+        self.W2 -= self.learning_rate * dW2
+        self.b2 -= self.learning_rate * db2
+        self.W1 -= self.learning_rate * dW1
+        self.b1 -= self.learning_rate * db1
+
+    def train(self):
+        X_train = self.X_train
+        y_train = self.y_train
+
+        num_samples = X_train.shape[0]
+
+        for epoch in range(ITERATIONS):
+            # Shuffling is deliberately skipped so every package runs an identical workload
+            X_shuffled = X_train
+            y_shuffled = y_train
+
+            for i in range(0, num_samples, BATCH_SIZE):
+                X_batch = X_shuffled[i : i + BATCH_SIZE, :]
+                y_batch = y_shuffled[i : i + BATCH_SIZE, :]
+
+                # Forward pass
+                output = self.forward(X_batch)
+
+                # Backward pass and update weights
+                self.backward(X_batch, y_batch, output)
+
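+        # Synchronize once after all epochs so the queued asynchronous GPU work
+        # is included in the measured time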
+        cupy.cuda.runtime.deviceSynchronize()
+
+    def predict(self, X):
+        return cupy.argmax(self.forward(X), axis=1)
+
+
+class NeuralNetwork_af:
+    def __init__(self):
+        self.input_size = INPUT_SIZE
+        self.hidden_size = HIDDEN_SIZE
+        self.output_size = OUTPUT_SIZE
+        self.learning_rate = LEARNING_RATE
+
+        # Initialize weights and biases
+        # He initialization (for ReLU) is often a good choice
+        self.W1 = af.randn((self.input_size, self.hidden_size)) * np.sqrt(2.0 / self.input_size)
+        self.b1 = af.constant(0, (1, self.hidden_size))
+        self.W2 = af.randn((self.hidden_size, self.output_size)) * np.sqrt(2.0 / self.hidden_size)
+        self.b2 = af.constant(0, (1, self.output_size))
+
+        self.X_train = af.randu((SAMPLES, INPUT_SIZE))
+
+        self.y_train = af.constant(0, (SAMPLES, OUTPUT_SIZE))
+        self.y_train[af.iota(SAMPLES), af.floor(af.randu(SAMPLES) * OUTPUT_SIZE)] = 1
+
+        af.eval(self.X_train)
+        af.eval(self.y_train)
+        af.eval(self.W1)
+        af.eval(self.W2)
+        af.eval(self.b1)
+        af.eval(self.b2)
+        af.sync()
+
+    def relu(self, x):
+        selection = x > 0
+        return af.select(x, 0, selection)
+
+    def relu_derivative(self, x):
+        return af.cast(x > 0, getattr(af, DTYPE))
+
+    def softmax(self, x):
+        exp_scores = af.exp(x - af.max(x, axis=1))  # Subtract max for numerical stability
+        return exp_scores / af.sum(exp_scores, axis=1)
+
+    def forward(self, X):
+        # Hidden layer
+        self.z1 = af.matmul(X, self.W1) + self.b1
+        self.a1 = self.relu(self.z1)
+
+        # Output layer
+        self.z2 = af.matmul(self.a1, self.W2) + self.b2
+        self.a2 = self.softmax(self.z2)
+        return self.a2
+
+    def backward(self, X, y, output):
+        # Calculate gradients for the output layer
+        error_output = output - y  # Derivative of cross-entropy loss w.r.t. softmax input
+        dW2 = af.matmul(self.a1.T, error_output)
+        db2 = af.sum(error_output, axis=0)
+
+        # Calculate gradients for the hidden layer
+        error_hidden = af.matmul(error_output, self.W2.T) * self.relu_derivative(self.z1)
+        dW1 = af.matmul(X.T, error_hidden)
+        db1 = af.sum(error_hidden, axis=0)
+
+        # Update weights and biases
+        self.W2 -= self.learning_rate * dW2
+        self.b2 -= self.learning_rate * db2
+        self.W1 -= self.learning_rate * dW1
+        self.b1 -= self.learning_rate * db1
+
+    def train(self):
+        X_train = self.X_train
+        y_train = self.y_train
+
+        num_samples = X_train.shape[0]
+
+        for epoch in range(ITERATIONS):
+            # Shuffling is deliberately skipped so every package runs an identical workload
+            X_shuffled = X_train
+            y_shuffled = y_train
+
+            for i in range(0, num_samples, BATCH_SIZE):
+                X_batch = X_shuffled[i : i + BATCH_SIZE, :]
+                y_batch = y_shuffled[i : i + BATCH_SIZE, :]
+
+                # Forward pass
+                output = self.forward(X_batch)
+
+                # Backward pass and update weights
+                self.backward(X_batch, y_batch, output)
+
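+        # ArrayFire evaluates lazily; force evaluation of the updated weights and
+        # block so the timed region covers the actual training work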
+        af.eval(self.W2)
+        af.eval(self.b2)
+        af.eval(self.W1)
+        af.eval(self.b1)
+        af.sync()
+
+    def predict(self, X):
+        # Emulate argmax: positions where an output equals its row-wise maximum
+        out = self.forward(X)
+        return af.where(out == af.tile(af.max(out, axis=1), (1, out.shape[1])))
diff --git a/benchmarks/src/pytest_benchmark/test_random.py b/benchmarks/src/pytest_benchmark/test_random.py
new file mode 100644
index 0000000..19caebb
--- /dev/null
+++ b/benchmarks/src/pytest_benchmark/test_random.py
@@ -0,0 +1,61 @@
+from common import *
+
+ITERATIONS = 20
+
+
+def randn_np():
+    arr = np.random.normal(size=(NNSIZE))
+
+
+def randn_dpnp():
+    arr = dpnp.random.normal(size=(NNSIZE))
+
+
+def randn_cupy():
+    arr = cupy.random.normal(size=(NNSIZE))
+    cupy.cuda.runtime.deviceSynchronize()
+
+
+def randn_af():
+    arr = af.randn((NNSIZE))
+    af.eval(arr)
+    af.sync()
+
+
+def randu_np():
+    arr = np.random.uniform(size=(NNSIZE))
+
+
+def randu_dpnp():
+    arr = dpnp.random.uniform(size=(NNSIZE))
+
+
+def randu_cupy():
+    arr = cupy.random.uniform(size=(NNSIZE))
+    cupy.cuda.runtime.deviceSynchronize()
+
+
+def randu_af():
+    arr = af.randu((NNSIZE))
+    af.eval(arr)
+    af.sync()
+
+
+@pytest.mark.parametrize("pkgid", IDS, ids=IDS)
+class TestRandom:
+    def test_normal(self, benchmark, pkgid):
+        initialize_package(pkgid)
+
+        pkg = PKGDICT[pkgid]
+        FUNCS = {"dpnp": randn_dpnp, "numpy": randn_np, "cupy": randn_cupy, "arrayfire": randn_af}
+
+        benchmark.extra_info["description"] = f"{NNSIZE:.2e} Samples"
+        result = benchmark.pedantic(target=FUNCS[pkg.__name__], rounds=ROUNDS, iterations=ITERATIONS)
+
+    def test_uniform(self, benchmark, pkgid):
+        initialize_package(pkgid)
+
+        pkg = PKGDICT[pkgid]
+        FUNCS = {"dpnp": randu_dpnp, "numpy": randu_np, "cupy": randu_cupy, "arrayfire": randu_af}
+
+        benchmark.extra_info["description"] = f"{NNSIZE:.2e} Samples"
+        result = benchmark.pedantic(target=FUNCS[pkg.__name__], rounds=ROUNDS, iterations=ITERATIONS)
diff --git a/benchmarks/src/requirements.txt b/benchmarks/src/requirements.txt
new file mode 100644
index 0000000..30b908b
--- /dev/null
+++ b/benchmarks/src/requirements.txt
@@ -0,0 +1,10 @@
+--extra-index-url https://software.repos.intel.com/python/pypi
+dpnp
+numpy
+cupy-cuda12x
+pytest
+pytest-benchmark
+matplotlib
+gdown
+pandas
+# arrayfire