From 15d981aebbfa4869d7aa4b2d61033fcd8935f41b Mon Sep 17 00:00:00 2001
From: FattiMei
Date: Mon, 15 Jul 2024 22:13:50 +0200
Subject: [PATCH 1/4] Add example that profiles parallel sum

---
Reviewer note: the throughput conversion used to divide FLOPS by 1e6, which
yields MFLOPS although the value is stored and plotted as GFLOPS. It now
divides by 1e9. The corrected line is also carried as context in patch 3/4.
Index blob hashes below were not regenerated after this correction.

 examples/demo_flops.py | 66 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 examples/demo_flops.py

diff --git a/examples/demo_flops.py b/examples/demo_flops.py
new file mode 100644
index 000000000..1c4193308
--- /dev/null
+++ b/examples/demo_flops.py
@@ -0,0 +1,66 @@
+import pyopencl as cl
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+src = """
+    __kernel void sum(__global T *x, __global T *y, __global T *z) {
+        const int i = get_global_id(0);
+
+        z[i] = x[i] + y[i];
+    }
+"""
+
+
+# allocates buffers of increasing size, for each run do a parallel sum interpreting
+# the buffer as an array of i8, i16, ...
+# profile the kernels to find the throughput in GFLOPS, useful to estimate the raw computational speed of the hardware
+if __name__ == '__main__':
+    types = [
+        ('i8' , 'char' , 1),
+        ('i16', 'short' , 2),
+        ('i32', 'int' , 4),
+        ('i64', 'long' , 8),
+        # ('f16', 'half' , 2),
+        ('f32', 'float' , 4),
+        ('f64', 'double', 8)
+    ]
+
+
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(ctx, properties = cl.command_queue_properties.PROFILING_ENABLE)
+
+
+    buffer_size = [2 ** i for i in range(10, 31)]
+    data = np.zeros((len(buffer_size), len(types)))
+
+    for row, nbytes in enumerate(buffer_size):
+        x = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, nbytes)
+        y = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, nbytes)
+        z = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, nbytes)
+
+        for col, (label, literal, sizeof) in enumerate(types):
+            sums = nbytes // sizeof
+            header = f'#define T {literal}\n'
+            kernel = cl.Program(ctx, header + src).build().sum
+
+            event = kernel(queue, (sums,), None, x, y, z)
+            event.wait()
+
+            FLOPS = 1e9 * sums / (event.profile.end - event.profile.start)
+            GFLOPS = FLOPS / 1e9  # giga = 1e9 (dividing by 1e6 would give MFLOPS)
+
+            data[row, col] = GFLOPS
+
+        x.release()
+        y.release()
+        z.release()
+
+    for col, (_, label, _) in enumerate(types):
+        plt.semilogx(buffer_size, data[:, col], label = label)
+
+    plt.title(f'{ctx.devices[0].name}')
+    plt.legend()
+    plt.xlabel('sizeof(vector)')
+    plt.ylabel('GFLOPS')
+    plt.show()

From dfef25abb980c7a0f94e2d88b0adb5a49a52e225 Mon Sep 17 00:00:00 2001
From: FattiMei
Date: Tue, 16 Jul 2024 07:12:48 +0200
Subject: [PATCH 2/4] refactor: comply with ruff requirements

---
 examples/demo_flops.py | 50 ++++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/examples/demo_flops.py b/examples/demo_flops.py
index 1c4193308..9eb918b18 100644
--- a/examples/demo_flops.py
+++ b/examples/demo_flops.py
@@ -1,6 +1,6 @@
-import pyopencl as cl
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+import pyopencl as cl
 
 
 src = """
@@ -14,22 +14,24 @@
 
 # allocates buffers of increasing size, for each run do a parallel sum interpreting
 # the buffer as an array of i8, i16, ...
-# profile the kernels to find the throughput in GFLOPS, useful to estimate the raw computational speed of the hardware
-if __name__ == '__main__':
+# profile the kernels to find the throughput in GFLOPS, useful to estimate the raw
+# computational speed of the hardware
+if __name__ == "__main__":
     types = [
-        ('i8' , 'char' , 1),
-        ('i16', 'short' , 2),
-        ('i32', 'int' , 4),
-        ('i64', 'long' , 8),
-        # ('f16', 'half' , 2),
-        ('f32', 'float' , 4),
-        ('f64', 'double', 8)
+        ("i8", "char", 1),
+        ("i16", "short", 2),
+        ("i32", "int", 4),
+        ("i64", "long", 8),
+        # ("f16", "half" , 2),
+        ("f32", "float", 4),
+        ("f64", "double", 8)
     ]
 
-
-    ctx = cl.create_some_context()
-    queue = cl.CommandQueue(ctx, properties = cl.command_queue_properties.PROFILING_ENABLE)
-
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(
+        ctx,
+        properties=cl.command_queue_properties.PROFILING_ENABLE
+    )
 
     buffer_size = [2 ** i for i in range(10, 31)]
     data = np.zeros((len(buffer_size), len(types)))
@@ -39,12 +41,12 @@
         y = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, nbytes)
         z = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, nbytes)
 
-        for col, (label, literal, sizeof) in enumerate(types):
-            sums = nbytes // sizeof
-            header = f'#define T {literal}\n'
-            kernel = cl.Program(ctx, header + src).build().sum
+        for col, (_label, literal, sizeof) in enumerate(types):
+            sums = nbytes // sizeof
+            header = f"#define T {literal}\n"
+            kernel = cl.Program(ctx, header + src).build().sum
 
-            event = kernel(queue, (sums,), None, x, y, z)
+            event = kernel(queue, (sums,), None, x, y, z)
             event.wait()
 
             FLOPS = 1e9 * sums / (event.profile.end - event.profile.start)
@@ -57,10 +59,10 @@
         z.release()
 
     for col, (_, label, _) in enumerate(types):
-        plt.semilogx(buffer_size, data[:, col], label = label)
+        plt.semilogx(buffer_size, data[:, col], label=label)
 
-    plt.title(f'{ctx.devices[0].name}')
+    plt.title(f"{ctx.devices[0].name}")
     plt.legend()
-    plt.xlabel('sizeof(vector)')
-    plt.ylabel('GFLOPS')
+    plt.xlabel("sizeof(vector)")
+    plt.ylabel("GFLOPS")
     plt.show()

From df99f35c70e52ae454bfc40915047665513902b8 Mon Sep 17 00:00:00 2001
From: FattiMei
Date: Tue, 16 Jul 2024 07:17:02 +0200
Subject: [PATCH 3/4] Add warm-up runs and multiple measurements per run

---
 examples/demo_flops.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/examples/demo_flops.py b/examples/demo_flops.py
index 9eb918b18..06ab20844 100644
--- a/examples/demo_flops.py
+++ b/examples/demo_flops.py
@@ -1,5 +1,6 @@
 import matplotlib.pyplot as plt
 import numpy as np
+
 import pyopencl as cl
 
 
@@ -11,6 +12,10 @@
     }
 """
 
+MAX_ALLOCATION_SIZE = 2 ** 30
+WARM_UP_RUNS = 4
+HOT_RUNS = 10
+
 
 # allocates buffers of increasing size, for each run do a parallel sum interpreting
 # the buffer as an array of i8, i16, ...
@@ -33,7 +38,7 @@
         properties=cl.command_queue_properties.PROFILING_ENABLE
     )
 
-    buffer_size = [2 ** i for i in range(10, 31)]
+    buffer_size = [2 ** i for i in range(10, 31) if 2 ** i < MAX_ALLOCATION_SIZE]
     data = np.zeros((len(buffer_size), len(types)))
 
     for row, nbytes in enumerate(buffer_size):
@@ -46,10 +51,16 @@
             header = f"#define T {literal}\n"
             kernel = cl.Program(ctx, header + src).build().sum
 
-            event = kernel(queue, (sums,), None, x, y, z)
-            event.wait()
+            events = [
+                kernel(queue, (sums,), None, x, y, z)
+                for _ in range(WARM_UP_RUNS + HOT_RUNS)
+            ]
+            events[-1].wait()
+            events = events[WARM_UP_RUNS:]
 
-            FLOPS = 1e9 * sums / (event.profile.end - event.profile.start)
+            FLOPS = np.mean(
+                1e9 * sums / np.array([e.profile.end - e.profile.start for e in events])
+            )
             GFLOPS = FLOPS / 1e9  # giga = 1e9 (dividing by 1e6 would give MFLOPS)
 
             data[row, col] = GFLOPS

From 72b1ab45b991ecd222b953f644354fa75153fd53 Mon Sep 17 00:00:00 2001
From: FattiMei
Date: Tue, 16 Jul 2024 07:24:36 +0200
Subject: [PATCH 4/4] Add matplotlib dependency to examples ci

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index dd89ec63c..06d631a15 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -153,7 +153,7 @@ jobs:
         -   uses: actions/checkout@v4
         -   name: "Main Script"
             run: |
-                EXTRA_INSTALL="pillow cgen mako imageio"
+                EXTRA_INSTALL="pillow cgen mako imageio matplotlib"
 
                 curl -L -O https://tiker.net/ci-support-v0
                 . ci-support-v0