From 15d981aebbfa4869d7aa4b2d61033fcd8935f41b Mon Sep 17 00:00:00 2001
From: FattiMei <teomei68@gmail.com>
Date: Mon, 15 Jul 2024 22:13:50 +0200
Subject: [PATCH 1/4] Add example that profiles parallel sum

---
 examples/demo_flops.py | 66 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 examples/demo_flops.py

diff --git a/examples/demo_flops.py b/examples/demo_flops.py
new file mode 100644
index 000000000..1c4193308
--- /dev/null
+++ b/examples/demo_flops.py
@@ -0,0 +1,66 @@
+import pyopencl as cl
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+src = """
+    __kernel void sum(__global T *x, __global T *y, __global T *z) {
+        const int i = get_global_id(0);
+
+        z[i] = x[i] + y[i];
+    }
+"""
+
+
+# allocates buffers of increasing size, for each run do a parallel sum interpreting
+# the buffer as an array of i8, i16, ...
+# profile the kernels to find the throughput in GFLOPS, useful to estimate the raw computational speed of the hardware
+if __name__ == '__main__':
+    types = [
+        ('i8' , 'char'  , 1),
+        ('i16', 'short' , 2),
+        ('i32', 'int'   , 4),
+        ('i64', 'long'  , 8),
+        # ('f16', 'half'  , 2),
+        ('f32', 'float' , 4),
+        ('f64', 'double', 8)
+    ]
+
+
+    ctx   = cl.create_some_context()
+    queue = cl.CommandQueue(ctx, properties = cl.command_queue_properties.PROFILING_ENABLE)
+
+
+    buffer_size = [2 ** i for i in range(10, 31)]
+    data = np.zeros((len(buffer_size), len(types)))
+
+    for row, nbytes in enumerate(buffer_size):
+        x = cl.Buffer(ctx, cl.mem_flags.READ_ONLY,  nbytes)
+        y = cl.Buffer(ctx, cl.mem_flags.READ_ONLY,  nbytes)
+        z = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, nbytes)
+
+        for col, (label, literal, sizeof) in enumerate(types):
+            sums    = nbytes // sizeof
+            header  = f'#define T {literal}\n'
+            kernel  = cl.Program(ctx, header + src).build().sum
+
+            event   = kernel(queue, (sums,), None, x, y, z)
+            event.wait()
+
+            FLOPS = 1e9 * sums / (event.profile.end - event.profile.start)
+            GFLOPS = FLOPS / 1e6
+
+            data[row, col] = GFLOPS
+
+        x.release()
+        y.release()
+        z.release()
+
+    for col, (_, label, _) in enumerate(types):
+        plt.semilogx(buffer_size, data[:, col], label = label)
+
+    plt.title(f'{ctx.devices[0].name}')
+    plt.legend()
+    plt.xlabel('sizeof(vector)')
+    plt.ylabel('GFLOPS')
+    plt.show()

From dfef25abb980c7a0f94e2d88b0adb5a49a52e225 Mon Sep 17 00:00:00 2001
From: FattiMei <teomei68@gmail.com>
Date: Tue, 16 Jul 2024 07:12:48 +0200
Subject: [PATCH 2/4] refactor: comply with ruff requirements

---
 examples/demo_flops.py | 50 ++++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/examples/demo_flops.py b/examples/demo_flops.py
index 1c4193308..9eb918b18 100644
--- a/examples/demo_flops.py
+++ b/examples/demo_flops.py
@@ -1,6 +1,6 @@
-import pyopencl as cl
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+import pyopencl as cl
 
 
 src = """
@@ -14,22 +14,24 @@
 
 # allocates buffers of increasing size, for each run do a parallel sum interpreting
 # the buffer as an array of i8, i16, ...
-# profile the kernels to find the throughput in GFLOPS, useful to estimate the raw computational speed of the hardware
-if __name__ == '__main__':
+# profile the kernels to find the throughput in GFLOPS, useful to estimate the raw
+# computational speed of the hardware
+if __name__ == "__main__":
     types = [
-        ('i8' , 'char'  , 1),
-        ('i16', 'short' , 2),
-        ('i32', 'int'   , 4),
-        ('i64', 'long'  , 8),
-        # ('f16', 'half'  , 2),
-        ('f32', 'float' , 4),
-        ('f64', 'double', 8)
+        ("i8", "char", 1),
+        ("i16", "short", 2),
+        ("i32", "int", 4),
+        ("i64", "long", 8),
+        # ("f16", "half"  , 2),
+        ("f32", "float", 4),
+        ("f64", "double", 8)
     ]
 
-
-    ctx   = cl.create_some_context()
-    queue = cl.CommandQueue(ctx, properties = cl.command_queue_properties.PROFILING_ENABLE)
-
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(
+        ctx,
+        properties=cl.command_queue_properties.PROFILING_ENABLE
+    )
 
     buffer_size = [2 ** i for i in range(10, 31)]
     data = np.zeros((len(buffer_size), len(types)))
@@ -39,12 +41,12 @@
         y = cl.Buffer(ctx, cl.mem_flags.READ_ONLY,  nbytes)
         z = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, nbytes)
 
-        for col, (label, literal, sizeof) in enumerate(types):
-            sums    = nbytes // sizeof
-            header  = f'#define T {literal}\n'
-            kernel  = cl.Program(ctx, header + src).build().sum
+        for col, (_label, literal, sizeof) in enumerate(types):
+            sums = nbytes // sizeof
+            header = f"#define T {literal}\n"
+            kernel = cl.Program(ctx, header + src).build().sum
 
-            event   = kernel(queue, (sums,), None, x, y, z)
+            event = kernel(queue, (sums,), None, x, y, z)
             event.wait()
 
             FLOPS = 1e9 * sums / (event.profile.end - event.profile.start)
@@ -57,10 +59,10 @@
         z.release()
 
     for col, (_, label, _) in enumerate(types):
-        plt.semilogx(buffer_size, data[:, col], label = label)
+        plt.semilogx(buffer_size, data[:, col], label=label)
 
-    plt.title(f'{ctx.devices[0].name}')
+    plt.title(f"{ctx.devices[0].name}")
     plt.legend()
-    plt.xlabel('sizeof(vector)')
-    plt.ylabel('GFLOPS')
+    plt.xlabel("sizeof(vector)")
+    plt.ylabel("GFLOPS")
     plt.show()

From df99f35c70e52ae454bfc40915047665513902b8 Mon Sep 17 00:00:00 2001
From: FattiMei <teomei68@gmail.com>
Date: Tue, 16 Jul 2024 07:17:02 +0200
Subject: [PATCH 3/4] Add warm-up runs and multiple measurements per run

---
 examples/demo_flops.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/examples/demo_flops.py b/examples/demo_flops.py
index 9eb918b18..06ab20844 100644
--- a/examples/demo_flops.py
+++ b/examples/demo_flops.py
@@ -1,5 +1,6 @@
 import matplotlib.pyplot as plt
 import numpy as np
+
 import pyopencl as cl
 
 
@@ -11,6 +12,10 @@
     }
 """
 
+MAX_ALLOCATION_SIZE = 2 ** 30
+WARM_UP_RUNS = 4
+HOT_RUNS = 10
+
 
 # allocates buffers of increasing size, for each run do a parallel sum interpreting
 # the buffer as an array of i8, i16, ...
@@ -33,7 +38,7 @@
         properties=cl.command_queue_properties.PROFILING_ENABLE
     )
 
-    buffer_size = [2 ** i for i in range(10, 31)]
+    buffer_size = [2 ** i for i in range(10, 31) if 2 ** i < MAX_ALLOCATION_SIZE]
     data = np.zeros((len(buffer_size), len(types)))
 
     for row, nbytes in enumerate(buffer_size):
@@ -46,10 +51,23 @@
             header = f"#define T {literal}\n"
             kernel = cl.Program(ctx, header + src).build().sum
 
-            event = kernel(queue, (sums,), None, x, y, z)
-            event.wait()
+            # enqueue the warm-up and the measured launches back to back and block
+            # only on the last one
+            # NOTE(review): assumes the queue runs commands in order, so that the
+            # last event completing implies all the earlier ones did - confirm
+            events = [
+                kernel(queue, (sums,), None, x, y, z)
+                for _ in range(WARM_UP_RUNS + HOT_RUNS)
+            ]
+            events[-1].wait()
+            # discard the warm-up launches, only the hot runs are averaged
+            events = events[WARM_UP_RUNS:]
 
-            FLOPS = 1e9 * sums / (event.profile.end - event.profile.start)
+            # timestamps are in nanoseconds, hence the 1e9 factor to get sums/second
+            FLOPS = np.mean(
+                1e9 * sums / np.array([e.profile.end - e.profile.start for e in events])
+            )
-            GFLOPS = FLOPS / 1e6
+            # 1 GFLOPS = 1e9 FLOPS (dividing by 1e6 would give MFLOPS)
+            GFLOPS = FLOPS / 1e9
 
             data[row, col] = GFLOPS

From 72b1ab45b991ecd222b953f644354fa75153fd53 Mon Sep 17 00:00:00 2001
From: FattiMei <teomei68@gmail.com>
Date: Tue, 16 Jul 2024 07:24:36 +0200
Subject: [PATCH 4/4] Add matplotlib dependency to examples ci

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index dd89ec63c..06d631a15 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -153,7 +153,7 @@ jobs:
         -   uses: actions/checkout@v4
         -   name: "Main Script"
             run: |
-                EXTRA_INSTALL="pillow cgen mako imageio"
+                EXTRA_INSTALL="pillow cgen mako imageio matplotlib"
 
                 curl -L -O https://tiker.net/ci-support-v0
                 . ci-support-v0