diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index dd89ec63c..06d631a15 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -153,7 +153,7 @@ jobs:
     - uses: actions/checkout@v4
     - name: "Main Script"
       run: |
-        EXTRA_INSTALL="pillow cgen mako imageio"
+        EXTRA_INSTALL="pillow cgen mako imageio matplotlib"
         curl -L -O https://tiker.net/ci-support-v0
         . ci-support-v0
diff --git a/examples/demo_flops.py b/examples/demo_flops.py
new file mode 100644
index 000000000..06ab20844
--- /dev/null
+++ b/examples/demo_flops.py
@@ -0,0 +1,79 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+import pyopencl as cl
+
+
+src = """
+    __kernel void sum(__global T *x, __global T *y, __global T *z) {
+        const int i = get_global_id(0);
+
+        z[i] = x[i] + y[i];
+    }
+"""
+
+MAX_ALLOCATION_SIZE = 2 ** 30
+WARM_UP_RUNS = 4
+HOT_RUNS = 10
+
+
+# Allocate buffers of increasing size and, for each size, run a parallel
+# element-wise sum interpreting the buffer as an array of i8, i16, ...
+# The kernels are profiled to estimate throughput in GFLOPS, which gives a
+# rough measure of the raw computational speed of the hardware.
+if __name__ == "__main__":
+    types = [
+        ("i8", "char", 1),
+        ("i16", "short", 2),
+        ("i32", "int", 4),
+        ("i64", "long", 8),
+        # ("f16", "half", 2),
+        ("f32", "float", 4),
+        ("f64", "double", 8)
+    ]
+
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(
+        ctx,
+        properties=cl.command_queue_properties.PROFILING_ENABLE
+    )
+
+    buffer_size = [2 ** i for i in range(10, 31) if 2 ** i < MAX_ALLOCATION_SIZE]
+    data = np.zeros((len(buffer_size), len(types)))
+
+    for row, nbytes in enumerate(buffer_size):
+        x = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, nbytes)
+        y = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, nbytes)
+        z = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, nbytes)
+
+        for col, (_label, literal, sizeof) in enumerate(types):
+            sums = nbytes // sizeof
+            header = f"#define T {literal}\n"
+            kernel = cl.Program(ctx, header + src).build().sum
+
+            events = [
+                kernel(queue, (sums,), None, x, y, z)
+                for _ in range(WARM_UP_RUNS + HOT_RUNS)
+            ]
+            events[-1].wait()
+            events = events[WARM_UP_RUNS:]
+
+            FLOPS = np.mean(
+                1e9 * sums / np.array([e.profile.end - e.profile.start for e in events])
+            )
+            GFLOPS = FLOPS / 1e9
+
+            data[row, col] = GFLOPS
+
+        x.release()
+        y.release()
+        z.release()
+
+    for col, (_, label, _) in enumerate(types):
+        plt.semilogx(buffer_size, data[:, col], label=label)
+
+    plt.title(f"{ctx.devices[0].name}")
+    plt.legend()
+    plt.xlabel("buffer size (bytes)")
+    plt.ylabel("GFLOPS")
+    plt.show()
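
A side note on the profiling pattern the new demo builds on: with PROFILING_ENABLE set on the command queue, an event's profile.start and profile.end are device timestamps in nanoseconds, so dividing the number of operations by the elapsed nanoseconds gives a rate directly in Gop/s. Below is a minimal, self-contained sketch of that pattern (not part of the commit); the "scale" kernel and problem size are illustrative only.

import pyopencl as cl

# Sketch: time a single kernel launch via OpenCL event profiling.
ctx = cl.create_some_context()
queue = cl.CommandQueue(
    ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

prg = cl.Program(ctx, """
    __kernel void scale(__global float *a) {
        a[get_global_id(0)] *= 2.0f;
    }
""").build()

n = 1 << 20
a = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * n)

evt = prg.scale(queue, (n,), None, a)
evt.wait()

# profile.start/end are device timestamps in nanoseconds,
# so ops per elapsed nanosecond is already Gop/s.
elapsed_ns = evt.profile.end - evt.profile.start
print(f"{n / elapsed_ns:.2f} Gop/s over {elapsed_ns * 1e-6:.3f} ms")

When run interactively, cl.create_some_context() prompts for a platform/device unless one is preselected via the PYOPENCL_CTX environment variable, which also applies to running examples/demo_flops.py itself.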