From 803313b73a3ed05e5f8f741e1a1d2ad44e1e7bbb Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 30 Aug 2025 21:58:52 +0000
Subject: [PATCH 1/5] Initial plan


From 84e9584ce54d8ef498ce3853d4d05a9310a67a48 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 30 Aug 2025 22:04:33 +0000
Subject: [PATCH 2/5] Implement pytest for 04_atomic_add/atomic_add_bench.py

Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
 examples/04_atomic_add/atomic_add_bench.py | 38 +++++++++++++
 tests/examples/test_atomic_add_bench.py    | 62 ++++++++++++++++++++++
 2 files changed, 100 insertions(+)
 create mode 100644 tests/examples/test_atomic_add_bench.py

diff --git a/examples/04_atomic_add/atomic_add_bench.py b/examples/04_atomic_add/atomic_add_bench.py
index 6b292736..0f45331b 100755
--- a/examples/04_atomic_add/atomic_add_bench.py
+++ b/examples/04_atomic_add/atomic_add_bench.py
@@ -82,6 +82,44 @@ def parse_args():
     return vars(parser.parse_args())
 
 
+def bench_atomic_add(
+    shmem,
+    source_rank,
+    destination_rank,
+    source_buffer,
+    result_buffer,
+    BLOCK_SIZE,
+    dtype,
+    verbose=False,
+    validate=False,
+    num_experiments=1,
+    num_warmup=0,
+):
+    """
+    Wrapper function for testing compatibility, follows the same signature as bench_load.
+    """
+    # Convert dtype to string for args dict
+    dtype_str_map = {
+        torch.int8: "int8",
+        torch.float16: "fp16",
+        torch.bfloat16: "bf16",
+        torch.float32: "fp32",
+    }
+    datatype_str = dtype_str_map.get(dtype, "fp16")
+
+    # Create args dict as expected by run_experiment
+    args = {
+        "datatype": datatype_str,
+        "block_size": BLOCK_SIZE,
+        "verbose": verbose,
+        "validate": validate,
+        "num_experiments": num_experiments,
+        "num_warmup": num_warmup,
+    }
+
+    return run_experiment(shmem, args, source_rank, destination_rank, source_buffer, result_buffer)
+
+
 def run_experiment(shmem, args, source_rank, destination_rank, source_buffer, result_buffer):
     dtype = torch_dtype_from_str(args["datatype"])
     cur_rank = shmem.get_rank()
diff --git a/tests/examples/test_atomic_add_bench.py b/tests/examples/test_atomic_add_bench.py
new file mode 100644
index 00000000..2749b1ce
--- /dev/null
+++ b/tests/examples/test_atomic_add_bench.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+import pytest
+import torch
+import triton
+import triton.language as tl
+import numpy as np
+import iris
+
+import importlib.util
+from pathlib import Path
+
+current_dir = Path(__file__).parent
+file_path = (current_dir / "../../examples/04_atomic_add/atomic_add_bench.py").resolve()
+module_name = "atomic_add_bench"
+spec = importlib.util.spec_from_file_location(module_name, file_path)
+module = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(module)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int8,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "buffer_size, heap_size",
+    [
+        ((1 << 32), (1 << 33)),
+    ],
+)
+@pytest.mark.parametrize(
+    "block_size",
+    [
+        512,
+        1024,
+    ],
+)
+def test_atomic_add_bench(dtype, buffer_size, heap_size, block_size):
+    shmem = iris.iris(heap_size)
+    num_ranks = shmem.get_num_ranks()
+
+    bandwidth_matrix = np.zeros((num_ranks, num_ranks), dtype=np.float32)
+    element_size_bytes = torch.tensor([], dtype=dtype).element_size()
+    source_buffer = shmem.arange(buffer_size // element_size_bytes, dtype=dtype)
+    result_buffer = shmem.zeros_like(source_buffer)
+
+    shmem.barrier()
+
+    for source_rank in range(num_ranks):
+        for destination_rank in range(num_ranks):
+            bandwidth_gbps = module.bench_atomic_add(
+                shmem, source_rank, destination_rank, source_buffer, result_buffer, block_size, dtype
+            )
+            bandwidth_matrix[source_rank, destination_rank] = bandwidth_gbps
+            shmem.barrier()

From 192ba5b7304c8e3b10a3df3426f7b00a6c8424c1 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 30 Aug 2025 22:06:02 +0000
Subject: [PATCH 3/5] Update atomic_add README with correct title and testing
 instructions

Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
 examples/04_atomic_add/README.md | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/examples/04_atomic_add/README.md b/examples/04_atomic_add/README.md
index 242cabe5..a582b93a 100644
--- a/examples/04_atomic_add/README.md
+++ b/examples/04_atomic_add/README.md
@@ -3,9 +3,9 @@ SPDX-License-Identifier: MIT
 Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
 -->
 
-# Load benchmark
+# Atomic Add benchmark
 
-Load benchmark using Iris.
+Atomic Add benchmark using Iris.
 
 ## Usage
 
@@ -24,4 +24,14 @@ GPU 04  ->     15.58     15.68     15.21     15.32    769.53     15.67     15.58
 GPU 05  ->     15.59     15.49     15.24     15.50     15.57    773.01     15.67     15.59
 GPU 06  ->     15.41     15.41     15.15     15.06     15.50     15.67    778.30     15.58
 GPU 07  ->     15.22     15.33     15.07     15.06     15.66     15.54     15.56    765.45
-```
\ No newline at end of file
+```
+
+## Testing
+
+The benchmark can be tested using pytest:
+
+```terminal
+pytest tests/examples/test_atomic_add_bench.py
+```
+
+This will run parametrized tests across different data types (int8, fp16, bf16, fp32), buffer sizes, and block sizes to ensure the atomic add functionality works correctly.

From 6d3ad46cb366a2a64684f11a9a91fee354134d5c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 31 Aug 2025 03:32:14 +0000
Subject: [PATCH 4/5] Fix HIP kernel configuration error in atomic_add_bench
 test by using shmem.ones() instead of shmem.arange()

Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
 tests/examples/test_atomic_add_bench.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/examples/test_atomic_add_bench.py b/tests/examples/test_atomic_add_bench.py
index 2749b1ce..c3fa2577 100644
--- a/tests/examples/test_atomic_add_bench.py
+++ b/tests/examples/test_atomic_add_bench.py
@@ -48,7 +48,7 @@ def test_atomic_add_bench(dtype, buffer_size, heap_size, block_size):
 
     bandwidth_matrix = np.zeros((num_ranks, num_ranks), dtype=np.float32)
     element_size_bytes = torch.tensor([], dtype=dtype).element_size()
-    source_buffer = shmem.arange(buffer_size // element_size_bytes, dtype=dtype)
+    source_buffer = shmem.ones(buffer_size // element_size_bytes, dtype=dtype)
     result_buffer = shmem.zeros_like(source_buffer)
 
     shmem.barrier()

From 26b0163fde44606adf3db78a5d6b20b8cdf3d41a Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 31 Aug 2025 04:10:48 +0000
Subject: [PATCH 5/5] Remove int8 support from atomic_add_bench - not supported
 by atomic_add operation

Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
 examples/04_atomic_add/atomic_add_bench.py | 8 +++++---
 tests/examples/test_atomic_add_bench.py    | 3 ++-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/examples/04_atomic_add/atomic_add_bench.py b/examples/04_atomic_add/atomic_add_bench.py
index 0f45331b..ce29764c 100755
--- a/examples/04_atomic_add/atomic_add_bench.py
+++ b/examples/04_atomic_add/atomic_add_bench.py
@@ -45,7 +45,8 @@ def torch_dtype_from_str(datatype: str) -> torch.dtype:
     dtype_map = {
         "fp16": torch.float16,
         "fp32": torch.float32,
-        "int8": torch.int8,
+        "int32": torch.int32,
+        "int64": torch.int64,
         "bf16": torch.bfloat16,
     }
     try:
@@ -65,7 +66,7 @@ def parse_args():
         "--datatype",
         type=str,
         default="fp16",
-        choices=["fp16", "fp32", "int8", "bf16"],
+        choices=["fp16", "fp32", "int32", "int64", "bf16"],
         help="Datatype of computation",
     )
     parser.add_argument("-z", "--buffer_size", type=int, default=1 << 32, help="Buffer Size")
@@ -100,7 +101,8 @@ def bench_atomic_add(
     """
     # Convert dtype to string for args dict
     dtype_str_map = {
-        torch.int8: "int8",
+        torch.int32: "int32",
+        torch.int64: "int64",
         torch.float16: "fp16",
         torch.bfloat16: "bf16",
         torch.float32: "fp32",
diff --git a/tests/examples/test_atomic_add_bench.py b/tests/examples/test_atomic_add_bench.py
index c3fa2577..c8acd418 100644
--- a/tests/examples/test_atomic_add_bench.py
+++ b/tests/examples/test_atomic_add_bench.py
@@ -23,7 +23,8 @@
 @pytest.mark.parametrize(
     "dtype",
     [
-        torch.int8,
+        torch.int32,
+        torch.int64,
         torch.float16,
         torch.bfloat16,
         torch.float32,