diff --git a/.github/workflows/run_simulation_tests.yml b/.github/workflows/run_simulation_tests.yml deleted file mode 100644 index 7ea89ef..0000000 --- a/.github/workflows/run_simulation_tests.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: Run Python Simulation Tests - -on: - push: - branches: [ "main" ] - pull_request: - branches: [ "main" ] - -permissions: - contents: read - -jobs: - build: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - name: Set up Python 3.10 - uses: actions/setup-python@v3 - with: - python-version: "3.10" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com - python -m pip install wget awscli - python -m pip install pytest - python -m pip install neuronx-cc==2.* - - name: Test with pytest - run: | - PYTHONPATH=$PYTHONPATH:src/ pytest test/unit/ --simulation-only \ No newline at end of file diff --git a/test/integration/flash_attention/flash_attention.py b/test/integration/flash_attention/flash_attention.py deleted file mode 100644 index 5e14a95..0000000 --- a/test/integration/flash_attention/flash_attention.py +++ /dev/null @@ -1,160 +0,0 @@ -import os - -import neuronxcc.nki.language as nl -import numpy as np -import torch -import torch_xla.core.xla_model as xm - -from neuronxcc.nki.kernels.attention import flash_attn_bwd, flash_fwd - - -def _flash_attn_forward(q, k, v, causal, mixed_precision, seed, dropout_p, softmax_scale, sliding_window): - # flash_fwd assumes spmd grid uses key/value heads - bs, num_heads_kv, _, _ = k.shape - attn_output, lse = flash_fwd[bs, num_heads_kv]( - q, - k, - v, - seed, - use_causal_mask=causal, - mixed_precision=mixed_precision, - dropout_p=dropout_p, - softmax_scale=softmax_scale, - sliding_window=sliding_window, - ) - return attn_output, lse - - -def _flash_attn_backward( - q, k, v, o, dout, lse, seed, causal, mixed_precision, dropout_p, softmax_scale, sliding_window -): - bs, 
num_heads_kv, _, _ = k.shape - dq, dk, dv = flash_attn_bwd[bs, num_heads_kv]( - q, - k, - v, - o, - dout, - lse, - seed, - use_causal_mask=causal, - mixed_precision=mixed_precision, - dropout_p=dropout_p, - softmax_scale=softmax_scale, - sliding_window=sliding_window, - ) - return dq, dk, dv - - -class NKIAttnFunc(torch.autograd.Function): - @staticmethod - def forward( - ctx, - q, - k, - v, - softmax_scale, - causal: bool, - mixed_precision: bool, - seed, - dropout_p: float, - sliding_window: int, - ): - if softmax_scale is None: - softmax_scale = q.shape[-2] ** (-0.5) - - if seed is None and dropout_p > 0.0: - # NKI only supports 32bit seed - seed = np.array([xm.get_rng_state()]).astype(np.int32) - seed = torch.from_numpy(seed).to(q.device) - - attn_output, lse = _flash_attn_forward( - q, - k, - v.permute(0, 1, 3, 2), - causal=causal, - mixed_precision=mixed_precision, - seed=seed, - dropout_p=dropout_p, - softmax_scale=softmax_scale, - sliding_window=sliding_window, - ) - ctx.save_for_backward(q, k, v, attn_output, lse, seed) - ctx.causal = causal - ctx.mixed_precision = mixed_precision - ctx.dropout_p = dropout_p - ctx.softmax_scale = softmax_scale - ctx.sliding_window = sliding_window - - # Move seed manually if the dropout is used - # https://github.com/pytorch/xla/blob/v1.13.0/torch_xla/csrc/tensor.cpp#L323 - if dropout_p > 0.0: - orig_seed = xm.get_rng_state() - running_seed = (orig_seed * 214013 + 2531011) & 0xFFFFFFFFFFFFFFFF - xm.set_rng_state(int(running_seed)) - return attn_output - - @staticmethod - def backward(ctx, dout, *args): - q, k, v, attn_output, lse, seed = ctx.saved_tensors - dout = dout.permute(0, 1, 3, 2) - attn_output = attn_output.permute(0, 1, 3, 2) - dq, dk, dv = _flash_attn_backward( - q, - k, - v, - attn_output, - dout, - lse, - seed=seed, - causal=ctx.causal, - mixed_precision=ctx.mixed_precision, - dropout_p=ctx.dropout_p, - softmax_scale=ctx.softmax_scale, - sliding_window=ctx.sliding_window, - ) - return dq, dk, dv, None, None, 
None, None, None, None - - -def nki_flash_attn_func( - q, - k, - v, - dropout_p=0.0, - softmax_scale=None, - causal=True, - mixed_precision=True, - seed=None, - sliding_window=-1, -): - """ - Arguments: - q: (batch_size, nheads, seqlen, headdim) - k: (batch_size, nheads_k, seqlen, headdim) - v: (batch_size, nheads_k, seqlen, headdim) - dropout_p: float. Dropout probability. - softmax_scale: float. The scaling of QK^T before applying softmax. - Default to 1 / sqrt(headdim). - causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). - mixed_precision: bool. Whether to enable the higher precisions on the softmax. - seed: int32 torch.Tensor. The seed for the dropout. - - Return: - out: (batch_size, seqlen, nheads, headdim). - """ - - _, _, seqlen, _ = q.shape - if seqlen % 2048 != 0: - raise NotImplementedError("Only support sequence as multiples of 2K") - - # Permute QKV to match the kernel required layouts - q = q.permute(0, 1, 3, 2) - k = k.permute(0, 1, 3, 2) - v = v.permute(0, 1, 3, 2) - - if os.environ.get("XLA_USE_BF16") or os.environ.get("XLA_DOWNCAST_BF16"): - q = q.to(torch.bfloat16) - k = k.to(torch.bfloat16) - v = v.to(torch.bfloat16) - - return NKIAttnFunc.apply(q, k, v, softmax_scale, causal, mixed_precision, seed, dropout_p, sliding_window) diff --git a/test/integration/flash_attention/flash_attention_benchmark.py b/test/integration/flash_attention/flash_attention_benchmark.py deleted file mode 100644 index 918a14f..0000000 --- a/test/integration/flash_attention/flash_attention_benchmark.py +++ /dev/null @@ -1,55 +0,0 @@ -import torch -import unittest -import os -import sys -import pytest -import torch.nn as nn - -os.environ["NEURON_CC_FLAGS"] = "--model-type=transformer --distribution-strategy=llm-training" -os.environ["NEURON_FUSE_SOFTMAX"] = "1" - -import torch_xla -import torch_xla.core.xla_model as xm -from neuronxcc.starfish.support.util import allclose - -from flash_attention import nki_flash_attn_func - 
-parent_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) -sys.path.append(parent_dir) -from perf_utils.LatencyCollector import benchmark - -if len(sys.argv) != 2: - print("Usage: python flash_attention_benchmark.py ") - exit(1) -metric_path = os.path.abspath(sys.argv[1]) - -torch.manual_seed(0) -dtype = torch.bfloat16 -bs = 1 -num_heads = 4 -head_dim = 128 -seq_len = 32*1024 -query_states_cpu = torch.randn(bs, num_heads, seq_len, head_dim, dtype=dtype) - 0.5 -query_states_cpu.requires_grad_() -key_states_cpu = torch.randn(bs, num_heads, seq_len, head_dim, dtype=dtype) - 0.5 -key_states_cpu.requires_grad_() -value_states_cpu = torch.randn(bs, num_heads, seq_len, head_dim, dtype=dtype) - 0.5 -value_states_cpu.requires_grad_() - -# Run the Neuron kernel implementation with torch-xla -query_states_xla = query_states_cpu.to(xm.xla_device()).detach().requires_grad_() -key_states_xla = key_states_cpu.to(xm.xla_device()).detach().requires_grad_() -value_states_xla = value_states_cpu.to(xm.xla_device()).detach().requires_grad_() - -def model_fwd_bwd(q, k, v): - attn_nki = nki_flash_attn_func(q, k, v) - loss_actual = torch.sum(attn_nki**2) - loss_actual.backward() - xm.mark_step() - return loss_actual - -n_runs = 10 -bench_result = benchmark(n_runs, f"flash_attention_bs{bs}_heads{num_heads}_seq{seq_len}_head_dim{head_dim}", - model_fwd_bwd, - (query_states_xla, key_states_xla, value_states_xla), - metric_path) \ No newline at end of file diff --git a/test/integration/flash_attention/flash_attention_correctness.py b/test/integration/flash_attention/flash_attention_correctness.py deleted file mode 100644 index 945336f..0000000 --- a/test/integration/flash_attention/flash_attention_correctness.py +++ /dev/null @@ -1,78 +0,0 @@ -import torch -import unittest -import os -import sys -import pytest -import torch.nn as nn - -os.environ["NEURON_CC_FLAGS"] = "--model-type=transformer --distribution-strategy=llm-training" -os.environ["NEURON_FUSE_SOFTMAX"] = "1" - 
-import torch_xla -import torch_xla.core.xla_model as xm -from neuronxcc.starfish.support.util import allclose - -from flash_attention import nki_flash_attn_func - -@pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float32]) -@pytest.mark.parametrize('causal,sliding_window', [(True, -1), (True, 128), (False, -1)]) -@pytest.mark.parametrize('num_heads,num_kv_heads', [(4, 4), (4, 1), (8, 2)]) # MHA, MQA, GQA -def test_attention(dtype, causal, sliding_window, num_heads, num_kv_heads): - def torch_golden_attn_cpu(query_states, key_states, value_states): - if num_heads != num_kv_heads: - key_states = key_states.repeat_interleave(num_heads // num_kv_heads, dim=1) - value_states = value_states.repeat_interleave(num_heads // num_kv_heads, dim=1) - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * (query_states.shape[-1] ** (-0.5)) - - if sliding_window > 0: - causal_mask = torch.triu( - torch.tril(torch.ones(1, 1, query_states.shape[2], key_states.shape[2])), - diagonal=-(sliding_window - 1), - ).bool() - causal_mask = ~causal_mask - attn_weights = attn_weights.masked_fill_(causal_mask, -10000.0) - elif causal: - causal_mask = torch.triu(torch.ones((1, 1, query_states.shape[2], key_states.shape[2])), diagonal=1).bool() - attn_weights = attn_weights.masked_fill_(causal_mask, -10000.0) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.double).to(query_states.dtype) - - attn_output = torch.matmul(attn_weights, value_states) - return attn_output - - torch.manual_seed(0) - bs = 1 - head_dim = 128 - seq_len = 4096 - query_states_cpu = torch.randn(bs, num_heads, seq_len, head_dim, dtype=dtype) - 0.5 - query_states_cpu.requires_grad_() - key_states_cpu = torch.randn(bs, num_kv_heads, seq_len, head_dim, dtype=dtype) - 0.5 - key_states_cpu.requires_grad_() - value_states_cpu = torch.randn(bs, num_kv_heads, seq_len, head_dim, dtype=dtype) - 0.5 - value_states_cpu.requires_grad_() - - # Run the CPU golden results - golden_attn = 
torch_golden_attn_cpu(query_states_cpu, key_states_cpu, value_states_cpu) - loss_golden = torch.sum(golden_attn**2) - loss_golden.backward() - - # Run the Neuron kernel implementation with torch-xla - query_states_xla = query_states_cpu.to(xm.xla_device()).detach().requires_grad_() - key_states_xla = key_states_cpu.to(xm.xla_device()).detach().requires_grad_() - value_states_xla = value_states_cpu.to(xm.xla_device()).detach().requires_grad_() - attn_nki = nki_flash_attn_func( - query_states_xla, key_states_xla, value_states_xla, causal=causal, sliding_window=sliding_window - ) - loss_actual = torch.sum(attn_nki**2) - loss_actual.backward() - xm.mark_step() - - actual_dv = value_states_xla.grad.to('cpu') - actual_dq = query_states_xla.grad.to('cpu') - actual_dk = key_states_xla.grad.to('cpu') - - # Compare against cpu result - assert(allclose(loss_actual.to('cpu').to(torch.float32).detach().numpy(), loss_golden.to(torch.float32).detach().numpy(), atol=1e-5, rtol=0.05, verbose=1)) - assert(allclose(actual_dv.to(torch.float32).numpy(), value_states_cpu.grad.to(torch.float32).numpy(), atol=1e-5, rtol=0.05, verbose=1)) - assert(allclose(actual_dq.to(torch.float32).numpy(), query_states_cpu.grad.to(torch.float32).numpy(), atol=1e-5, rtol=0.05, verbose=1)) - assert(allclose(actual_dk.to(torch.float32).numpy(), key_states_cpu.grad.to(torch.float32).numpy(), atol=1e-5, rtol=0.05, verbose=1)) \ No newline at end of file diff --git a/test/integration/flash_attention/run.sh b/test/integration/flash_attention/run.sh deleted file mode 100644 index 02870a4..0000000 --- a/test/integration/flash_attention/run.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -set -e - -if [ "$#" -ne 1 ]; then - echo "Illegal number of parameters. 
Must call with ./run.sh " - exit 1 -fi - -metric_path=$1 - -echo ${test_name} is writing the benchmark result to ${metric_path} - -pytest flash_attention_correctness.py -python flash_attention_benchmark.py $metric_path - diff --git a/test/integration/fused_sd_attention_small_head/run.sh b/test/integration/fused_sd_attention_small_head/run.sh deleted file mode 100755 index 638bf3c..0000000 --- a/test/integration/fused_sd_attention_small_head/run.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -set -e - - -if [ "$#" -ne 1 ]; then - echo "Illegal number of parameters. Must call with ./run.sh " - exit 1 -fi - -metric_path=$1 - -echo ${test_name} is writing the benchmark result to ${metric_path} - -pip install diffusers==0.20.2 transformers==4.33.1 accelerate==0.22.0 safetensors==0.3.1 matplotlib -python3 sd2_512_compile.py -python3 sd2_512_benchmark.py $1 \ No newline at end of file diff --git a/test/integration/fused_sd_attention_small_head/sd2_512_benchmark.py b/test/integration/fused_sd_attention_small_head/sd2_512_benchmark.py deleted file mode 100644 index 5d63424..0000000 --- a/test/integration/fused_sd_attention_small_head/sd2_512_benchmark.py +++ /dev/null @@ -1,99 +0,0 @@ -import os -os.environ["NEURON_FUSE_SOFTMAX"] = "1" - -import torch -import torch.nn as nn -import torch_neuronx - -from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler -from diffusers.models.unet_2d_condition import UNet2DConditionOutput - -parent_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) -sys.path.append(parent_dir) -from perf_utils.LatencyCollector import benchmark - -import sys -if len(sys.argv) != 2: - print("Usage: python sd2_512_benchmark.py ") - exit(1) -metric_path = os.path.abspath(sys.argv[1]) - -# Define datatype -DTYPE = torch.bfloat16 - -class UNetWrap(nn.Module): - def __init__(self, unet): - super().__init__() - self.unet = unet - - def forward(self, sample, timestep, encoder_hidden_states, cross_attention_kwargs=None): - out_tuple 
= self.unet(sample, timestep, encoder_hidden_states, return_dict=False) - return out_tuple - -class NeuronUNet(nn.Module): - def __init__(self, unetwrap): - super().__init__() - self.unetwrap = unetwrap - self.config = unetwrap.unet.config - self.in_channels = unetwrap.unet.in_channels - self.device = unetwrap.unet.device - - def forward(self, sample, timestep, encoder_hidden_states, cross_attention_kwargs=None, return_dict=False): - sample = self.unetwrap(sample, timestep.to(dtype=DTYPE).expand((sample.shape[0],)), encoder_hidden_states)[0] - return UNet2DConditionOutput(sample=sample) - -class NeuronTextEncoder(nn.Module): - def __init__(self, text_encoder): - super().__init__() - self.neuron_text_encoder = text_encoder - self.config = text_encoder.config - self.dtype = text_encoder.dtype - self.device = text_encoder.device - - def forward(self, emb, attention_mask = None): - return [self.neuron_text_encoder(emb)['last_hidden_state']] - -def decode_latents(self, latents): - latents = latents.to(torch.float) - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clamp(0, 1) - image = image.cpu().permute(0, 2, 3, 1).float().numpy() - return image - -StableDiffusionPipeline.decode_latents = decode_latents - -# --- Load all compiled models and benchmark pipeline --- -COMPILER_WORKDIR_ROOT = 'sd2_compile_dir_512' -model_id = "stabilityai/stable-diffusion-2-1-base" -text_encoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder/model.pt') -decoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_decoder/model.pt') -unet_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'unet/model.pt') -post_quant_conv_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_post_quant_conv/model.pt') - -pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=DTYPE) -pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - -# Load the compiled UNet onto two neuron cores. 
-pipe.unet = NeuronUNet(UNetWrap(pipe.unet)) -#TODO: FIXME, dual core execution is not working on release pipeline at the moment. -device_ids = [0] -pipe.unet.unetwrap = torch_neuronx.DataParallel(torch.jit.load(unet_filename), device_ids, set_dynamic_batching=True) - -class NeuronTypeConversionWrapper(nn.Module): - def __init__(self, network): - super().__init__() - self.network = network - - def forward(self, x): - return self.network(x.float()) - -# Load other compiled models onto a single neuron core. -pipe.text_encoder = NeuronTextEncoder(pipe.text_encoder) -pipe.text_encoder.neuron_text_encoder = torch.jit.load(text_encoder_filename) -pipe.vae.decoder = NeuronTypeConversionWrapper(torch.jit.load(decoder_filename)) -pipe.vae.post_quant_conv = NeuronTypeConversionWrapper(torch.jit.load(post_quant_conv_filename)) - -prompt = "a photo of an astronaut riding a horse on mars" -n_runs = 10 -bench_result = benchmark(n_runs, "stable_diffusion_512", pipe, prompt, metric_path) diff --git a/test/integration/fused_sd_attention_small_head/sd2_512_compile.py b/test/integration/fused_sd_attention_small_head/sd2_512_compile.py deleted file mode 100644 index 4d6710f..0000000 --- a/test/integration/fused_sd_attention_small_head/sd2_512_compile.py +++ /dev/null @@ -1,319 +0,0 @@ -import os -os.environ["NEURON_FUSE_SOFTMAX"] = "1" - -import torch -import torch.nn as nn -import torch_neuronx - -import copy -from diffusers import StableDiffusionPipeline -from diffusers.models.unet_2d_condition import UNet2DConditionOutput - -from packaging import version -import diffusers -diffusers_version = version.parse(diffusers.__version__) -assert diffusers_version >= version.parse('0.18.0'), "Must use diffusers 0.18.0 or newer" - -from diffusers.models.attention_processor import Attention - -# Define datatype -DTYPE = torch.bfloat16 - -# Have to do this double wrapper trick to compile the unet, because -# of the special UNet2DConditionOutput output type. 
-class UNetWrap(nn.Module): - def __init__(self, unet): - super().__init__() - self.unet = unet - - def forward(self, sample, timestep, encoder_hidden_states, cross_attention_kwargs=None): - out_tuple = self.unet(sample, timestep, encoder_hidden_states, return_dict=False) - return out_tuple - -class NeuronUNet(nn.Module): - def __init__(self, unetwrap): - super().__init__() - self.unetwrap = unetwrap - self.config = unetwrap.unet.config - self.in_channels = unetwrap.unet.in_channels - self.device = unetwrap.unet.device - - def forward(self, sample, timestep, encoder_hidden_states, cross_attention_kwargs=None): - sample = self.unetwrap(sample, timestep.to(dtype=DTYPE).expand((sample.shape[0],)), encoder_hidden_states)[0] - return UNet2DConditionOutput(sample=sample) - -class NeuronTextEncoder(nn.Module): - def __init__(self, text_encoder): - super().__init__() - self.neuron_text_encoder = text_encoder - self.config = text_encoder.config - self.dtype = text_encoder.dtype - self.device = text_encoder.device - - def forward(self, emb, attention_mask = None): - return [self.neuron_text_encoder(emb)['last_hidden_state']] - - -# Optimized attention -def get_attention_scores(self, query, key, attn_mask): - dtype = query.dtype - - if self.upcast_attention: - query = query.float() - key = key.float() - - # Check for square matmuls - if(query.size() == key.size()): - attention_scores = custom_badbmm( - key, - query.transpose(-1, -2) - ) - - if self.upcast_softmax: - attention_scores = attention_scores.float() - - attention_probs = attention_scores.softmax(dim=1).permute(0,2,1) - attention_probs = attention_probs.to(dtype) - - else: - attention_scores = custom_badbmm( - query, - key.transpose(-1, -2) - ) - - if self.upcast_softmax: - attention_scores = attention_scores.float() - - attention_probs = attention_scores.softmax(dim=-1) - attention_probs = attention_probs.to(dtype) - - return attention_probs - -# In the original badbmm the bias is all zeros, so only apply scale -def 
custom_badbmm(a, b): - bmm = torch.bmm(a, b) - scaled = bmm * 0.125 - return scaled - - -# For saving compiler artifacts -COMPILER_WORKDIR_ROOT = 'sd2_compile_dir_512' - -# Model ID for SD version pipeline -model_id = "stabilityai/stable-diffusion-2-1-base" - -# --- Compile UNet and save --- -pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=DTYPE) - -from neuronxcc.nki.kernels.attention import fused_self_attn_for_SD_small_head_size -from torch_neuronx import nki_jit - -nki_func = nki_jit(fused_self_attn_for_SD_small_head_size) - -class AttnProcessor: - r""" - Default processor for performing attention-related computations. - """ - - def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - temb=None, - ): - residual = hidden_states - - if attn.spatial_norm is not None: - hidden_states = attn.spatial_norm(hidden_states, temb) - - input_ndim = hidden_states.ndim - - if input_ndim == 4: - batch_size, channel, height, width = hidden_states.shape - hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) - - batch_size, sequence_length, _ = ( - hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape - ) - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - - if attn.group_norm is not None: - hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) - - query = attn.to_q(hidden_states) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - query = attn.head_to_batch_dim(query) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - - if query.shape[1] == 4096 and query.shape[2] <= 128 and \ - query.shape == key.shape and query.shape == 
value.shape: - kernel = nki_func - # breakpoint() - result = torch.empty(value.shape, dtype=value.dtype, device=query.device) - # Each batch is independeant, use a launch grid of size batch size - kernel[query.shape[0]](query.transpose(1, 2), key, value, result) - hidden_states = result - else: - attention_probs = attn.get_attention_scores(query, key, attention_mask) - hidden_states = torch.bmm(attention_probs, value) - hidden_states = attn.batch_to_head_dim(hidden_states) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - if input_ndim == 4: - hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) - - if attn.residual_connection: - hidden_states = hidden_states + residual - - hidden_states = hidden_states / attn.rescale_output_factor - - return hidden_states - -# Replace original cross-attention module with custom cross-attention module for better performance -import diffusers.models.attention_processor as attn_processor -attn_processor.AttnProcessor.__call__ = AttnProcessor.__call__ - -# Apply double wrapper to deal with custom return type -pipe.unet = NeuronUNet(UNetWrap(pipe.unet)) - -# Only keep the model being compiled in RAM to minimze memory pressure -unet = copy.deepcopy(pipe.unet.unetwrap) -del pipe - -# Compile unet - FP32 -sample_1b = torch.randn([1, 4, 64, 64], dtype=DTYPE) -timestep_1b = torch.tensor(999, dtype=DTYPE).expand((1,)) -encoder_hidden_states_1b = torch.randn([1, 77, 1024], dtype=DTYPE) -example_inputs = sample_1b, timestep_1b, encoder_hidden_states_1b - -unet_neuron = torch_neuronx.trace( - unet, - example_inputs, - compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'unet'), - compiler_args=["--model-type=unet-inference", "--enable-fast-loading-neuron-binaries"] -) - -# Enable asynchronous and lazy loading to speed up model load -torch_neuronx.async_load(unet_neuron) -torch_neuronx.lazy_load(unet_neuron) - -# save compiled 
unet -unet_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'unet/model.pt') -torch.jit.save(unet_neuron, unet_filename) - -# delete unused objects -del unet -del unet_neuron - - - -# --- Compile CLIP text encoder and save --- - -# Only keep the model being compiled in RAM to minimze memory pressure -pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=DTYPE) -text_encoder = copy.deepcopy(pipe.text_encoder) -del pipe - -# Apply the wrapper to deal with custom return type -text_encoder = NeuronTextEncoder(text_encoder) - -# Compile text encoder -# This is used for indexing a lookup table in torch.nn.Embedding, -# so using random numbers may give errors (out of range). -emb = torch.tensor([[49406, 18376, 525, 7496, 49407, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0]]) -text_encoder_neuron = torch_neuronx.trace( - text_encoder.neuron_text_encoder, - emb, - compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder'), - compiler_args=["--enable-fast-loading-neuron-binaries"] - ) - -# Enable asynchronous loading to speed up model load -torch_neuronx.async_load(text_encoder_neuron) - -# Save the compiled text encoder -text_encoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder/model.pt') -torch.jit.save(text_encoder_neuron, text_encoder_filename) - -# delete unused objects -del text_encoder -del text_encoder_neuron - - - -# --- Compile VAE decoder and save --- - -# Only keep the model being compiled in RAM to minimze memory pressure -pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float32) -decoder = copy.deepcopy(pipe.vae.decoder) -del pipe - -# Compile vae decoder -decoder_in = torch.randn([1, 4, 64, 64], dtype=torch.float32) -decoder_neuron = torch_neuronx.trace( - decoder, - decoder_in, - 
compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'vae_decoder'), - compiler_args=["--enable-fast-loading-neuron-binaries"] -) - -# Enable asynchronous loading to speed up model load -torch_neuronx.async_load(decoder_neuron) - -# Save the compiled vae decoder -decoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_decoder/model.pt') -torch.jit.save(decoder_neuron, decoder_filename) - -# delete unused objects -del decoder -del decoder_neuron - - - -# --- Compile VAE post_quant_conv and save --- - -# Only keep the model being compiled in RAM to minimze memory pressure -pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float32) -post_quant_conv = copy.deepcopy(pipe.vae.post_quant_conv) -del pipe - -# # Compile vae post_quant_conv -post_quant_conv_in = torch.randn([1, 4, 64, 64], dtype=torch.float32) -post_quant_conv_neuron = torch_neuronx.trace( - post_quant_conv, - post_quant_conv_in, - compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'vae_post_quant_conv'), -) - -# Enable asynchronous loading to speed up model load -torch_neuronx.async_load(post_quant_conv_neuron) - -# # Save the compiled vae post_quant_conv -post_quant_conv_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_post_quant_conv/model.pt') -torch.jit.save(post_quant_conv_neuron, post_quant_conv_filename) - -# delete unused objects -del post_quant_conv -del post_quant_conv_neuron diff --git a/test/integration/perf_utils/LatencyCollector.py b/test/integration/perf_utils/LatencyCollector.py deleted file mode 100644 index 0fa7640..0000000 --- a/test/integration/perf_utils/LatencyCollector.py +++ /dev/null @@ -1,74 +0,0 @@ -import time -import math -import os -import json - -class LatencyCollector: - def __init__(self): - self.start = None - self.latency_list = [] - - def pre_hook(self, *args): - self.start = time.time() - - def hook(self, *args): - self.latency_list.append(time.time() - self.start) - - def percentile(self, percent): - latency_list = self.latency_list - 
pos_float = len(latency_list) * percent / 100 - max_pos = len(latency_list) - 1 - pos_floor = min(math.floor(pos_float), max_pos) - pos_ceil = min(math.ceil(pos_float), max_pos) - latency_list = sorted(latency_list) - return latency_list[pos_ceil] if pos_float - pos_floor > 0.5 else latency_list[pos_floor] - -def benchmark(n_runs, test_name, model, model_inputs, metric_path="."): - # model inputs can be tuple or dictionary - if not isinstance(model_inputs, tuple) and not isinstance(model_inputs, dict): - model_inputs = (model_inputs,) - - def run_model(): - if isinstance(model_inputs, dict): - return model(**model_inputs) - else : #tuple - return model(*model_inputs) - - warmup_run = run_model() - - latency_collector = LatencyCollector() - # can't use register_forward_pre_hook or register_forward_hook because StableDiffusionPipeline is not a torch.nn.Module - - for _ in range(n_runs): - latency_collector.pre_hook() - res = run_model() - latency_collector.hook() - - p0_latency_ms = latency_collector.percentile(0) * 1000 - p50_latency_ms = latency_collector.percentile(50) * 1000 - p90_latency_ms = latency_collector.percentile(90) * 1000 - p95_latency_ms = latency_collector.percentile(95) * 1000 - p99_latency_ms = latency_collector.percentile(99) * 1000 - p100_latency_ms = latency_collector.percentile(100) * 1000 - - report_dict = dict() - report_dict["Latency.P0"] = f'{p0_latency_ms:.1f}' - report_dict["Latency.P50"]=f'{p50_latency_ms:.1f}' - report_dict["Latency.P90"]=f'{p90_latency_ms:.1f}' - report_dict["Latency.P95"]=f'{p95_latency_ms:.1f}' - report_dict["Latency.P99"]=f'{p99_latency_ms:.1f}' - report_dict["Latency.P100"]=f'{p100_latency_ms:.1f}' - - report = f'RESULT FOR {test_name}:' - for key, value in report_dict.items(): - report += f' {key}={value}' - print(report) - - report = {"test_name": test_name, "report": report_dict} - - with open(metric_path, 'r+') as f: - cur = json.load(f) - cur.append(report) - with open(metric_path, 'w+') as f: - json.dump(cur, 
f) - return report_dict diff --git a/test/integration/perf_utils/__init__.py b/test/integration/perf_utils/__init__.py deleted file mode 100644 index 43b3f14..0000000 --- a/test/integration/perf_utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Package for integraton tests utilities \ No newline at end of file diff --git a/test/integration/resize_nearest_fixed_dma_kernel/inpainting_mask.png b/test/integration/resize_nearest_fixed_dma_kernel/inpainting_mask.png deleted file mode 100644 index 7f3c753..0000000 Binary files a/test/integration/resize_nearest_fixed_dma_kernel/inpainting_mask.png and /dev/null differ diff --git a/test/integration/resize_nearest_fixed_dma_kernel/inpainting_photo.png b/test/integration/resize_nearest_fixed_dma_kernel/inpainting_photo.png deleted file mode 100644 index e84dfc8..0000000 Binary files a/test/integration/resize_nearest_fixed_dma_kernel/inpainting_photo.png and /dev/null differ diff --git a/test/integration/resize_nearest_fixed_dma_kernel/run.sh b/test/integration/resize_nearest_fixed_dma_kernel/run.sh deleted file mode 100644 index 13e1779..0000000 --- a/test/integration/resize_nearest_fixed_dma_kernel/run.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -set -e - - -if [ "$#" -ne 1 ]; then - echo "Illegal number of parameters. 
Must call with ./run.sh " - exit 1 -fi - -metric_path=$1 - -echo ${test_name} is writing the benchmark result to ${metric_path} - -export NEURON_FUSE_SOFTMAX=1 -pip install diffusers==0.24.0 transformers==4.35.2 accelerate==0.24.1 safetensors==0.4.1 matplotlib -python3 sd2_inpainting_936_624_compile.py -python3 sd2_inpainting_936_624_benchmark.py $1 \ No newline at end of file diff --git a/test/integration/resize_nearest_fixed_dma_kernel/sd2_inpainting_936_624_benchmark.py b/test/integration/resize_nearest_fixed_dma_kernel/sd2_inpainting_936_624_benchmark.py deleted file mode 100644 index 4970f72..0000000 --- a/test/integration/resize_nearest_fixed_dma_kernel/sd2_inpainting_936_624_benchmark.py +++ /dev/null @@ -1,80 +0,0 @@ -import os - -import torch -import torch.nn as nn -import torch_neuronx -import numpy as np - -from matplotlib import pyplot as plt -from matplotlib import image as mpimg -import time -import copy -from IPython.display import clear_output - -from diffusers import StableDiffusionInpaintPipeline -from diffusers.models.unet_2d_condition import UNet2DConditionOutput -# Compatibility for diffusers<0.18.0 -from packaging import version -import diffusers -diffusers_version = version.parse(diffusers.__version__) -use_new_diffusers = diffusers_version >= version.parse('0.18.0') -if use_new_diffusers: - from diffusers.models.attention_processor import Attention -else: - from diffusers.models.cross_attention import CrossAttention - -parent_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) -sys.path.append(parent_dir) -from perf_utils.LatencyCollector import benchmark - -import sys -if len(sys.argv) != 2: - print("Usage: python sd2_inpainting_936_624_benchmark.py ") - exit(1) -metric_path = os.path.abspath(sys.argv[1]) - -from wrapper import UNetWrap, NeuronUNet, NeuronTextEncoder - -# Define datatype for UNet -DTYPE = torch.bfloat16 - -clear_output(wait=False) - -""" -Sample image is taken from: 
https://huggingface.co/stabilityai/stable-diffusion-2-inpainting -""" - -# --- Load all compiled models and run pipeline --- -COMPILER_WORKDIR_ROOT = "sd2_inpainting_neuron" -model_id = "stabilityai/stable-diffusion-2-inpainting" - -pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, torch_dtype=torch.float32) - -text_encoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder/model.pt') -unet_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'unet/model.pt') -vae_encoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_encoder/model.pt') -decoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_decoder/model.pt') -post_quant_conv_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_post_quant_conv/model.pt') - -# Load the compiled UNet onto two neuron cores. -pipe.unet = NeuronUNet(UNetWrap(pipe.unet)) -device_ids = [0,1] -pipe.unet.unetwrap = torch_neuronx.DataParallel(torch.jit.load(unet_filename), device_ids, set_dynamic_batching=False) - -# Load other compiled models onto a single neuron core. 
-pipe.text_encoder = NeuronTextEncoder(pipe.text_encoder) -pipe.text_encoder.neuron_text_encoder = torch.jit.load(text_encoder_filename) -pipe.vae.encoder = torch.jit.load(vae_encoder_filename) -pipe.vae.decoder = torch.jit.load(decoder_filename) -pipe.vae.post_quant_conv = torch.jit.load(post_quant_conv_filename) - -height, width = 624, 936 - -import PIL -base_image = PIL.Image.open('inpainting_photo.png') -mask = PIL.Image.open('inpainting_mask.png') - -prompt = 'Face of a yellow cat, high resolution, sitting on a park bench' -n_runs = 10 - -bench_result = benchmark(n_runs, "stable_diffusion_2_inpainting_936_624", pipe, {'prompt':prompt, 'image':base_image, 'mask_image':mask, 'height':height, 'width':width}, metric_path) \ No newline at end of file diff --git a/test/integration/resize_nearest_fixed_dma_kernel/sd2_inpainting_936_624_compile.py b/test/integration/resize_nearest_fixed_dma_kernel/sd2_inpainting_936_624_compile.py deleted file mode 100644 index ffef15e..0000000 --- a/test/integration/resize_nearest_fixed_dma_kernel/sd2_inpainting_936_624_compile.py +++ /dev/null @@ -1,376 +0,0 @@ -import os -from typing import Optional - -import torch -import torch.nn as nn -import torch_neuronx -import numpy as np - -from matplotlib import pyplot as plt -from matplotlib import image as mpimg -import time -import copy -from IPython.display import clear_output - -from diffusers import StableDiffusionInpaintPipeline -from diffusers.models.unet_2d_condition import UNet2DConditionOutput -# Compatibility for diffusers<0.18.0 -from packaging import version -import diffusers -diffusers_version = version.parse(diffusers.__version__) -use_new_diffusers = diffusers_version >= version.parse('0.18.0') -if use_new_diffusers: - from diffusers.models.attention_processor import Attention -else: - from diffusers.models.cross_attention import CrossAttention - -from wrapper import UNetWrap, NeuronUNet, NeuronTextEncoder - -# Define datatype for UNet -DTYPE = torch.bfloat16 - 
-clear_output(wait=False) - -""" -In the following section, we will compile parts of the Stable Diffusion pipeline for execution on Neuron. -Note that this only needs to be done once: After you have compiled and saved the model by running the following section of code, -you can reuse it any number of times without having to recompile. In particular, we will compile: - -The CLIP text encoder; -The VAE encoder; -The VAE decoder; -The UNet, and -The VAE_post_quant_conv These blocks are chosen because they represent the bulk of the compute in the pipeline, -and performance benchmarking has shown that running them on Neuron yields significant performance benefit. - -The UNet contains Upsample2D layers. -In this example, we will replace Upsample2D layer with our Custom_Upsample2D layer which is using NKI kernel (resize_nearest_fixed_dma_kernel). (Line 179) -In this custom layer, when it finds the upscaling factor is not an integer, it applies the kernel. (Line 142) -Example shape : (1, 1280, 30, 20) -> (1, 1280, 59, 38) -Please note that this is a NKI example, showing how we apply the kernel. The non-integer Upsample2D is already supported in our compiler. - -Additional Notes -- We use the optimized get_attention_scores utility function, - to replace the original get_attention_scores function in the attention_processor.Attention class -- In order to save RAM before tracing each model, - we make a deepcopy of the part of the pipeline and delete the pipeline object from the memory, - This trick allows the compile to succeed on instance types with a smaller amount of RAM. 
- - -Code details : -https://github.com/aws-neuron/aws-neuron-samples/blob/master/torch-neuronx/inference/hf_pretrained_sd2_inpainting_936_624_inference.ipynb - -""" - - -from neuronxcc.nki.kernels.vision import resize_nearest_fixed_dma_kernel -from torch_neuronx import nki_jit -from torch_xla.core import xla_model as xm -from torch import Tensor -import torch.nn.functional as F -from diffusers.utils import USE_PEFT_BACKEND -from diffusers.models.lora import LoRACompatibleConv -from diffusers.models.resnet import Upsample2D - -class Custom_Upsample2D(nn.Module): - """A 2D upsampling layer with an optional convolution. - Copied from diffusers.models.resnet.Upsample2D - """ - - def __init__( - self, - channels: int, - conv, - conv2d_0, - use_conv: bool = False, - use_conv_transpose: bool = False, - out_channels: Optional[int] = None, - name: str = "conv", - ): - super().__init__() - self.channels = channels - self.out_channels = out_channels or channels - self.use_conv = use_conv - self.use_conv_transpose = use_conv_transpose - self.name = name - self.conv = conv - self.Conv2d_0 = conv2d_0 - - def forward( - self, - hidden_states: torch.FloatTensor, - output_size: Optional[int] = None, - scale: float = 1.0, - ) -> torch.FloatTensor: - assert hidden_states.shape[1] == self.channels - - if self.use_conv_transpose: - return self.conv(hidden_states) - - # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16 - dtype = hidden_states.dtype - if dtype == torch.bfloat16: - hidden_states = hidden_states.to(torch.float32) - - if hidden_states.shape[0] >= 64: - hidden_states = hidden_states.contiguous() - - if output_size is None: - hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest") - else: - # # When the upscailng factor is not an integer, compile using resize_nearest_fixed_dma_kernel - # # Otherwise, use the default interpolate function. 
- if (output_size[0] % hidden_states.shape[2] != 0 or output_size[1] % hidden_states.shape[3] != 0): - from neuronxcc.nki.kernels.vision import resize_nearest_fixed_dma_kernel - from torch_neuronx import nki_jit - from torch_xla.core import xla_model as xm - # Compile NKI kernel - device = xm.xla_device() - hidden_states = hidden_states.to(device=device) - nki_func = nki_jit(resize_nearest_fixed_dma_kernel) - # Apply NKI kernel - hidden_states = torch.permute(hidden_states, (0, 2, 3, 1)) - result = torch.empty((hidden_states.shape[0], output_size[0], output_size[1], hidden_states.shape[3]) , dtype=torch.float32, device=device) - nki_func[hidden_states.shape[0]](hidden_states, result) - hidden_states = torch.permute(result, (0, 3, 1, 2)) - else : - hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest") - - # If the input is bfloat16, we cast back to bfloat16 - if dtype == torch.bfloat16: - hidden_states = hidden_states.to(torch.bfloat16) - - if self.use_conv: - if self.name == "conv": - if isinstance(self.conv, LoRACompatibleConv) and not USE_PEFT_BACKEND: - hidden_states = self.conv(hidden_states, scale) - else: - hidden_states = self.conv(hidden_states) - else: - if isinstance(self.Conv2d_0, LoRACompatibleConv) and not USE_PEFT_BACKEND: - hidden_states = self.Conv2d_0(hidden_states, scale) - else: - hidden_states = self.Conv2d_0(hidden_states) - - return hidden_states - -# Replace diffusers.models.resnet.Upsample2D with Custom_Upsample2D -def replace_upsampling(module): - for name, child in module.named_children(): - if isinstance(child, Upsample2D): - if child.name == "conv": - setattr(module, name, Custom_Upsample2D(channels=child.channels, conv=child.conv, conv2d_0=None, use_conv=child.use_conv, use_conv_transpose=child.use_conv_transpose, out_channels=child.out_channels, name=child.name)) - else: - setattr(module, name, Custom_Upsample2D(channels=child.channels, conv=None, conv2d_0=child.Conv2d_0, use_conv=child.use_conv, 
use_conv_transpose=child.use_conv_transpose, out_channels=child.out_channels, name=child.name)) - - else: - replace_upsampling(child) - - -# optimized attention scores -def get_attention_scores(self, query, key, attn_mask): - dtype = query.dtype - - if self.upcast_attention: - query = query.float() - key = key.float() - - # Check for square matmuls - if(query.size() == key.size()): - attention_scores = custom_badbmm( - key, - query.transpose(-1, -2) - ) - - if self.upcast_softmax: - attention_scores = attention_scores.float() - - attention_probs = torch.nn.functional.softmax(attention_scores, dim=1).permute(0,2,1) - attention_probs = attention_probs.to(dtype) - - else: - attention_scores = custom_badbmm( - query, - key.transpose(-1, -2) - ) - - if self.upcast_softmax: - attention_scores = attention_scores.float() - - attention_probs = torch.nn.functional.softmax(attention_scores, dim=-1) - attention_probs = attention_probs.to(dtype) - - return attention_probs - -def custom_badbmm(a, b): - bmm = torch.bmm(a, b) - scaled = bmm * 0.125 - return scaled - - -COMPILER_WORKDIR_ROOT = "sd2_inpainting_neuron" - -def trace_text_encoder(model_id): - pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, torch_dtype=torch.float32) - text_encoder = copy.deepcopy(pipe.text_encoder) - del pipe - # Apply the wrapper to deal with custom return type - text_encoder = NeuronTextEncoder(text_encoder) - - # Compile text encoder - # This is used for indexing a lookup table in torch.nn.Embedding, - # so using random numbers may give errors (out of range). 
- emb = torch.tensor([[49406, 18376, 525, 7496, 49407, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0]]) - text_encoder_neuron = torch_neuronx.trace( - text_encoder.neuron_text_encoder, - emb, - compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder'), - ) - - # Enable asynchronous loading to speed up model load - torch_neuronx.async_load(text_encoder_neuron) - - # Save the compiled text encoder - text_encoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder/model.pt') - torch.jit.save(text_encoder_neuron, text_encoder_filename) - - # delete unused objects - del text_encoder - del text_encoder_neuron - -def trace_vae_encoder(model_id, height, width): - # Only keep the model being compiled in RAM to minimze memory pressure - pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, torch_dtype=torch.float32) - vae_encoder = copy.deepcopy(pipe.vae.encoder) - del pipe - - sample_input = torch.randn([1, 3, height, width]) - vae_encoder_neuron = torch_neuronx.trace( - vae_encoder, - sample_input, - compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'vae_encoder'), - ) - - # Enable asynchronous loading to speed up model load - torch_neuronx.async_load(vae_encoder_neuron) - - # Save the compiled text encoder - vae_encoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_encoder/model.pt') - torch.jit.save(vae_encoder_neuron, vae_encoder_filename) - - # delete unused objects - del vae_encoder - del vae_encoder_neuron - - -def trace_vae_decoder(model_id, height, width): - # Only keep the model being compiled in RAM to minimze memory pressure - pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, torch_dtype=torch.float32) - decoder = copy.deepcopy(pipe.vae.decoder) - del pipe - - # Compile vae decoder - decoder_in = torch.randn([1, 4, height // 8, 
width // 8]) - decoder_neuron = torch_neuronx.trace( - decoder, - decoder_in, - compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'vae_decoder'), - compiler_args=["--verbose", "info"] - ) - - # Enable asynchronous loading to speed up model load - torch_neuronx.async_load(decoder_neuron) - - # Save the compiled vae decoder - decoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_decoder/model.pt') - torch.jit.save(decoder_neuron, decoder_filename) - - # delete unused objects - del decoder - del decoder_neuron - -def trace_unet(model_id, height, width): - # --- Compile UNet and save --- - DTYPE = torch.bfloat16 - pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, torch_dtype=DTYPE) - - # Replace original cross-attention module with custom cross-attention module for better performance - Attention.get_attention_scores = get_attention_scores - - # Apply double wrapper to deal with custom return type - pipe.unet = NeuronUNet(UNetWrap(pipe.unet)) - - # Only keep the model being compiled in RAM to minimze memory pressure - unet = copy.deepcopy(pipe.unet.unetwrap) - - # replace Upsample2D - replace_upsampling(unet) - - del pipe - - sample_1b = torch.randn([1, 9, height // 8, width // 8], dtype=DTYPE) - timestep_1b = torch.tensor(999, dtype=DTYPE).expand((1,)) - encoder_hidden_states_1b = torch.randn([1, 77, 1024], dtype=DTYPE) - example_inputs = sample_1b, timestep_1b, encoder_hidden_states_1b - - unet_neuron = torch_neuronx.trace( - unet, - example_inputs, - compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'unet'), - compiler_args=["--model-type=unet-inference", "--verbose=info"], - ) - - # Enable asynchronous and lazy loading to speed up model load - torch_neuronx.async_load(unet_neuron) - torch_neuronx.lazy_load(unet_neuron) - - # save compiled unet - unet_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'unet/model.pt') - torch.jit.save(unet_neuron, unet_filename) - - # delete unused objects - del unet - del unet_neuron - -def 
trace_post_quant_conv(model_id, height, width): - # Only keep the model being compiled in RAM to minimze memory pressure - pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, torch_dtype=torch.float32) - post_quant_conv = copy.deepcopy(pipe.vae.post_quant_conv) - del pipe - - # Compile vae post_quant_conv - post_quant_conv_in = torch.randn([1, 4, height // 8 , width // 8]) - post_quant_conv_neuron = torch_neuronx.trace( - post_quant_conv, - post_quant_conv_in, - compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'vae_post_quant_conv'), - compiler_args=["--verbose", "info"] - ) - - # Enable asynchronous loading to speed up model load - torch_neuronx.async_load(post_quant_conv_neuron) - - # Save the compiled vae post_quant_conv - post_quant_conv_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_post_quant_conv/model.pt') - torch.jit.save(post_quant_conv_neuron, post_quant_conv_filename) - - # delete unused objects - del post_quant_conv - del post_quant_conv_neuron - -model_id = "stabilityai/stable-diffusion-2-inpainting" -height, width = 624, 936 - -# trace the parts of the pipeline -trace_text_encoder(model_id) -trace_vae_decoder(model_id, height, width) -trace_vae_encoder(model_id, height, width) -trace_unet(model_id, height, width) -trace_post_quant_conv(model_id, height, width) diff --git a/test/integration/resize_nearest_fixed_dma_kernel/wrapper.py b/test/integration/resize_nearest_fixed_dma_kernel/wrapper.py deleted file mode 100644 index 37a22c8..0000000 --- a/test/integration/resize_nearest_fixed_dma_kernel/wrapper.py +++ /dev/null @@ -1,43 +0,0 @@ -import torch -import torch.nn as nn -from diffusers.models.unet_2d_condition import UNet2DConditionOutput - -""" -Define a double-wrapper for the UNet because of the special UNet2DConditionOutput output type and a wrapper for the text encoder. 
-These wrappers enable torch_neuronx.trace to trace the wrapped models for compilation with the Neuron compiler - -Details -https://github.com/aws-neuron/aws-neuron-samples/blob/master/torch-neuronx/inference/hf_pretrained_sd2_inpainting_936_624_inference.ipynb -""" - -class UNetWrap(nn.Module): - def __init__(self, unet): - super().__init__() - self.unet = unet - - def forward(self, sample, timestep, encoder_hidden_states, cross_attention_kwargs=None): - out_tuple = self.unet(sample, timestep, encoder_hidden_states, return_dict=False) - return out_tuple - -class NeuronUNet(nn.Module): - def __init__(self, unetwrap): - super().__init__() - self.unetwrap = unetwrap - self.config = unetwrap.unet.config - self.in_channels = unetwrap.unet.in_channels - self.device = unetwrap.unet.device - - def forward(self, sample, timestep, encoder_hidden_states, timestep_cond=None, added_cond_kwargs=None, cross_attention_kwargs=None, return_dict=False): - sample = self.unetwrap(sample.bfloat16(), timestep.bfloat16().expand((sample.shape[0],)), encoder_hidden_states.bfloat16())[0] - return UNet2DConditionOutput(sample=sample) - -class NeuronTextEncoder(nn.Module): - def __init__(self, text_encoder): - super().__init__() - self.neuron_text_encoder = text_encoder - self.config = text_encoder.config - self.dtype = text_encoder.dtype - self.device = text_encoder.device - - def forward(self, emb, attention_mask = None): - return [self.neuron_text_encoder(emb)['last_hidden_state']] \ No newline at end of file diff --git a/test/integration/run_integration.sh b/test/integration/run_integration.sh deleted file mode 100755 index 832c927..0000000 --- a/test/integration/run_integration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -export PYTHONPATH=${PYTHONPATH}:$PWD - -echo Creating benchmark result json at $PWD/test_result.json -if [ -f $PWD/test_result.json ]; then - rm $PWD/test_result.json -fi -touch $PWD/test_result.json -echo "[]" >> $PWD/test_result.json - 
-RESULT_JSON=$PWD/test_result.json - -pushd fused_sd_attention_small_head -sh run.sh ${RESULT_JSON} -popd - -pushd resize_nearest_fixed_dma_kernel -sh run.sh ${RESULT_JSON} -popd - -pushd flash_attention -sh run.sh ${RESULT_JSON} -popd - -pushd select_and_scatter_kernel -sh run.sh ${RESULT_JSON} -popd diff --git a/test/integration/select_and_scatter_kernel/common/vision_utils.py b/test/integration/select_and_scatter_kernel/common/vision_utils.py deleted file mode 100644 index 2df218c..0000000 --- a/test/integration/select_and_scatter_kernel/common/vision_utils.py +++ /dev/null @@ -1,319 +0,0 @@ -import os -import sys -import argparse -from datetime import datetime -import math -import queue -import time -import inspect -from torch.optim.lr_scheduler import _LRScheduler -from torch.utils.data.dataloader import DataLoader -from torch.utils.data.distributed import DistributedSampler -from torch.utils.tensorboard import SummaryWriter - -import torchvision -import torchvision.transforms as transforms -import torchvision.datasets as datasets - -try: - from transformers import ( - AutoModelForImageClassification, - AutoConfig - ) -except: - print("cannot import transformers") - -try: - import timm -except: - print("cannot import timm") - -SUPPORTED_PLATFORMS = ['torchvision', 'transformers', 'timm'] - -class WarmupAndExponentialDecayScheduler(_LRScheduler): - """Update the learning rate of wrapped optimizer based on epoch and step. - - Args: - optimizer: Instance of torch.optim.Optimizer. Learning rate will be changed. - num_steps_per_epoch: int, the number of steps required to finish 1 epoch. - divide_every_n_epochs: After this number of epochs, learning rate will be - divided by the `divisor` param. - divisor: The learning rate will be divided by this amount when epoch % - divide_every_n_epochs == 0 (epoch 0 is excluded). - num_warmup_epochs: Float. Learning rate will ramp up from 0 to max learning - rate over this many epochs. Note that partial epochs are allowed, e.g. 
0.5 - epochs. - min_delta_to_update_lr: If the new learning rate does not differ much from - the learning rate of the previous step, don't bother updating the - optimizer's learning rate. - """ - - def __init__(self, - optimizer, - num_steps_per_epoch, - divide_every_n_epochs=20, - divisor=5, - num_warmup_epochs=0.9, - min_delta_to_update_lr=1e-6): - self._num_steps_per_epoch = num_steps_per_epoch - self._divide_every_n_epochs = divide_every_n_epochs - self._divisor = divisor - self._num_warmup_epochs = num_warmup_epochs - self._min_delta_to_update_lr = min_delta_to_update_lr - self._previous_lr = -1 - self._max_lr = optimizer.param_groups[0]['lr'] - super(WarmupAndExponentialDecayScheduler, self).__init__(optimizer) - - def _epoch(self): - return self._step_count // self._num_steps_per_epoch - - def _is_warmup_epoch(self): - return self._epoch() < math.ceil(self._num_warmup_epochs) - - def get_lr(self): - epoch = self._epoch() - lr = 0.0 - - if self._is_warmup_epoch(): - # Ramp up learning rate from 0.0 to self._max_lr using a linear slope. - num_warmup_steps = self._num_warmup_epochs * self._num_steps_per_epoch - lr = min(self._max_lr, - self._max_lr * ((self._step_count + 1.0) / num_warmup_steps)) - else: - # Normal epoch. Use an exponential decay determined by init params. - lr = self._max_lr / ( - self._divisor**(epoch // self._divide_every_n_epochs)) - - # _LRScheduler expects a list of learning rates like this. - return [lr for _ in self.base_lrs] - - def step(self, epoch=None): - current_lr = self.get_lr()[0] - - # Outside of warmup epochs, we use the same learning rate for every step - # in an epoch. Don't bother updating learning rate if it hasn't changed. - if abs(current_lr - self._previous_lr) >= self._min_delta_to_update_lr: - super(WarmupAndExponentialDecayScheduler, self).step() - self._previous_lr = current_lr - else: - self._step_count += 1 # This normally happens in super().step(). 
- - -class Throughput: - def __init__(self, batch_size, world_size, log_steps, moving_avg_window_size=10): - self.seqs_per_iteration = batch_size * world_size - self.moving_avg_window_size = moving_avg_window_size - self.moving_avg_window = queue.Queue() - self.window_time = 0 - self.start_time = time.time() - - def get_throughput(self): - step_time = time.time() - self.start_time - self.start_time += step_time - self.window_time += step_time - self.moving_avg_window.put(step_time) - window_size = self.moving_avg_window.qsize() - if window_size > self.moving_avg_window_size: - self.window_time -= self.moving_avg_window.get() - window_size -= 1 - throughput = window_size * self.seqs_per_iteration / self.window_time - return throughput - - -class Logger: - def __init__(self, args, world_size): - xla = 'torch_xla' in sys.modules - self.train_throughputs = [] - self.test_throughputs = [] - self.summary_writer = SummaryWriter(os.path.join(args.logdir, - f"neuron_tblogs_{time.strftime('%m%d%y_%H%M')}" - f"_w{world_size}" - f"_lr{args.lr}" - f"_bs{args.batch_size}" - f"_bf16autocast{args.enable_pt_autocast}" - f"_xla{xla}")) - self.summary_writer.add_text('script', "```\n" + inspect.getsource(sys.modules[__name__]) + "\n```", 0) - - def print_training_update(self, - device, - step, - lr, - loss, - throughput, - epoch=None, - summary_writer=None): - """Prints the training metrics at a given step. - - Args: - device (torch.device): The device where these statistics came from. - step_num (int): Current step number. - loss (float): Current loss. - throughput (float): The examples/sec throughput for the current batch. - epoch (int, optional): The epoch number. - summary_writer (SummaryWriter, optional): If provided, this method will - write some of the provided statistics to Tensorboard. 
- """ - update_data = [ - 'Training', 'Device={}'.format(str(device)), - 'Epoch={}'.format(epoch) if epoch is not None else None, - 'Step={}'.format(step), 'Learning_Rate={}'.format(lr), - 'Loss={:.5f}'.format(loss), 'Throughput={:.5f}'.format(throughput), - 'Time={}'.format(datetime.now()) - ] - print('|', ' '.join(item for item in update_data if item), flush=True) - self.write_to_summary( - summary_writer, - dict_to_write={ - 'Throughput': throughput, - }) - - def print_test_update(self, device, throughput, accuracy, epoch=None, step=None): - """Prints single-core test metrics. - - Args: - device: Instance of `torch.device`. - accuracy: Float. - """ - update_data = [ - 'Test', 'Device={}'.format(str(device)), - 'Step={}'.format(step) if step is not None else None, - 'Epoch={}'.format(epoch) if epoch is not None else None, - 'Throughput={:.5f}'.format(throughput), - 'Accuracy={:.2f}'.format(accuracy) if accuracy is not None else None, - 'Time={}'.format(datetime.now()) - ] - print('|', ' '.join(item for item in update_data if item), flush=True) - - def write_to_summary(self, - global_step=None, - dict_to_write={}): - """Writes scalars to a Tensorboard SummaryWriter. - - Optionally writes XLA perf metrics. - - Args: - global_step (int, optional): The global step value for these data points. - If None, global_step will not be set for this datapoint. - dict_to_write (dict, optional): Dict where key is the scalar name and value - is the scalar value to be written to Tensorboard. 
- """ - if self.summary_writer is None: - return - for k, v in dict_to_write.items(): - self.summary_writer.add_scalar(k, v, global_step) - - -def build_train_parser(): - parser = argparse.ArgumentParser() - parser.add_argument('--model', default='resnet50', help="Image classification model.") - parser.add_argument('--platform', default='torchvision', choices=SUPPORTED_PLATFORMS, help="The Platform where the model is from (torchvision/transformers/timm).") - parser.add_argument('--pretrained', action='store_true', help="Use model from Pre-trained.") - parser.add_argument('--data_dir', type=str, default="", help="Image classification dataset directory.") - parser.add_argument('--logdir', type=str, default="log_training", help="Training log directory.") - parser.add_argument('--batch_size', type=int, default=8, help="Batch size per core used in training.") - parser.add_argument('--num_epochs', type=int, default=2, help="Number of training epochs.") - parser.add_argument('--num_workers', type=int, default=0, help="Number of worker used in data loader.") - parser.add_argument('--log_steps', type=int, default=20, help="Number of steps between each other log message.") - parser.add_argument('--max_steps', type=int, default=28125, help="Number of max training steps.") - parser.add_argument('--expected_average_throughput', type=int, default=0, help="Expected average training throughput (seq/s).") - parser.add_argument('--image_dim', type=int, default=224, help="Image dimension after transformation.") - parser.add_argument('--test_batch_size', type=int, default=8, help="Batch size per core used in testing.") - parser.add_argument('--lr', type=float, default=0.00005, help="Learning rate used in training.") - parser.add_argument('--lr_scheduler_type', type=str, default=None, choices=["WarmupAndExponentialDecayScheduler",]) - parser.add_argument('--lr_scheduler_divide_every_n_epochs', type=int, default=20) - parser.add_argument('--lr_scheduler_divisor', type=int, default=5) - 
parser.add_argument('--momentum', type=float, default=0.9, help="Momentum used in SGD optimizer") - parser.add_argument('--target_accuracy', type=float, default=0, help="Target accuracy (%).") - parser.add_argument('--drop_last', action='store_true', help="Enable deop_last in data loader.") - parser.add_argument('--fake_data', action='store_true', help="Use fake (random) data for training and testing.") - parser.add_argument('--fake_train_dataset_length', type=int, default=50000, help="Length of fake training dataset.") - parser.add_argument('--fake_test_dataset_length', type=int, default=1000, help="Length of fake testing dataset.") - parser.add_argument('--metrics_debug', action='store_true', help="Print debug metrics at the end of each epoch.") - parser.add_argument('--enable_pt_autocast', action='store_true', help="Enable Auto-cast to BF16 in GPU.") - parser.add_argument('--do_eval', action='store_true', help="Evaluate the model with eval dataset after training.") - - return parser - -def get_model(platform, model, pretrained): - if platform == "torchvision": - default_model_property = { - 'model_fn': getattr(torchvision.models, model) - } - model_properties = { - 'inception_v3': { - 'model_fn': lambda: torchvision.models.inception_v3(aux_logits=False) - }, - } - return model_properties.get(model, default_model_property)['model_fn'](pretrained=pretrained) - elif platform == "transformers": - if pretrained: - return AutoModelForImageClassification.from_pretrained(model) - else: - config = AutoConfig.from_pretrained(model) - return AutoModelForImageClassification.from_config(config) - elif platform == "timm": - return timm.create_model(model, pretrained=pretrained) - else: - raise ValueError('Unsupported Platform.') - -def get_data_transforms(img_dim): - resize_dim = max(img_dim, 256) - normalize = transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - train_transform = transforms.Compose([ - transforms.RandomResizedCrop(img_dim), - 
transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ]) - # Matches Torchvision's eval transforms except Torchvision uses size - # 256 resize for all models both here and in the train loader. Their - # version crashes during training on 299x299 images, e.g. inception. - test_transform = transforms.Compose([ - transforms.Resize(resize_dim), - transforms.CenterCrop(img_dim), - transforms.ToTensor(), - normalize, - ]) - - return train_transform, test_transform - -def create_data_loaders(train_dataset, - test_dataset, - rank, - world_size, - train_batch_size, - test_batch_size, - num_workers, - drop_last=False): - train_sampler, test_sampler = None, None - if world_size > 1: - train_sampler = DistributedSampler( - train_dataset, - num_replicas=world_size, - rank=rank, - shuffle=True) - test_sampler = DistributedSampler( - test_dataset, - num_replicas=world_size, - rank=rank, - shuffle=False) - - train_loader = DataLoader( - train_dataset, - batch_size=train_batch_size, - shuffle=False if train_sampler else True, - sampler=train_sampler, - drop_last=drop_last, - num_workers=num_workers, - pin_memory=True) - test_loader = DataLoader( - test_dataset, - batch_size=test_batch_size, - shuffle=False, - sampler=test_sampler, - drop_last=drop_last, - num_workers=num_workers, - pin_memory=True) - - return train_loader, test_loader \ No newline at end of file diff --git a/test/integration/select_and_scatter_kernel/insert_select_and_scatter_kernel.py b/test/integration/select_and_scatter_kernel/insert_select_and_scatter_kernel.py deleted file mode 100644 index acb89d6..0000000 --- a/test/integration/select_and_scatter_kernel/insert_select_and_scatter_kernel.py +++ /dev/null @@ -1,36 +0,0 @@ -import torch -from torch import nn -from torch import autograd -from neuronxcc.nki.kernels.vision import select_and_scatter_kernel - -import torch_xla.core.xla_model as xm -from torch_neuronx.xla_impl.ops import nki_jit - -select_and_scatter_func = 
nki_jit()(select_and_scatter_kernel) - -class ModuleWrapper(nn.Module): - def __init__(self, func): - super().__init__() - self.func = func - - def forward(self, x): - return self.func(x) - - -class NeuronMaxPool2d(autograd.function.Function): - @staticmethod - def forward(ctx, input): - ctx.save_for_backward(input) - return nn.MaxPool2d(kernel_size=3, stride=2, padding=(1, 1))(input) - - @staticmethod - def backward(ctx, grad_output): - input, = ctx.saved_tensors - output = torch.zeros_like(input) - select_and_scatter_func(input, grad_output, output) - return output - - -def replace_maxpool(model): - model.maxpool = ModuleWrapper(NeuronMaxPool2d.apply) - return model \ No newline at end of file diff --git a/test/integration/select_and_scatter_kernel/resnet50.ipynb b/test/integration/select_and_scatter_kernel/resnet50.ipynb deleted file mode 100644 index 54d6bb8..0000000 --- a/test/integration/select_and_scatter_kernel/resnet50.ipynb +++ /dev/null @@ -1,179 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# ResNet50 training - Pytorch [Beta PyTorch 2.1] \n", - "This notebook shows how to fine-tune a pretrained ResNet50 Pytorch model with AWS Trainium (trn1 instances) using NeuronSDK.\n", - "The original implementation is provided by torchvision.\n", - "\n", - "The example has 3 stages:\n", - "1. In insert_select_and_scatter_kernel, we replace the backward implementation of maxpool to a select-and-scatter nki kernel.\n", - "2. Use `neuron_parallel_compile` to compile the model.\n", - "3. Run the fine-tuning script to train the model based on image classification task. The training job will use 2 workers with data parallel to speed up the training.\n", - "\n", - "This script is intended to run on trn1.2xlarge. 
For full training throughput, please refer to the sample script that runs on trn1.32xlarge at: https://github.com/aws-neuron/aws-neuron-samples/blob/master/torch-neuronx/training/resnet50/resnet50.ipynb\n", - "\n", - "**Reference:** \n", - "\n", - "https://pytorch.org/vision/main/models/generated/torchvision.models.resnet50.html" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1) Set the parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# use --model-type=cnn-training to gain the best performance\n", - "env_var_options = \"NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=2 \" + \\\n", - " \"NEURON_CC_FLAGS=\\'--cache_dir=./compiler_cache --model-type=cnn-training\\'\"\n", - "num_workers = 2\n", - "learning_rate = 0.001\n", - "dataloader_num_workers = 2\n", - "device_prefetch_size = 2\n", - "host_to_device_transfer_threads = 4\n", - "num_epochs = 10\n", - "\n", - "import sys\n", - "import os\n", - "if len(sys.argv) != 2:\n", - " exit(1)\n", - "metric_path = os.path.abspath(sys.argv[1])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model_name = \"resnet50\"\n", - "batch_size = 16" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2) Download CIFAR10 dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!wget -N https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz .\n", - "!tar xfvz cifar-10-python.tar.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3) Compile the model with neuron_parallel_compile" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "import subprocess\n", - "\n", - "print(\"Compile model\")\n", - "# set epochs to 2 to reduce the time for tracing training graphs\n", - 
"COMPILE_CMD = f\"\"\"\n", - " {env_var_options} neuron_parallel_compile torchrun --nproc_per_node={num_workers}\n", - " run_image_classification.py\n", - " --model {model_name}\n", - " --platform torchvision\n", - " --pretrained\n", - " --num_epochs 2\n", - " --batch_size {batch_size}\n", - " --pretrained\n", - " --lr {learning_rate}\n", - " --drop_last\n", - " \"\"\".replace('\\n', '')\n", - "\n", - "print(f'Running command: \\n{COMPILE_CMD}')\n", - "if subprocess.check_call(COMPILE_CMD,shell=True):\n", - " print(\"There was an error with the compilation command\")\n", - "else:\n", - " print(\"Compilation Success!!!\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4) Compile and Fine-tune the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "print(\"Train model\")\n", - "RUN_CMD = f\"\"\"\n", - " {env_var_options} torchrun --nproc_per_node={num_workers}\n", - " run_image_classification.py\n", - " --model {model_name}\n", - " --platform torchvision\n", - " --pretrained\n", - " --num_epochs {num_epochs}\n", - " --batch_size {batch_size}\n", - " --pretrained\n", - " --lr {learning_rate}\n", - " --do_eval\n", - " --drop_last\n", - " \"\"\".replace('\\n', '')\n", - "\n", - "print(f'Running command: \\n{RUN_CMD}')\n", - "if subprocess.check_call(RUN_CMD,shell=True):\n", - " print(\"There was an error with the fine-tune command\")\n", - "else:\n", - " print(\"Fine-tune Successful!!!\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python (torch-neuronx)", - "language": "python", - "name": "aws_neuron_venv_pytorch" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, 
- "nbformat_minor": 2 -} diff --git a/test/integration/select_and_scatter_kernel/run.sh b/test/integration/select_and_scatter_kernel/run.sh deleted file mode 100644 index b379c08..0000000 --- a/test/integration/select_and_scatter_kernel/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -set -e - - -if [ "$#" -ne 1 ]; then - echo "Illegal number of parameters. Must call with ./run.sh " - exit 1 -fi - -metric_path=$1 - -echo ${test_name} is writing the benchmark result to ${metric_path} - -ipython resnet50.ipynb ${metric_path} \ No newline at end of file diff --git a/test/integration/select_and_scatter_kernel/run_image_classification.py b/test/integration/select_and_scatter_kernel/run_image_classification.py deleted file mode 100644 index bf9df54..0000000 --- a/test/integration/select_and_scatter_kernel/run_image_classification.py +++ /dev/null @@ -1,215 +0,0 @@ -import os -import sys -import time -from datetime import datetime -import numpy as np -import traceback -import json -import torch -import torch.distributed as dist -import torch_xla.distributed.xla_backend -import torch.nn as nn -import torch.optim as optim -import torchvision.datasets as datasets -import torch_xla -import torch_xla.debug.metrics as met -import torch_xla.distributed.parallel_loader as pl -import torch_xla.utils.utils as xu -import torch_xla.core.xla_model as xm -import torch_xla.distributed.xla_multiprocessing as xmp - -if not '..' 
in sys.path: sys.path.append('..') -import common.vision_utils as vision_utils -from insert_select_and_scatter_kernel import replace_maxpool - -def train(): - print('==> Preparing data..') - is_root = xm.is_master_ordinal(local=False) - train_transform, test_transform = vision_utils.get_data_transforms(FLAGS.image_dim) - if FLAGS.fake_data: - train_dataset_len = FLAGS.fake_train_dataset_length - test_dataset_len = FLAGS.fake_test_dataset_length - train_dataset = datasets.FakeData( - size=train_dataset_len, - image_size=(3, FLAGS.image_dim, FLAGS.image_dim), - num_classes=1000, - transform=train_transform) - test_dataset = datasets.FakeData(size=test_dataset_len, - image_size=(3, FLAGS.image_dim, FLAGS.image_dim), - num_classes=1000, - transform=test_transform) - elif FLAGS.data_dir: - train_dataset = datasets.ImageFolder( - os.path.join(FLAGS.data_dir, 'train'), - train_transform) - train_dataset_len = len(train_dataset.imgs) - test_dataset = datasets.ImageFolder( - os.path.join(FLAGS.data_dir, 'test'), - test_transform) - else: - # use cifar10 by default - train_transform, test_transform = vision_utils.get_data_transforms(FLAGS.image_dim) - train_dataset = datasets.CIFAR10( - os.path.join("./"), train=True, transform=train_transform, download=True - ) - test_dataset = datasets.CIFAR10( - os.path.join("./"), train=False, transform=test_transform, download=True - ) - - train_loader, test_loader = vision_utils.create_data_loaders( - train_dataset, - test_dataset, - xm.get_ordinal(), - xm.xrt_world_size(), - FLAGS.batch_size, - FLAGS.test_batch_size, - FLAGS.num_workers, - FLAGS.drop_last) - - torch.manual_seed(42) - - device = xm.xla_device() - model = vision_utils.get_model(FLAGS.platform, FLAGS.model, FLAGS.pretrained).to(device) - model = replace_maxpool(model) - - writer = None - if xm.is_master_ordinal(): - logger = vision_utils.Logger(FLAGS, xm.xrt_world_size()) - optimizer = optim.SGD( - model.parameters(), - lr=FLAGS.lr, - momentum=FLAGS.momentum, - 
weight_decay=1e-4) - lr_scheduler = None - if FLAGS.lr_scheduler_type == "WarmupAndExponentialDecayScheduler": - num_training_steps_per_epoch = train_dataset_len // ( - FLAGS.batch_size * xm.xrt_world_size()) - lr_scheduler = vision_utils.WarmupAndExponentialDecayScheduler( - optimizer, - num_training_steps_per_epoch, - divide_every_n_epochs=getattr(FLAGS, 'lr_scheduler_divide_every_n_epochs', None), - divisor=getattr(FLAGS, 'lr_scheduler_divisor', None)) - loss_fn = nn.CrossEntropyLoss() - - if is_root: - throughput = vision_utils.Throughput(FLAGS.batch_size, xm.xrt_world_size(), FLAGS.log_steps) - print('--------TRAINING CONFIG----------') - print(FLAGS) - print('---------------------------------') - train_start = time.time() - - def train_loop_fn(loader, epoch, global_step): - model.train() - for step, (data, target) in enumerate(loader): - optimizer.zero_grad() - output = model(data) - logits = output if isinstance(output, torch.Tensor) else output.logits - loss = loss_fn(logits, target) - loss.backward() - xm.optimizer_step(optimizer) - if lr_scheduler: - lr_scheduler.step() - global_step += 1 - if is_root: - step_throughput = throughput.get_throughput() - logger.train_throughputs.append(step_throughput) - if step % FLAGS.log_steps == 0: - logger.print_training_update( - device, - step, - FLAGS.lr if not lr_scheduler else lr_scheduler.optimizer.param_groups[0]['lr'], - loss.item(), - step_throughput, - epoch, - writer) - if global_step >= FLAGS.max_steps: - xm.mark_step() - break - return global_step, loss - - def test_loop_fn(loader, epoch): - total_samples, correct = 0, 0 - model.eval() - with torch.no_grad(): - for step, (data, target) in enumerate(loader): - output = model(data) - logits = output if isinstance(output, torch.Tensor) else output.logits - pred = logits.max(1, keepdim=True)[1] - correct += pred.eq(target.view_as(pred)).sum() - total_samples += data.size()[0] - if is_root: - step_throughput = throughput.get_throughput() - 
logger.test_throughputs.append(step_throughput) - if step % FLAGS.log_steps == 0: - logger.print_test_update(device, step_throughput, None, epoch, step) - accuracy = 100.0 * correct.item() / total_samples - accuracy = xm.mesh_reduce('test_accuracy', accuracy, np.mean) - return accuracy - - train_device_loader = pl.MpDeviceLoader(train_loader, device) - test_device_loader = pl.MpDeviceLoader(test_loader, device) - accuracy, max_accuracy = 0.0, 0.0 - global_step = 0 - for epoch in range(1, FLAGS.num_epochs + 1): - xm.master_print('Epoch {} train begin {}'.format(epoch, datetime.now())) - global_step, loss = train_loop_fn(train_device_loader, epoch, global_step) - xm.master_print('Epoch {} train end {}'.format(epoch, datetime.now())) - if FLAGS.metrics_debug: - xm.master_print(met.metrics_report()) - if is_root: - average_train_throughput = round(sum(logger.train_throughputs)/len(logger.train_throughputs), 4) - xm.master_print('Average train throughput: {:.4f}'.format(average_train_throughput)) - xm.master_print('Max train throughput: {:.4f}'.format(max(logger.train_throughputs))) - if global_step >= FLAGS.max_steps: - break - - if is_root: - time_to_train = time.time() - train_start - - if FLAGS.do_eval: - if is_root: - throughput = vision_utils.Throughput(FLAGS.batch_size, xm.xrt_world_size(), FLAGS.log_steps) - accuracy = test_loop_fn(test_device_loader, epoch) - xm.master_print('Epoch {} test end {}, Accuracy={:.2f}'.format( - epoch, datetime.now(), accuracy)) - max_accuracy = max(accuracy, max_accuracy) - if is_root: - logger.write_to_summary( - epoch, - dict_to_write={'Accuracy/test': accuracy}) - average_test_throughput = round(sum(logger.test_throughputs)/len(logger.test_throughputs), 4) - xm.master_print('Average test throughput: {:.4f}'.format(average_test_throughput)) - xm.master_print('Max test throughput: {:.4f}'.format(max(logger.test_throughputs))) - xm.master_print('Max Accuracy: {:.2f}%'.format(max_accuracy)) - - report_dict = dict() - 
report_dict["Throughput"] = f'{average_train_throughput:.1f}' - report_dict["Accuracy"]=f'{max_accuracy:.1f}' - - report = {"test_name": f'{FLAGS.model}_training', "report": report_dict} - - metric_path = FLAGS.metric_path - with open(metric_path, 'r+') as f: - cur = json.load(f) - cur.append(report) - with open(metric_path, 'w+') as f: - json.dump(cur, f) - return report_dict - - -def _mp_fn(index, flags): - global FLAGS - FLAGS = flags - torch.set_default_tensor_type('torch.FloatTensor') - train() - xm.rendezvous("_mp_fn finished") - -if __name__ == '__main__': - parser = vision_utils.build_train_parser() - args = parser.parse_args(sys.argv[1:]) - - if os.environ.get("WORLD_SIZE"): - dist.init_process_group('xla') - _mp_fn(0, args) - else: - xmp.spawn(_mp_fn, args=(args,)) \ No newline at end of file diff --git a/test/unit/README.md b/test/unit/README.md deleted file mode 100644 index 55dc937..0000000 --- a/test/unit/README.md +++ /dev/null @@ -1,7 +0,0 @@ -Tests under this folder are unit tests for the kernels in `src/nki_samples`. - -To execute the tests, we need to include `src/nki_samples` in the `PYTHONPATH`. 
- -For example, - -PYTHONPATH=$PYTHONPATH:/home/ubuntu/nki-samples/src/ pytest test_flash_attn_fwd.py \ No newline at end of file diff --git a/test/unit/conftest.py b/test/unit/conftest.py deleted file mode 100644 index cd663ae..0000000 --- a/test/unit/conftest.py +++ /dev/null @@ -1,28 +0,0 @@ -import pytest - -def pytest_addoption(parser): - parser.addoption( - "--simulation-only", action="store_true", default=False, help="Run simulation only, it will run test with `simulation` marker in simulation mode" - ) - -def pytest_configure(config): - config.addinivalue_line( - "markers", "simulation: mark simulation test that can be executed without a NeuronDevice" - ) - -@pytest.fixture -def simulation_only(request): - return request.config.getoption("--simulation-only") - -def pytest_collection_modifyitems(session, config, items): - if config.getoption("--simulation-only"): - # Only run cases with `simulation marker` - result = [] - for item in items: - for marker in item.iter_markers(): - if marker.name == 'simulation': - result.append(item) - break - items.clear() - items.extend(result) - \ No newline at end of file diff --git a/test/unit/test_SD_attention_small_head.py b/test/unit/test_SD_attention_small_head.py deleted file mode 100644 index 36d66a0..0000000 --- a/test/unit/test_SD_attention_small_head.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -Copyright (c) 2023, Amazon.com. 
All Rights Reserved -""" -import os -import pytest -from nki_samples.reference.attention import fused_self_attn_for_SD_small_head_size -from neuronxcc.nki import benchmark, baremetal, simulate_kernel -import neuronxcc.nki.language as nl -import numpy as np -from scipy.special import softmax - -test_trace_file_path='local_trace.ntff' -bench_func = benchmark(warmup=5, iters=20, save_trace_name=test_trace_file_path)(fused_self_attn_for_SD_small_head_size) - -def cpu_golden_attn(q, k, v): - softmax_scale = 0.125 - q_scaled = q * softmax_scale - raw_score = np.matmul(q_scaled.transpose(0, 2, 1), k.transpose(0, 2, 1)) - - norm_score = softmax(raw_score, axis=-1) - - # Transpose the result so it has the same layout as ours - return np.matmul(norm_score, v) - -class TestAttention: - - @pytest.mark.parametrize("bs,seqlen,d,dtype,latency", [ - [1, 4096, 128, np.float32, 675], - [1, 4096, 128, nl.bfloat16, 536], - [1, 4096, 64, nl.float16, 600] - ]) - def test_attention_for_SD_perf(self, bs, seqlen, d, dtype, latency): - q = np.random.random_sample((bs, d, seqlen)).astype(np.float32) - k = np.random.random_sample((bs, seqlen, d)).astype(np.float32) - v = np.random.random_sample((bs, seqlen, d)).astype(np.float32) - - q_dev = nl.static_cast(q, dtype) - k_dev = nl.static_cast(k, dtype) - v_dev = nl.static_cast(v, dtype) - - bench_func_ = bench_func[bs] - bench_func_(q_dev, k_dev, v_dev) - latency_res = bench_func_.benchmark_result.nc_latency - p1 = latency_res.get_latency_percentile(1) - assert os.path.getsize(test_trace_file_path) > 0 - - @pytest.mark.simulation - @pytest.mark.parametrize("bs,seqlen,d,dtype", [ - [1, 4096, 128, np.float32], - [1, 4096, 128, nl.bfloat16] - ]) - def test_attention_for_SD_numberic(self, simulation_only, bs, seqlen, d, dtype): - q = np.random.random_sample((bs, d, seqlen)).astype(np.float32) - k = np.random.random_sample((bs, seqlen, d)).astype(np.float32) - v = np.random.random_sample((bs, seqlen, d)).astype(np.float32) - - q_dev = 
nl.static_cast(q, dtype) - k_dev = nl.static_cast(k, dtype) - v_dev = nl.static_cast(v, dtype) - - numeric_func = baremetal(fused_self_attn_for_SD_small_head_size) - if simulation_only: - out = simulate_kernel(numeric_func[bs], q_dev, k_dev, v_dev) - else: - out = numeric_func[bs](q_dev, k_dev, v_dev) - out = nl.static_cast(out, np.float32) - golden_result = cpu_golden_attn(q, k, v) - assert np.allclose(out, golden_result, atol=1e-2) diff --git a/test/unit/test_adaptive_avg_pool2d.py b/test/unit/test_adaptive_avg_pool2d.py deleted file mode 100644 index f4fff4e..0000000 --- a/test/unit/test_adaptive_avg_pool2d.py +++ /dev/null @@ -1,123 +0,0 @@ -""" -Copyright (c) 2025, Amazon.com. All Rights Reserved - -Unit tests for adaptive average pooling 2D kernel -""" -import pytest -import numpy as np - -from nki_samples.reference.vision import adaptive_avg_pool2d_kernel -from neuronxcc.nki import benchmark, baremetal, simulate_kernel -import neuronxcc.nki.language as nl - - -bench_func = benchmark(warmup=5, iters=10)(adaptive_avg_pool2d_kernel) - -def cpu_golden_result_numpy(input_tensor, output_size): - """ - Pure NumPy reference implementation for adaptive average pooling - """ - N, C, H, W = input_tensor.shape - - if isinstance(output_size, int): - OH = OW = output_size - else: - OH, OW = output_size - - output = np.zeros((N, C, OH, OW), dtype=input_tensor.dtype) - - for n in range(N): - for c in range(C): - for oh in range(OH): - for ow in range(OW): - # Calculate the input region for this output position - h_start = (oh * H) // OH - h_end = ((oh + 1) * H + OH - 1) // OH - w_start = (ow * W) // OW - w_end = ((ow + 1) * W + OW - 1) // OW - - # Compute average over the region - region = input_tensor[n, c, h_start:h_end, w_start:w_end] - output[n, c, oh, ow] = np.mean(region) - - return output - - -class TestAdaptiveAvgPool2D: - - @pytest.mark.parametrize("N, C, H, W, output_size, dtype, latency", [ - # Test cases with various input/output sizes - [1, 64, 56, 56, 7, 
np.float32, 65 * 1.05], - [2, 128, 28, 28, 7, np.float32, 109 * 1.05], - [1, 256, 14, 14, 7, nl.float16, 88 * 1.05], - [4, 64, 56, 56, (7, 7), np.float32, 152 * 1.05], - [2, 128, 28, 28, (14, 14), np.float32, 327 * 1.05], - [1, 512, 7, 7, 1, np.float32, 14 * 1.05], # Global average pooling - [8, 64, 32, 32, (4, 4), nl.float16, 103 * 1.05], - [2, 2048, 32, 32, (3, 3), np.float32, 564 * 1.05], - [2, 64, 256, 256, 1, np.float32, 255 * 1.05], - ]) - def test_adaptive_avg_pool2d_perf(self, N, C, H, W, output_size, dtype, latency): - """Performance test for adaptive average pooling 2D""" - # Generate random input - input_tensor = np.random.randn(N, C, H, W).astype(np.float32) - - # Cast to target dtype - input_dev = nl.static_cast(input_tensor, dtype) - - # Run benchmark - bench_func(input_dev, output_size) - latency_res = bench_func.benchmark_result.nc_latency - p99 = latency_res.get_latency_percentile(99) - - @pytest.mark.simulation - @pytest.mark.parametrize("N, C, H, W, output_size, dtype", [ - # Numerical accuracy test cases - [1, 64, 56, 56, 7, np.float32], - [2, 128, 28, 28, 7, np.float32], - [1, 256, 14, 14, 7, nl.float16], - [1, 256, 14, 14, 7, nl.bfloat16], - [4, 64, 56, 56, (7, 7), np.float32], - [2, 128, 28, 28, (14, 14), np.float32], - [1, 512, 7, 7, 1, np.float32], # Global average pooling - [1, 64, 32, 32, (8, 8), nl.float16], - [3, 96, 24, 24, (6, 6), nl.bfloat16], - [2, 192, 16, 16, (4, 2), np.float32], # Non-square output - [2, 2048, 32, 32, (3, 3), np.float32], - [1, 1, 1, 1, 1, np.float32], # Single element - [1, 128, 7, 7, 7, np.float32], # Same input and output size - [2, 64, 256, 256, 1, np.float32], # Global pooling on large input - [1, 256, 13, 13, 6, np.float32], # Non-divisible dimensions - [4, 512, 7, 7, (1, 1), np.float32], # Tuple output for global pooling - ]) - def test_adaptive_avg_pool2d_numerical(self, simulation_only, N, C, H, W, output_size, dtype): - """Numerical accuracy test for adaptive average pooling 2D""" - # Generate random 
input - input_tensor = np.random.randn(N, C, H, W).astype(np.float32) - - # Cast to target dtype - input_dev = nl.static_cast(input_tensor, dtype) - - # Run kernel - numeric_func = baremetal(adaptive_avg_pool2d_kernel) - if simulation_only: - output_tensor = simulate_kernel(numeric_func, input_dev, output_size) - else: - output_tensor = numeric_func(input_dev, output_size) - - # Cast output back to float32 for comparison - output_tensor = nl.static_cast(output_tensor, np.float32) - - # Compute golden result using NumPy implementation - golden_result = cpu_golden_result_numpy(input_tensor, output_size) - - # Compare results with appropriate tolerance - if dtype in [nl.float16, nl.bfloat16]: - atol = 1e-2 - rtol = 1e-2 - else: - atol = 1e-5 - rtol = 1e-5 - - assert np.allclose(output_tensor, golden_result, atol=atol, rtol=rtol), \ - f"Output mismatch. Max diff: {np.max(np.abs(output_tensor - golden_result))}" diff --git a/test/unit/test_allocated_SD_attention_small_head.py b/test/unit/test_allocated_SD_attention_small_head.py deleted file mode 100644 index f4318d4..0000000 --- a/test/unit/test_allocated_SD_attention_small_head.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -Copyright (c) 2023, Amazon.com. 
All Rights Reserved -""" -import os -import pytest -from nki_samples.reference.allocated_attention import allocated_fused_self_attn_for_SD_small_head_size -from neuronxcc.nki import benchmark, baremetal, simulate_kernel -import neuronxcc.nki as nki -import neuronxcc.nki.language as nl -import numpy as np -from scipy.special import softmax - -test_trace_file_path='local_trace.ntff' - -bench_func = benchmark(warmup=20, iters=200, save_trace_name=test_trace_file_path)(allocated_fused_self_attn_for_SD_small_head_size) - -def cpu_golden_attn(q, k, v): - softmax_scale = 0.125 - q_scaled = q * softmax_scale - raw_score = np.matmul(q_scaled.transpose(0, 2, 1), k) - - norm_score = softmax(raw_score, axis=-1) - - # Transpose the result so it has the same layout as ours - return np.matmul(norm_score, v).transpose(0, 2, 1) - -class TestAttention: - - @pytest.mark.parametrize("bs,seqlen,d,dtype,latency", [ - [1, 4096, 128, np.float32, 410], - [1, 4096, 128, nl.bfloat16, 380], - [1, 5120, 128, nl.bfloat16, 586] - ]) - def test_allocated_attention_for_SD_perf(self, bs, seqlen, d, dtype, latency): - q = np.random.random_sample((bs, d, seqlen)).astype(np.float32) - k = np.random.random_sample((bs, d, seqlen)).astype(np.float32) - v = np.random.random_sample((bs, seqlen, d)).astype(np.float32) - - q_dev = nl.static_cast(q, dtype) - k_dev = nl.static_cast(k, dtype) - v_dev = nl.static_cast(v, dtype) - - bench_func_ = bench_func[bs] - bench_func_(q_dev, k_dev, v_dev) - latency_res = bench_func_.benchmark_result.nc_latency - p50 = latency_res.get_latency_percentile(50) - assert os.path.getsize(test_trace_file_path) > 0 - - @pytest.mark.simulation - @pytest.mark.parametrize("bs,seqlen,d,dtype", [ - [1, 4096, 128, np.float32], - [1, 4096, 128, nl.bfloat16], - [1, 5120, 128, nl.bfloat16] - ]) - def test_allocated_attention_for_SD_numberic(self, simulation_only, bs, seqlen, d, dtype): - q = np.random.random_sample((bs, d, seqlen)).astype(np.float32) - k = np.random.random_sample((bs, d, 
seqlen)).astype(np.float32) - v = np.random.random_sample((bs, seqlen, d)).astype(np.float32) - - q_dev = nl.static_cast(q, dtype) - k_dev = nl.static_cast(k, dtype) - v_dev = nl.static_cast(v, dtype) - - numeric_func = baremetal(allocated_fused_self_attn_for_SD_small_head_size) - if simulation_only: - out = simulate_kernel(numeric_func[bs], q_dev, k_dev, v_dev) - else: - out = numeric_func[bs](q_dev, k_dev, v_dev) - out = nl.static_cast(out, np.float32) - golden_result = cpu_golden_attn(q, k, v) - assert np.allclose(out, golden_result, atol=1e-2) diff --git a/test/unit/test_double_row_matmul.py b/test/unit/test_double_row_matmul.py deleted file mode 100644 index 9e49ecb..0000000 --- a/test/unit/test_double_row_matmul.py +++ /dev/null @@ -1,142 +0,0 @@ -""" -Copyright (c) 2025, Amazon.com. All Rights Reserved -""" -import pytest -from nki_samples.reference.double_row_matmul import quantized_double_row_matmul -from neuronxcc.nki import benchmark, baremetal, simulate_kernel -import neuronxcc.nki.language as nl -import numpy as np - -def get_target_string(): - """ returns instance type, e.g. trn1, inf2, trn2. """ - fpath = '/sys/devices/virtual/dmi/id/product_name' - try: - with open(fpath, 'r') as f: - fc = f.readline() - except IOError: - warnings.warn('Unable to read MLA target.') - return "" - - instance_type = fc.split('.')[0] - return instance_type - -bench_func = benchmark(warmup=5, iters=10)(quantized_double_row_matmul) - -def reshape(matrix): - """ - Interleaves every [128,512] tiles from every 2 tile rows. - - A [K,N] matrix is reshaped into [K//2, 2*N] where K must be divisible by 128 and - N must be divisible by 512. - - E.g. 
if Tij is the (i,j)-th tile and assuming a matrix with 4x4 [128,512] tiles, - the reshaped matrix looks as follows - - # T11 T12 T13 T14 - # T21 T22 T23 T24 reshape T11 T21 T12 T22 T13 T23 T14 T24 - # T31 T32 T33 T34 --------> T21 T41 T22 T42 T23 T43 T24 T44 - # T41 T42 T43 T44 - """ - K, N = matrix.shape - - TILE_K = 128 - TILE_N = 512 - - assert K % TILE_K == 0 - assert N % TILE_N == 0 - - result = np.zeros((K // 2, 2 * N)) - - for k in range(0, K // TILE_K, 2): - for n in range(N // TILE_N): - # Get 2 tiles in the same tile column and consecutive tile rows. - tile1 = matrix[k * TILE_K:(k + 1) * TILE_K, n * TILE_N:(n + 1) * TILE_N] - tile2 = matrix[(k + 1) * TILE_K:(k + 2) * TILE_K, n * TILE_N:(n + 1) * TILE_N] - - result[(k // 2) * TILE_K:(k // 2 + 1) * TILE_K, n * TILE_N * 2:n * TILE_N * 2 + TILE_N] = tile1 - result[(k//2) * TILE_K:(k // 2 + 1) * TILE_K, n * TILE_N * 2 + TILE_N:(n + 1) * TILE_N * 2] = tile2 - - # Place the 2 tiles in the same tile row side by side. - result[(k // 2) * TILE_K:(k // 2 + 1) * TILE_K, n * TILE_N * 2:n * TILE_N * 2+TILE_N] = tile1 - result[(k // 2) * TILE_K:(k // 2 + 1) * TILE_K, n * TILE_N * 2 + TILE_N:n * TILE_N * 2 + TILE_N + TILE_N] = tile2 - - return result - -def column_wise_quantize(matrix): - """ - Quantizes a matrix. - - Returns a column-wise scale broadcasted to (128, matrix.shape[1]) and the quantized matrix. 
- """ - FP8_RANGE = 240 - column_wise_max = np.max(np.abs(matrix), axis=0, keepdims=True) - column_wise_scale = column_wise_max / FP8_RANGE - - matrix_quantized = matrix / column_wise_scale - column_wise_scale = np.broadcast_to(column_wise_scale, (128, matrix.shape[1])) - - return column_wise_scale, matrix_quantized - -class TestDoubleRowMatmul: - - @pytest.mark.parametrize("M, K, N, dtype, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K, max_p99_latency", [ - [512, 16 * 1024, 1024, nl.bfloat16, 2, 2, 16, 320], - ]) - def test_double_row_matmul_perf(self, M, K, N, dtype, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K, max_p99_latency): - if (get_target_string() != "trn2"): - return - # Initializing random inputs - lhs = np.random.rand(M, K) - rhs = np.random.rand(K, N) - - # Quantizing rhs - rhs_scale, rhs_quantized = column_wise_quantize(rhs) - rhs_quantized_reshaped = reshape(rhs_quantized) - - # Casting to the correct data type (rhs is pre-quantized, thus casted to FP8) - lhs = nl.static_cast(lhs, dtype) - rhs_scale = nl.static_cast(rhs_scale, dtype) - rhs_quantized_reshaped = nl.static_cast(rhs_quantized_reshaped, nl.float8_e4m3) - - # Latency checks - bench_func(lhs, rhs_quantized_reshaped, rhs_scale, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K) - latency_res = bench_func.benchmark_result.nc_latency - p99_latency = latency_res.get_latency_percentile(99) - - @pytest.mark.simulation - @pytest.mark.parametrize("M, K, N, dtype, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K", [ - [512, 16 * 1024, 1024, nl.bfloat16, 2, 2, 16], - [512, 16 * 1024, 1024, nl.bfloat16, 4, 1, 32], - [512, 16 * 1024, 1024, nl.bfloat16, 4, 2, 128], - ]) - def test_double_row_matmul_numerical(self, simulation_only, M, K, N, dtype, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K): - if (get_target_string() != "trn2"): - return - # Initializing random inputs - lhs = np.random.rand(M, K) - rhs = np.random.rand(K, N) - - # Correct CPU results - result_golden = 
np.matmul(lhs, rhs) - - # Quantizing rhs - rhs_scale, rhs_quantized = column_wise_quantize(rhs) - rhs_quantized_reshaped = reshape(rhs_quantized) - - # Casting to the correct data type (rhs is pre-quantized, thus casted to FP8) - lhs = nl.static_cast(lhs, dtype) - rhs_scale = nl.static_cast(rhs_scale, dtype) - rhs_quantized_reshaped = nl.static_cast(rhs_quantized_reshaped, nl.float8_e4m3) - - # Numerical accuracy checks - numeric_func = baremetal(quantized_double_row_matmul) - - if simulation_only: - result_nki = simulate_kernel(numeric_func, lhs, rhs_quantized_reshaped, rhs_scale, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K) - else: - result_nki = numeric_func(lhs, rhs_quantized_reshaped, rhs_scale, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K) - - # Casting result_nki from dtype BF16 back to FP32 to compare the NumPy and NKI results - result_nki = result_nki.astype(np.float32) - - assert np.allclose(result_golden, result_nki, rtol=2e-2) diff --git a/test/unit/test_flash_attn_bwd.py b/test/unit/test_flash_attn_bwd.py deleted file mode 100644 index d1f0dfb..0000000 --- a/test/unit/test_flash_attn_bwd.py +++ /dev/null @@ -1,206 +0,0 @@ -""" -Copyright (c) 2023, Amazon.com. 
All Rights Reserved -""" -import pytest -from nki_samples.reference.attention import flash_attn_bwd -from neuronxcc.nki import benchmark, baremetal, simulate_kernel -import neuronxcc.nki.language as nl -import numpy as np - -xfail = pytest.mark.arch_specific_xfail - - -bench_func = benchmark(warmup=5, iters=10)(flash_attn_bwd) - -def softmax(x: np.ndarray, dim: int, zero_max_mode=False, - mixed_precision=False, return_max_reduce=False): - max_value = np.amax(x, axis=dim, keepdims=True) - max_value = np.maximum(0, max_value) if zero_max_mode else max_value - exp = np.exp(x - max_value) - if mixed_precision: - reduce = np.add.reduce(exp.astype(np.float32), axis=dim, keepdims=True).astype(x.dtype) - else: - reduce = np.add.reduce(exp, axis=dim, keepdims=True) - if return_max_reduce: - return exp / reduce, -max_value, np.reciprocal(reduce) - return exp / reduce - -def softmax_dx(dy: np.ndarray, y: np.ndarray, dim: int, mixed_precision=False): - # dx_i = (dy_i - sum(dy_k*y_k)) * y_i - prod = dy * y - if mixed_precision: - reduce = np.add.reduce(prod.astype(np.float32), axis=dim, keepdims=True).astype(dy.dtype) - else: - reduce = np.add.reduce(prod, axis=dim, keepdims=True) - subtract = dy - reduce - return subtract * y - -def cpu_attention_backward(q, k, v, dy, use_causal_mask=True, mixed_precision=True, sliding_window=-1, sinks=None): - """ - Compute the attention backward with the softmax recomputation - """ - def mixed_precision_matmul(a, b): - input_dtype = a.dtype - a, b = a.astype(np.float32), b.astype(np.float32) - c = np.matmul(a, b) - return c.astype(input_dtype) - - bs, nheads, d, seqlen = q.shape - _, nheads_kv, _, _ = k.shape - nheads_per_kv_head = nheads // nheads_kv - - # Expand key and value to match query heads - k_expanded = np.repeat(k, nheads_per_kv_head, axis=1) - v_expanded = np.repeat(v, nheads_per_kv_head, axis=1) - - # Compute golden output - softmax_scale = 1.0 / (d ** 0.5) - q_scaled = q * softmax_scale - raw_score = 
mixed_precision_matmul(q_scaled.transpose(0, 1, 3, 2), k_expanded) - - if use_causal_mask: - # raw_score has K seq in the most inner dim - # we want to mask all elements where Q idx is smaller than K idx with -inf - # this maps to the upper triangle of the final two axes - for i in range(raw_score.shape[0]): - for j in range(raw_score.shape[1]): - # -inf triggers invalid input error in softmax implementation, use a small negative instead - # k=1 to exclude the diagonal, because each token can still attend to itself - raw_score[i, j][np.triu_indices_from(raw_score[i, j], k=1)] = -9984.0 - - if sliding_window > 0: - q_pos = np.arange(raw_score.shape[2])[:, None] - k_pos = np.arange(raw_score.shape[3])[None, :] - sliding_window_mask = k_pos < (q_pos - sliding_window + 1) - raw_score[i, j][sliding_window_mask] = -9984.0 - - # Add sink tokens to the raw_score along the innermost dimension - if sinks is not None: - bs, nheads, seq_q, seq_k = raw_score.shape - sinks_expanded = np.broadcast_to(sinks.reshape(bs, nheads, 1, 1), (bs, nheads, seq_q, 1)) - combined_logits = np.concatenate([raw_score, sinks_expanded], axis=-1) - else: - combined_logits = raw_score - - norm_score_combined, cached_negative_max, cached_sum_reciprocal = \ - softmax(combined_logits, dim=-1, mixed_precision=mixed_precision, return_max_reduce=True) - - # Drop sink probabilities if sinks were added, keep only attention scores - if sinks is not None: - norm_score = norm_score_combined[..., :-1] # Remove last dimension (sink) - else: - norm_score = norm_score_combined - - # Calculate softmax_dy = (dL/dy)^T @ V - softmax_dy = mixed_precision_matmul(dy.transpose(0, 1, 3, 2), v_expanded) - - if sinks is not None: - # Combine gradients: softmax_dy for attention, zeros for sinks - combined_dy = np.concatenate([softmax_dy, np.zeros_like(norm_score_combined[..., -1:])], axis=-1) - combined_dx = softmax_dx(combined_dy, norm_score_combined, dim=-1, mixed_precision=mixed_precision) - softmax_dx_golden = 
combined_dx[..., :-1] # Extract attention gradients - dsinks_golden = np.sum(combined_dx[..., -1:], axis=(2, 3)) # Extract and sum sink gradients - else: - softmax_dx_golden = softmax_dx(softmax_dy, norm_score, dim=-1, mixed_precision=mixed_precision) - dsinks_golden = None - - # Calculate dv = (dL/dy) @ softmax_y - dv_expanded = mixed_precision_matmul(dy, norm_score) - dv_reshaped = dv_expanded.reshape(bs, nheads_kv, nheads_per_kv_head, d, seqlen) - dv_golden = np.sum(dv_reshaped, axis=2) - - # Calculate dq - dq_golden = mixed_precision_matmul(k_expanded, softmax_dx_golden.transpose(0, 1, 3, 2)) * softmax_scale - - # Calculate dk - dk_expanded = mixed_precision_matmul(q_scaled, softmax_dx_golden) - dk_reshaped = dk_expanded.reshape(bs, nheads_kv, nheads_per_kv_head, d, seqlen) - dk_golden = np.sum(dk_reshaped, axis=2) - - # Calculate output projection - o_proj = np.matmul(norm_score, v_expanded.transpose(0, 1, 3, 2)).transpose(0, 1, 3, 2) - - # Calculate - return dq_golden, dk_golden, dv_golden, cached_negative_max, cached_sum_reciprocal, o_proj, dsinks_golden - -class TestAttention: - - @xfail # P167481231 - @pytest.mark.parametrize("bs, nheads, seqlen, d, dtype, latency", [ - [1, 4, 32*1024, 128, nl.bfloat16, 117000], - ]) - @pytest.mark.parametrize("sinks", [False, True]) - def test_flash_attn_bwd_perf(self, bs, nheads, seqlen, d, dtype, latency, sinks): - q = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 - k = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 - v = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 - dy = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 - o_proj = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 - lse = np.random.random_sample([bs, nheads, nl.tile_size.pmax, seqlen // nl.tile_size.pmax]).astype(np.float32) - seed = None - sinks_tensor = np.random.randn(bs, nheads).astype(dtype) if sinks else None - - q = nl.static_cast(q, dtype) - k = nl.static_cast(k, dtype) - v = 
nl.static_cast(v, dtype) - o_proj = nl.static_cast(o_proj, dtype) - dy = nl.static_cast(dy, dtype) - - bench_func_ = bench_func[bs, nheads] - bench_func_(q, k, v, o_proj, dy, lse, seed, - use_causal_mask=True, mixed_precision=True, sinks=sinks_tensor) - latency_res = bench_func_.benchmark_result.nc_latency - p99 = latency_res.get_latency_percentile(50) - - @pytest.mark.simulation - @pytest.mark.parametrize("bs, nheads, nheads_kv, seqlen, d, dtype", [ - [1, 4, 4, 4096, 128, np.float32], - [1, 4, 1, 4096, 128, np.float32], - [1, 8, 2, 4096, 128, np.float32], - [1, 8, 2, 8192, 128, np.float32], - ]) - @pytest.mark.parametrize("sliding_window", [-1, 128]) - @pytest.mark.parametrize("sinks", [False, True]) - def test_flash_attn_bwd_numerical(self, simulation_only, bs, nheads, nheads_kv, seqlen, d, dtype, sliding_window, sinks): - q = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 - k = (np.random.random_sample([bs, nheads_kv, d, seqlen]) - 0.5) * 2 - v = (np.random.random_sample([bs, nheads_kv, d, seqlen]) - 0.5) * 2 - dy = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 - q = nl.static_cast(q, dtype) - k = nl.static_cast(k, dtype) - v = nl.static_cast(v, dtype) - dy = nl.static_cast(dy, dtype) - seed = None - sinks_tensor = np.random.randn(bs, nheads).astype(dtype) if sinks else None - - dq_golden, dk_golden, dv_golden, cached_negative_max, cached_sum_reciprocal, o_proj, dsinks_golden = \ - cpu_attention_backward(q, k, v, dy, use_causal_mask=True, sliding_window=sliding_window, sinks=sinks_tensor) - cached_negative_max = cached_negative_max.reshape(bs, nheads, seqlen // nl.tile_size.pmax, - nl.tile_size.pmax).transpose(0, 1, 3, 2) - cached_sum_reciprocal = cached_sum_reciprocal.reshape(bs, nheads, seqlen // nl.tile_size.pmax, - nl.tile_size.pmax).transpose(0, 1, 3, 2) - lse = -1.0 * (cached_negative_max + np.log(cached_sum_reciprocal)) - - numeric_func = baremetal(flash_attn_bwd) - if simulation_only: - kernel_outputs = 
simulate_kernel(numeric_func[bs, nheads_kv], q, k, v, o_proj, dy, lse, seed, - use_causal_mask=True, - mixed_precision=True, - sliding_window=sliding_window, sinks=sinks_tensor) - else: - kernel_outputs = numeric_func[bs, nheads_kv](q, k, v, o_proj, dy, lse, seed, - use_causal_mask=True, - mixed_precision=True, - sliding_window=sliding_window, sinks=sinks_tensor) - if sinks_tensor is not None: - if len(kernel_outputs) == 4: - out_dq, out_dk, out_dv, dsinks = kernel_outputs - assert np.allclose(dsinks, dsinks_golden, atol=1e-2) - else: - out_dq, out_dk, out_dv = kernel_outputs - else: - out_dq, out_dk, out_dv = kernel_outputs - - assert np.allclose(out_dq, dq_golden, atol=1e-2) - assert np.allclose(out_dk, dk_golden, atol=1e-2) - assert np.allclose(out_dv, dv_golden, atol=1e-2) diff --git a/test/unit/test_flash_attn_fwd.py b/test/unit/test_flash_attn_fwd.py deleted file mode 100644 index 33e02b0..0000000 --- a/test/unit/test_flash_attn_fwd.py +++ /dev/null @@ -1,166 +0,0 @@ -""" -Copyright (c) 2023, Amazon.com. 
All Rights Reserved -""" -import pytest -from nki_samples.reference.attention import flash_fwd, FlashConfig -from neuronxcc.nki import benchmark, baremetal, simulate_kernel -import neuronxcc.nki.language as nl -import numpy as np - -bench_func = benchmark(warmup=5, iters=10)(flash_fwd) - -def softmax(x: np.ndarray, dim: int, zero_max_mode=False, - mixed_precision=False, return_max_reduce=False): - max_value = np.amax(x, axis=dim, keepdims=True) - max_value = np.maximum(0, max_value) if zero_max_mode else max_value - exp = np.exp(x - max_value) - if mixed_precision: - reduce = np.add.reduce(exp.astype(np.float32), axis=dim, keepdims=True).astype(x.dtype) - else: - reduce = np.add.reduce(exp, axis=dim, keepdims=True) - if return_max_reduce: - return exp / reduce, -max_value, np.reciprocal(reduce) - return exp / reduce - - -def cpu_attention_forward(q, k, v, use_causal_mask=True, sliding_window=-1, mixed_precision=True): - def mixed_precision_matmul(a, b): - input_dtype = a.dtype - a, b = a.astype(np.float32), b.astype(np.float32) - c = np.matmul(a, b) - return c.astype(input_dtype) - - _, _, d, _ = q.shape - - # Compute golden output - softmax_scale = 1.0 / (d ** 0.5) - q_scaled = q * softmax_scale - nheads = q.shape[1] - kv_heads = k.shape[1] - if nheads > kv_heads: - k = np.repeat(k, nheads//kv_heads, axis=1) - v = np.repeat(v, nheads//kv_heads, axis=1) - raw_score = mixed_precision_matmul(q_scaled.transpose(0, 1, 3, 2), k) - - if use_causal_mask: - _, _, Q, K = raw_score.shape - - q_idx = np.arange(Q)[:, None] - k_idx = np.arange(K)[None, :] - - if sliding_window > 0: - mask = (k_idx > q_idx) | (k_idx < q_idx - sliding_window + 1) - else: # causal - mask = k_idx > q_idx - - # Broadcast mask to shape (1, 1, Q, K) - raw_score = raw_score.copy() - # -inf triggers invalid input error in softmax implementation, use a small negative instead - raw_score[:, :, mask] = -9984.0 - - - norm_score, cached_negative_max, cached_sum_reciprocal = \ - softmax(raw_score, dim=-1, 
mixed_precision=mixed_precision, return_max_reduce=True) - - # Transpose the result so it has the same layout as ours - out_golden = mixed_precision_matmul(norm_score, v.transpose(0, 1, 3, 2)).transpose(0, 1, 3, 2) - - return out_golden, cached_negative_max, cached_sum_reciprocal - -class TestAttention: - - @pytest.mark.parametrize("bs, nheads, seqlen_q, seqlen_k, d, dtype, use_causal_mask, \ - sliding_window, mixed_precision, training, tile_size, kv_heads, should_transpose_v, latency", [ - [1, 6, 32*1024, 32*1024, 96, nl.bfloat16, True, -1, True, True, 2048, 3, False, 87000000000], - [1, 1, 32*1024, 32*1024, 96, nl.bfloat16, True, -1, True, False, 2048, None, False, 15100000000], - # Non-square - [1, 3, 32*1024, 16*1024, 96, nl.bfloat16, True, -1, True, False, 2048, None, False, 7550000000], - [1, 3, 16*1024, 32*1024, 96, nl.bfloat16, True, -1, True, False, 2048, None, False, 7550000000], - # Causal vs. Sliding - test sliding window is faster - [1, 1, 16*1024, 16*1024, 96, nl.bfloat16, True, -1, True, False, 2048, None, False, 4000000000], - [1, 1, 16*1024, 16*1024, 96, nl.bfloat16, True, 4096, True, False, 2048, None, False, 3000000000], - ]) - def test_flash_attn_fwd_perf(self, bs, nheads, seqlen_q, seqlen_k, d, dtype, use_causal_mask, - sliding_window, mixed_precision, training, tile_size, kv_heads, should_transpose_v,latency): - q = (np.random.random_sample([bs, nheads, d, seqlen_q]) - 0.5) * 2 - k = (np.random.random_sample([bs, nheads, d, seqlen_k]) - 0.5) * 2 - if should_transpose_v: - v = (np.random.random_sample([bs, nheads, d, seqlen_k]) - 0.5) * 2 - else: - v = (np.random.random_sample([bs, nheads, seqlen_k, d]) - 0.5) * 2 - o_proj = np.zeros(shape=[bs, nheads, seqlen_q, d], dtype=dtype) - out_lse = np.zeros(shape=[bs, nheads, int(nl.tile_size.pmax), seqlen_q // nl.tile_size.pmax], - dtype=nl.float32 if mixed_precision else dtype) if training else None - seed = None - - q = nl.static_cast(q, dtype) - k = nl.static_cast(k, dtype) - v = nl.static_cast(v, 
dtype) - config = FlashConfig(**{'seq_tile_size':tile_size, 'training':training, 'should_transpose_v':should_transpose_v}) - - heads = nheads if kv_heads is None else kv_heads - - bench_func_ = bench_func[bs, heads] - bench_func_(q, k, v, seed, use_causal_mask=use_causal_mask, sliding_window=sliding_window, - mixed_precision=mixed_precision, config=config) - latency_res = bench_func_.benchmark_result.nc_latency - p50 = latency_res.get_latency_percentile(50) - - @pytest.mark.simulation - @pytest.mark.parametrize("bs, nheads, seqlen_q, seqlen_k, d, dtype, use_causal_mask, \ - sliding_window, training, tile_size, kv_heads, should_transpose_v", [ - [1, 6, 4096, 4096, 128, np.float32, True, -1, True, 2048, 3, False], - [1, 1, 4096, 4096, 128, np.float32, True, -1, False, 2048, None, False], - [1, 1, 8192, 4096, 128, np.float32, True, -1, False, 2048, None, False], - [1, 1, 4096, 8192, 128, np.float32, True, -1, False, 2048, None, False], - [1, 1, 4096, 4096, 128, np.float32, True, 1024, False, 2048, None, False], - ]) - def test_flash_attn_fwd_numerical(self, simulation_only, bs, nheads, seqlen_q, seqlen_k, d, dtype, use_causal_mask, - sliding_window, training, tile_size, kv_heads, should_transpose_v): - q = (np.random.random_sample([bs, nheads, d, seqlen_q]) - 0.5) * 2 - k = (np.random.random_sample([bs, kv_heads or nheads, d, seqlen_k]) - 0.5) * 2 - if should_transpose_v: - v = (np.random.random_sample([bs, nheads, d, seqlen_k]) - 0.5) * 2 - cpu_permute = (0, 1, 2, 3) - else: - v = (np.random.random_sample([bs, kv_heads or nheads, seqlen_k, d]) - 0.5) * 2 - cpu_permute = (0, 1, 3, 2) - - q = nl.static_cast(q, dtype) - k = nl.static_cast(k, dtype) - v = nl.static_cast(v, dtype) - seed = None - - o_proj_golden, cached_negative_max, cached_sum_reciprocal = \ - cpu_attention_forward(q, k, v.transpose(cpu_permute), use_causal_mask=use_causal_mask, sliding_window=sliding_window, mixed_precision=True) - o_proj_golden = o_proj_golden.transpose(0,1,3,2) # (b,h, d, seq) - 
cached_negative_max = cached_negative_max.reshape(bs, nheads, seqlen_q // nl.tile_size.pmax, - nl.tile_size.pmax).transpose(0, 1, 3, 2) - cached_sum_reciprocal = cached_sum_reciprocal.reshape(bs, nheads, seqlen_q // nl.tile_size.pmax, - nl.tile_size.pmax).transpose(0, 1, 3, 2) - lse_golden = -1.0 * (cached_negative_max + np.log(cached_sum_reciprocal)) if training else None - config = FlashConfig(**{'seq_tile_size':tile_size, 'training':training, 'should_transpose_v':should_transpose_v}) - - heads = nheads if kv_heads is None else kv_heads - - numeric_func = baremetal(flash_fwd) - if simulation_only: - results = simulate_kernel(numeric_func[bs, heads], q, k, v, seed, - use_causal_mask=use_causal_mask, - sliding_window=sliding_window, - mixed_precision=True, - config=config) - else: - results = numeric_func[bs, heads](q, k, v, seed, - use_causal_mask=use_causal_mask, - sliding_window=sliding_window, - mixed_precision=True, - config=config) - - if training: - o_proj, out_lse = results - assert np.allclose(o_proj, o_proj_golden, atol=1e-2) - assert np.allclose(out_lse, lse_golden, atol=1e-2) - else: - o_proj = results - assert np.allclose(o_proj, o_proj_golden, atol=1e-2) diff --git a/test/unit/test_neuron_profile.py b/test/unit/test_neuron_profile.py deleted file mode 100644 index e607705..0000000 --- a/test/unit/test_neuron_profile.py +++ /dev/null @@ -1,86 +0,0 @@ -from neuronxcc.nki import benchmark -from neuronxcc.nki import profile -import neuronxcc.nki.language as nl -import numpy as np -import pytest -import os -import shutil -import tempfile - - -WORKING_DIRECTORY = tempfile.mkdtemp() -SAVE_NEFF_NAME = "cus_file123.neff" -SAVE_TRACE_NAME = "profile-custom.ntff" -NUM_EXECS = 20 -PROFILE_NTH = 10 -JSON_REPORTS = "json_reports" - -@profile(working_directory=WORKING_DIRECTORY, save_neff_name=SAVE_NEFF_NAME, overwrite=False , save_trace_name=SAVE_TRACE_NAME, num_execs=NUM_EXECS, profile_nth=PROFILE_NTH) -def nki_tensor_tensor_add(a_tensor, b_tensor): - c_output = 
from neuronxcc.nki import benchmark
from neuronxcc.nki import profile
import neuronxcc.nki.language as nl
import numpy as np
import pytest
import os
import shutil
import tempfile


# Module-level constants drive both the @profile decorator and the
# artifact-path assertions below; keep them in sync.
WORKING_DIRECTORY = tempfile.mkdtemp()
SAVE_NEFF_NAME = "cus_file123.neff"
SAVE_TRACE_NAME = "profile-custom.ntff"
NUM_EXECS = 20
PROFILE_NTH = 10
JSON_REPORTS = "json_reports"


@profile(working_directory=WORKING_DIRECTORY, save_neff_name=SAVE_NEFF_NAME, overwrite=False,
         save_trace_name=SAVE_TRACE_NAME, num_execs=NUM_EXECS, profile_nth=PROFILE_NTH)
def nki_tensor_tensor_add(a_tensor, b_tensor):
    """Element-wise add kernel used only to exercise the profiler."""
    c_output = nl.ndarray(a_tensor.shape, dtype=a_tensor.dtype, buffer=nl.shared_hbm)

    a = nl.load(a_tensor)
    b = nl.load(b_tensor)

    c_tile = a + b

    nl.store(c_output, value=c_tile)

    return c_output


class TestNeuronProfile:
    def _get_ntff_path(self, trace_val):
        """
        Prepares ntff file name based on execution trace number
        """
        # The first trace has no suffix; later ones carry _exec_<n>.
        if trace_val == 1:
            return os.path.join(WORKING_DIRECTORY, f"{os.path.splitext(os.path.basename(SAVE_TRACE_NAME))[0]}.ntff")
        else:
            return os.path.join(WORKING_DIRECTORY, f"{os.path.splitext(os.path.basename(SAVE_TRACE_NAME))[0]}_exec_{trace_val}.ntff")

    @pytest.fixture
    def traces(self):
        """Expected trace file paths: one per PROFILE_NTH-th execution."""
        ret = []
        if NUM_EXECS < PROFILE_NTH:
            ret.append(self._get_ntff_path(PROFILE_NTH))
        else:
            curr = PROFILE_NTH
            while curr <= NUM_EXECS:
                ret.append(self._get_ntff_path(curr))
                curr += PROFILE_NTH
        return ret

    @pytest.fixture
    def num_reports(self):
        """Expected number of JSON reports (one per profiled execution)."""
        if NUM_EXECS < PROFILE_NTH:
            return 1
        else:
            return NUM_EXECS // PROFILE_NTH

    def test_output_artifacts_created(self, traces, num_reports):
        """Run the profiled kernel and assert neff/trace/report artifacts exist."""
        # delete artifact directory, only testing non-overwrite functionality
        if os.path.exists(WORKING_DIRECTORY):
            shutil.rmtree(WORKING_DIRECTORY)

        # creates dummy input to invoke profile kernel
        a = np.zeros([128, 1024]).astype(np.float16)
        b = np.random.random_sample([128, 1024]).astype(np.float16)

        # Fix: the return value was bound to an unused variable; only the
        # side effects (profiler artifacts) matter here.
        nki_tensor_tensor_add(a, b)

        # now asserting artifacts are correctly created
        assert os.path.exists(os.path.join(WORKING_DIRECTORY, SAVE_NEFF_NAME))  # neff

        for trace in traces:
            assert os.path.exists(trace)  # trace

        # json reports
        report_dir = os.path.join(WORKING_DIRECTORY, JSON_REPORTS)

        assert os.path.exists(report_dir)  # actually exists
        assert len(os.listdir(report_dir)) == num_reports  # report all iterations queried

        # post condition cleanup
        if os.path.exists(WORKING_DIRECTORY):
            shutil.rmtree(WORKING_DIRECTORY)
"""
Copyright (c) 2023, Amazon.com. All Rights Reserved
"""
import pytest

from nki_samples.reference.vision import resize_nearest_fixed_dma_kernel
from neuronxcc.nki import benchmark, baremetal, simulate_kernel
import neuronxcc.nki.language as nl
import numpy as np

bench_func = benchmark(warmup=5, iters=10)(resize_nearest_fixed_dma_kernel)


def cpu_golden_result(data_tensor, output_shape):
    """NumPy golden nearest-neighbor resize over NHWC tensors.

    NOTE(review): the copy loops iterate batches over in_b and channels over
    out_c, so this assumes in_b == out_b and in_c == out_c — all current
    parametrizations satisfy that; confirm before widening them.
    """
    in_b, in_h, in_w, in_c = data_tensor.shape
    out_b, out_h, out_w, out_c = output_shape

    # Generate nearest map: for each output pixel, the flattened index of
    # the nearest source pixel in the (in_h * in_w) plane.
    h_scale, w_scale = 1.0 * in_h / out_h, 1.0 * in_w / out_w
    h_map = np.floor(np.fromfunction(lambda i, _: i * h_scale, (out_h, out_w), dtype=np.float32))
    w_map = np.floor(np.fromfunction(lambda _, j: j * w_scale, (out_h, out_w), dtype=np.float32))
    # Fix: renamed from "map", which shadowed the builtin.
    src_index = (h_map * in_w + w_map).astype(np.int32).flatten()

    in_seqlen, out_seqlen = in_h * in_w, out_h * out_w

    data_tensor = data_tensor.reshape((in_b, in_seqlen, in_c))
    out_tensor = np.zeros((out_b, out_seqlen, out_c))

    for batch in range(in_b):
        for i in range(len(src_index)):
            for chan in range(out_c):
                out_tensor[batch, i, chan] = data_tensor[batch, src_index[i], chan]

    return out_tensor.reshape((out_b, out_h, out_w, out_c))


class TestResizeNearest:

    @pytest.mark.parametrize("in_b, in_h, in_w, in_c, out_b, out_h, out_w, out_c, dtype, latency", [
        [10, 30, 20, 1280, 10, 59, 38, 1280, np.float32, 1740],
        [1, 30, 20, 1280, 1, 59, 38, 1280, nl.float16, 659],
        [1, 30, 20, 1280, 1, 59, 38, 1280, nl.bfloat16, 659],
    ])
    def test_resize_nearest_for_perf(self, in_b, in_h, in_w, in_c, out_b, out_h, out_w, out_c, dtype, latency):
        """Benchmark the resize kernel; output is not validated here."""
        input_tensor = np.random.random_sample((in_b, in_h, in_w, in_c)).astype(np.float32)

        input_dev = nl.static_cast(input_tensor, dtype)

        bench_func_ = bench_func[in_b]
        bench_func_(input_dev, (out_b, out_h, out_w, out_c))
        latency_res = bench_func_.benchmark_result.nc_latency
        # Fix: this fetches the 50th percentile, so name it p50 (was "p99").
        p50 = latency_res.get_latency_percentile(50)

    @pytest.mark.simulation
    @pytest.mark.parametrize("in_b, in_h, in_w, in_c, out_b, out_h, out_w, out_c, dtype", [
        [10, 30, 20, 1280, 10, 59, 38, 1280, np.float32],
        [1, 30, 20, 1280, 1, 59, 38, 1280, nl.float16],
        [1, 30, 20, 1280, 1, 59, 38, 1280, nl.bfloat16],
    ])
    def test_resize_nearest_for_numberic(self, simulation_only, in_b, in_h, in_w, in_c, out_b, out_h, out_w, out_c, dtype):
        """Compare kernel output against the NumPy golden resize.

        NOTE(review): "numberic" is a typo for "numeric"; kept to avoid
        breaking any CI selection by test name.
        """
        input_tensor = np.random.random_sample((in_b, in_h, in_w, in_c)).astype(np.float32)

        input_dev = nl.static_cast(input_tensor, dtype)

        numeric_func = baremetal(resize_nearest_fixed_dma_kernel)
        if simulation_only:
            output_tensor = simulate_kernel(numeric_func[in_b], input_dev, (out_b, out_h, out_w, out_c))
        else:
            output_tensor = numeric_func[in_b](input_dev, (out_b, out_h, out_w, out_c))
        output_tensor = nl.static_cast(output_tensor, np.float32)
        golden_result = cpu_golden_result(input_tensor, output_tensor.shape)
        assert np.allclose(output_tensor, golden_result, atol=1e-2)
All Rights Reserved -""" -import pytest -from nki_samples.reference.allocated_fused_linear import allocated_fused_rms_norm_qkv -from neuronxcc.nki import benchmark, baremetal, simulate_kernel -import neuronxcc.nki.language as nl -import numpy as np - -bench_func = benchmark(warmup=5, iters=10)(allocated_fused_rms_norm_qkv) - -np.random.seed(0) - - -def rms_norm(hidden, gamma, eps=1e-6): - rms = np.sqrt(np.mean(np.square(hidden), axis=-1, keepdims=True)) - norm = hidden * np.reciprocal(rms + eps) - if gamma is not None: - norm *= gamma - return norm - -def cpu_golden_result(hidden, gamma, qkv_weights, dtype, do_norm=True): - if do_norm: - hidden = rms_norm(hidden, gamma) - qkv_out = (hidden @ qkv_weights).astype(dtype) - return qkv_out - -class TestRMSNormQKV: - @pytest.mark.parametrize("batch, seqlen, dim, d_head, dtype, latency", [ - [1, 128, 512, 512, np.float16, 25], - [1, 512, 1024, 384, nl.bfloat16, 40], - [1, 128, 1024, 512, nl.bfloat16, 28], - # [1, 1024, 8192, 512, nl.bfloat16, 301 * 1.02], # FIXME: performance is flaky - ]) - def test_allocated_rmsnorm_qkv_perf(self, batch, seqlen, dim, d_head, dtype, latency): - hidden = np.random.random_sample((batch, seqlen, dim)).astype(np.float32) - weights = np.random.random_sample((dim, d_head)).astype(np.float32) - - hidden = nl.static_cast(hidden, dtype) - weights = nl.static_cast(weights, dtype) - - bench_func(hidden, weights) - latency_res = bench_func.benchmark_result.nc_latency - p99 = latency_res.get_latency_percentile(50) - - @pytest.mark.simulation - @pytest.mark.parametrize("batch, seqlen, dim, d_head, dtype", [ - [1, 128, 512, 512, np.float16], - [1, 512, 1024, 384, nl.bfloat16], - [1, 128, 1024, 512, nl.bfloat16], - [1, 1024, 8192, 512, nl.bfloat16] - ]) - def test_allocated_rmsnorm_qkv_numeric(self, simulation_only, batch, seqlen, dim, d_head, dtype): - hidden = np.random.random_sample((batch, seqlen, dim)) - weights = np.random.random_sample((dim, d_head)) - - hidden_dev = nl.static_cast(hidden, dtype) 
- weights_dev = nl.static_cast(weights, dtype) - - numeric_func = baremetal(allocated_fused_rms_norm_qkv) - if simulation_only: - out = simulate_kernel(numeric_func, hidden_dev, weights_dev) - else: - out = numeric_func(hidden_dev, weights_dev) - out = nl.static_cast(out, np.float32) - golden_res = nl.static_cast(cpu_golden_result(hidden, None, weights, dtype, do_norm=True), np.float32) - assert np.allclose(out, golden_res, atol=1e-2, rtol=1e-2) -