diff --git a/.github/workflows/run_simulation_tests.yml b/.github/workflows/run_simulation_tests.yml deleted file mode 100644 index 7ea89ef..0000000 --- a/.github/workflows/run_simulation_tests.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: Run Python Simulation Tests - -on: - push: - branches: [ "main" ] - pull_request: - branches: [ "main" ] - -permissions: - contents: read - -jobs: - build: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - name: Set up Python 3.10 - uses: actions/setup-python@v3 - with: - python-version: "3.10" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com - python -m pip install wget awscli - python -m pip install pytest - python -m pip install neuronx-cc==2.* - - name: Test with pytest - run: | - PYTHONPATH=$PYTHONPATH:src/ pytest test/unit/ --simulation-only \ No newline at end of file diff --git a/test/integration/flash_attention/flash_attention.py b/test/integration/flash_attention/flash_attention.py deleted file mode 100644 index 5e14a95..0000000 --- a/test/integration/flash_attention/flash_attention.py +++ /dev/null @@ -1,160 +0,0 @@ -import os - -import neuronxcc.nki.language as nl -import numpy as np -import torch -import torch_xla.core.xla_model as xm - -from neuronxcc.nki.kernels.attention import flash_attn_bwd, flash_fwd - - -def _flash_attn_forward(q, k, v, causal, mixed_precision, seed, dropout_p, softmax_scale, sliding_window): - # flash_fwd assumes spmd grid uses key/value heads - bs, num_heads_kv, _, _ = k.shape - attn_output, lse = flash_fwd[bs, num_heads_kv]( - q, - k, - v, - seed, - use_causal_mask=causal, - mixed_precision=mixed_precision, - dropout_p=dropout_p, - softmax_scale=softmax_scale, - sliding_window=sliding_window, - ) - return attn_output, lse - - -def _flash_attn_backward( - q, k, v, o, dout, lse, seed, causal, mixed_precision, dropout_p, softmax_scale, sliding_window -): - bs, 
num_heads_kv, _, _ = k.shape - dq, dk, dv = flash_attn_bwd[bs, num_heads_kv]( - q, - k, - v, - o, - dout, - lse, - seed, - use_causal_mask=causal, - mixed_precision=mixed_precision, - dropout_p=dropout_p, - softmax_scale=softmax_scale, - sliding_window=sliding_window, - ) - return dq, dk, dv - - -class NKIAttnFunc(torch.autograd.Function): - @staticmethod - def forward( - ctx, - q, - k, - v, - softmax_scale, - causal: bool, - mixed_precision: bool, - seed, - dropout_p: float, - sliding_window: int, - ): - if softmax_scale is None: - softmax_scale = q.shape[-2] ** (-0.5) - - if seed is None and dropout_p > 0.0: - # NKI only supports 32bit seed - seed = np.array([xm.get_rng_state()]).astype(np.int32) - seed = torch.from_numpy(seed).to(q.device) - - attn_output, lse = _flash_attn_forward( - q, - k, - v.permute(0, 1, 3, 2), - causal=causal, - mixed_precision=mixed_precision, - seed=seed, - dropout_p=dropout_p, - softmax_scale=softmax_scale, - sliding_window=sliding_window, - ) - ctx.save_for_backward(q, k, v, attn_output, lse, seed) - ctx.causal = causal - ctx.mixed_precision = mixed_precision - ctx.dropout_p = dropout_p - ctx.softmax_scale = softmax_scale - ctx.sliding_window = sliding_window - - # Move seed manually if the dropout is used - # https://github.com/pytorch/xla/blob/v1.13.0/torch_xla/csrc/tensor.cpp#L323 - if dropout_p > 0.0: - orig_seed = xm.get_rng_state() - running_seed = (orig_seed * 214013 + 2531011) & 0xFFFFFFFFFFFFFFFF - xm.set_rng_state(int(running_seed)) - return attn_output - - @staticmethod - def backward(ctx, dout, *args): - q, k, v, attn_output, lse, seed = ctx.saved_tensors - dout = dout.permute(0, 1, 3, 2) - attn_output = attn_output.permute(0, 1, 3, 2) - dq, dk, dv = _flash_attn_backward( - q, - k, - v, - attn_output, - dout, - lse, - seed=seed, - causal=ctx.causal, - mixed_precision=ctx.mixed_precision, - dropout_p=ctx.dropout_p, - softmax_scale=ctx.softmax_scale, - sliding_window=ctx.sliding_window, - ) - return dq, dk, dv, None, None, 
None, None, None, None - - -def nki_flash_attn_func( - q, - k, - v, - dropout_p=0.0, - softmax_scale=None, - causal=True, - mixed_precision=True, - seed=None, - sliding_window=-1, -): - """ - Arguments: - q: (batch_size, nheads, seqlen, headdim) - k: (batch_size, nheads_k, seqlen, headdim) - v: (batch_size, nheads_k, seqlen, headdim) - dropout_p: float. Dropout probability. - softmax_scale: float. The scaling of QK^T before applying softmax. - Default to 1 / sqrt(headdim). - causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). - mixed_precision: bool. Whether to enable the higher precisions on the softmax. - seed: int32 torch.Tensor. The seed for the dropout. - - Return: - out: (batch_size, seqlen, nheads, headdim). - """ - - _, _, seqlen, _ = q.shape - if seqlen % 2048 != 0: - raise NotImplementedError("Only support sequence as multiples of 2K") - - # Permute QKV to match the kernel required layouts - q = q.permute(0, 1, 3, 2) - k = k.permute(0, 1, 3, 2) - v = v.permute(0, 1, 3, 2) - - if os.environ.get("XLA_USE_BF16") or os.environ.get("XLA_DOWNCAST_BF16"): - q = q.to(torch.bfloat16) - k = k.to(torch.bfloat16) - v = v.to(torch.bfloat16) - - return NKIAttnFunc.apply(q, k, v, softmax_scale, causal, mixed_precision, seed, dropout_p, sliding_window) diff --git a/test/integration/flash_attention/flash_attention_benchmark.py b/test/integration/flash_attention/flash_attention_benchmark.py deleted file mode 100644 index 918a14f..0000000 --- a/test/integration/flash_attention/flash_attention_benchmark.py +++ /dev/null @@ -1,55 +0,0 @@ -import torch -import unittest -import os -import sys -import pytest -import torch.nn as nn - -os.environ["NEURON_CC_FLAGS"] = "--model-type=transformer --distribution-strategy=llm-training" -os.environ["NEURON_FUSE_SOFTMAX"] = "1" - -import torch_xla -import torch_xla.core.xla_model as xm -from neuronxcc.starfish.support.util import allclose - -from flash_attention import nki_flash_attn_func - 
-parent_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) -sys.path.append(parent_dir) -from perf_utils.LatencyCollector import benchmark - -if len(sys.argv) != 2: - print("Usage: python flash_attention_benchmark.py ") - exit(1) -metric_path = os.path.abspath(sys.argv[1]) - -torch.manual_seed(0) -dtype = torch.bfloat16 -bs = 1 -num_heads = 4 -head_dim = 128 -seq_len = 32*1024 -query_states_cpu = torch.randn(bs, num_heads, seq_len, head_dim, dtype=dtype) - 0.5 -query_states_cpu.requires_grad_() -key_states_cpu = torch.randn(bs, num_heads, seq_len, head_dim, dtype=dtype) - 0.5 -key_states_cpu.requires_grad_() -value_states_cpu = torch.randn(bs, num_heads, seq_len, head_dim, dtype=dtype) - 0.5 -value_states_cpu.requires_grad_() - -# Run the Neuron kernel implementation with torch-xla -query_states_xla = query_states_cpu.to(xm.xla_device()).detach().requires_grad_() -key_states_xla = key_states_cpu.to(xm.xla_device()).detach().requires_grad_() -value_states_xla = value_states_cpu.to(xm.xla_device()).detach().requires_grad_() - -def model_fwd_bwd(q, k, v): - attn_nki = nki_flash_attn_func(q, k, v) - loss_actual = torch.sum(attn_nki**2) - loss_actual.backward() - xm.mark_step() - return loss_actual - -n_runs = 10 -bench_result = benchmark(n_runs, f"flash_attention_bs{bs}_heads{num_heads}_seq{seq_len}_head_dim{head_dim}", - model_fwd_bwd, - (query_states_xla, key_states_xla, value_states_xla), - metric_path) \ No newline at end of file diff --git a/test/integration/flash_attention/flash_attention_correctness.py b/test/integration/flash_attention/flash_attention_correctness.py deleted file mode 100644 index 945336f..0000000 --- a/test/integration/flash_attention/flash_attention_correctness.py +++ /dev/null @@ -1,78 +0,0 @@ -import torch -import unittest -import os -import sys -import pytest -import torch.nn as nn - -os.environ["NEURON_CC_FLAGS"] = "--model-type=transformer --distribution-strategy=llm-training" -os.environ["NEURON_FUSE_SOFTMAX"] = "1" - 
-import torch_xla -import torch_xla.core.xla_model as xm -from neuronxcc.starfish.support.util import allclose - -from flash_attention import nki_flash_attn_func - -@pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float32]) -@pytest.mark.parametrize('causal,sliding_window', [(True, -1), (True, 128), (False, -1)]) -@pytest.mark.parametrize('num_heads,num_kv_heads', [(4, 4), (4, 1), (8, 2)]) # MHA, MQA, GQA -def test_attention(dtype, causal, sliding_window, num_heads, num_kv_heads): - def torch_golden_attn_cpu(query_states, key_states, value_states): - if num_heads != num_kv_heads: - key_states = key_states.repeat_interleave(num_heads // num_kv_heads, dim=1) - value_states = value_states.repeat_interleave(num_heads // num_kv_heads, dim=1) - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * (query_states.shape[-1] ** (-0.5)) - - if sliding_window > 0: - causal_mask = torch.triu( - torch.tril(torch.ones(1, 1, query_states.shape[2], key_states.shape[2])), - diagonal=-(sliding_window - 1), - ).bool() - causal_mask = ~causal_mask - attn_weights = attn_weights.masked_fill_(causal_mask, -10000.0) - elif causal: - causal_mask = torch.triu(torch.ones((1, 1, query_states.shape[2], key_states.shape[2])), diagonal=1).bool() - attn_weights = attn_weights.masked_fill_(causal_mask, -10000.0) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.double).to(query_states.dtype) - - attn_output = torch.matmul(attn_weights, value_states) - return attn_output - - torch.manual_seed(0) - bs = 1 - head_dim = 128 - seq_len = 4096 - query_states_cpu = torch.randn(bs, num_heads, seq_len, head_dim, dtype=dtype) - 0.5 - query_states_cpu.requires_grad_() - key_states_cpu = torch.randn(bs, num_kv_heads, seq_len, head_dim, dtype=dtype) - 0.5 - key_states_cpu.requires_grad_() - value_states_cpu = torch.randn(bs, num_kv_heads, seq_len, head_dim, dtype=dtype) - 0.5 - value_states_cpu.requires_grad_() - - # Run the CPU golden results - golden_attn = 
torch_golden_attn_cpu(query_states_cpu, key_states_cpu, value_states_cpu) - loss_golden = torch.sum(golden_attn**2) - loss_golden.backward() - - # Run the Neuron kernel implementation with torch-xla - query_states_xla = query_states_cpu.to(xm.xla_device()).detach().requires_grad_() - key_states_xla = key_states_cpu.to(xm.xla_device()).detach().requires_grad_() - value_states_xla = value_states_cpu.to(xm.xla_device()).detach().requires_grad_() - attn_nki = nki_flash_attn_func( - query_states_xla, key_states_xla, value_states_xla, causal=causal, sliding_window=sliding_window - ) - loss_actual = torch.sum(attn_nki**2) - loss_actual.backward() - xm.mark_step() - - actual_dv = value_states_xla.grad.to('cpu') - actual_dq = query_states_xla.grad.to('cpu') - actual_dk = key_states_xla.grad.to('cpu') - - # Compare against cpu result - assert(allclose(loss_actual.to('cpu').to(torch.float32).detach().numpy(), loss_golden.to(torch.float32).detach().numpy(), atol=1e-5, rtol=0.05, verbose=1)) - assert(allclose(actual_dv.to(torch.float32).numpy(), value_states_cpu.grad.to(torch.float32).numpy(), atol=1e-5, rtol=0.05, verbose=1)) - assert(allclose(actual_dq.to(torch.float32).numpy(), query_states_cpu.grad.to(torch.float32).numpy(), atol=1e-5, rtol=0.05, verbose=1)) - assert(allclose(actual_dk.to(torch.float32).numpy(), key_states_cpu.grad.to(torch.float32).numpy(), atol=1e-5, rtol=0.05, verbose=1)) \ No newline at end of file diff --git a/test/integration/flash_attention/run.sh b/test/integration/flash_attention/run.sh deleted file mode 100644 index 02870a4..0000000 --- a/test/integration/flash_attention/run.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -set -e - -if [ "$#" -ne 1 ]; then - echo "Illegal number of parameters. 
Must call with ./run.sh " - exit 1 -fi - -metric_path=$1 - -echo ${test_name} is writing the benchmark result to ${metric_path} - -pytest flash_attention_correctness.py -python flash_attention_benchmark.py $metric_path - diff --git a/test/integration/fused_sd_attention_small_head/run.sh b/test/integration/fused_sd_attention_small_head/run.sh deleted file mode 100755 index 638bf3c..0000000 --- a/test/integration/fused_sd_attention_small_head/run.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -set -e - - -if [ "$#" -ne 1 ]; then - echo "Illegal number of parameters. Must call with ./run.sh " - exit 1 -fi - -metric_path=$1 - -echo ${test_name} is writing the benchmark result to ${metric_path} - -pip install diffusers==0.20.2 transformers==4.33.1 accelerate==0.22.0 safetensors==0.3.1 matplotlib -python3 sd2_512_compile.py -python3 sd2_512_benchmark.py $1 \ No newline at end of file diff --git a/test/integration/fused_sd_attention_small_head/sd2_512_benchmark.py b/test/integration/fused_sd_attention_small_head/sd2_512_benchmark.py deleted file mode 100644 index 5d63424..0000000 --- a/test/integration/fused_sd_attention_small_head/sd2_512_benchmark.py +++ /dev/null @@ -1,99 +0,0 @@ -import os -os.environ["NEURON_FUSE_SOFTMAX"] = "1" - -import torch -import torch.nn as nn -import torch_neuronx - -from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler -from diffusers.models.unet_2d_condition import UNet2DConditionOutput - -parent_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) -sys.path.append(parent_dir) -from perf_utils.LatencyCollector import benchmark - -import sys -if len(sys.argv) != 2: - print("Usage: python sd2_512_benchmark.py ") - exit(1) -metric_path = os.path.abspath(sys.argv[1]) - -# Define datatype -DTYPE = torch.bfloat16 - -class UNetWrap(nn.Module): - def __init__(self, unet): - super().__init__() - self.unet = unet - - def forward(self, sample, timestep, encoder_hidden_states, cross_attention_kwargs=None): - out_tuple 
= self.unet(sample, timestep, encoder_hidden_states, return_dict=False) - return out_tuple - -class NeuronUNet(nn.Module): - def __init__(self, unetwrap): - super().__init__() - self.unetwrap = unetwrap - self.config = unetwrap.unet.config - self.in_channels = unetwrap.unet.in_channels - self.device = unetwrap.unet.device - - def forward(self, sample, timestep, encoder_hidden_states, cross_attention_kwargs=None, return_dict=False): - sample = self.unetwrap(sample, timestep.to(dtype=DTYPE).expand((sample.shape[0],)), encoder_hidden_states)[0] - return UNet2DConditionOutput(sample=sample) - -class NeuronTextEncoder(nn.Module): - def __init__(self, text_encoder): - super().__init__() - self.neuron_text_encoder = text_encoder - self.config = text_encoder.config - self.dtype = text_encoder.dtype - self.device = text_encoder.device - - def forward(self, emb, attention_mask = None): - return [self.neuron_text_encoder(emb)['last_hidden_state']] - -def decode_latents(self, latents): - latents = latents.to(torch.float) - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clamp(0, 1) - image = image.cpu().permute(0, 2, 3, 1).float().numpy() - return image - -StableDiffusionPipeline.decode_latents = decode_latents - -# --- Load all compiled models and benchmark pipeline --- -COMPILER_WORKDIR_ROOT = 'sd2_compile_dir_512' -model_id = "stabilityai/stable-diffusion-2-1-base" -text_encoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder/model.pt') -decoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_decoder/model.pt') -unet_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'unet/model.pt') -post_quant_conv_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_post_quant_conv/model.pt') - -pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=DTYPE) -pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - -# Load the compiled UNet onto two neuron cores. 
-pipe.unet = NeuronUNet(UNetWrap(pipe.unet)) -#TODO: FIXME, dual core execution is not working on release pipeline at the moment. -device_ids = [0] -pipe.unet.unetwrap = torch_neuronx.DataParallel(torch.jit.load(unet_filename), device_ids, set_dynamic_batching=True) - -class NeuronTypeConversionWrapper(nn.Module): - def __init__(self, network): - super().__init__() - self.network = network - - def forward(self, x): - return self.network(x.float()) - -# Load other compiled models onto a single neuron core. -pipe.text_encoder = NeuronTextEncoder(pipe.text_encoder) -pipe.text_encoder.neuron_text_encoder = torch.jit.load(text_encoder_filename) -pipe.vae.decoder = NeuronTypeConversionWrapper(torch.jit.load(decoder_filename)) -pipe.vae.post_quant_conv = NeuronTypeConversionWrapper(torch.jit.load(post_quant_conv_filename)) - -prompt = "a photo of an astronaut riding a horse on mars" -n_runs = 10 -bench_result = benchmark(n_runs, "stable_diffusion_512", pipe, prompt, metric_path) diff --git a/test/integration/fused_sd_attention_small_head/sd2_512_compile.py b/test/integration/fused_sd_attention_small_head/sd2_512_compile.py deleted file mode 100644 index 4d6710f..0000000 --- a/test/integration/fused_sd_attention_small_head/sd2_512_compile.py +++ /dev/null @@ -1,319 +0,0 @@ -import os -os.environ["NEURON_FUSE_SOFTMAX"] = "1" - -import torch -import torch.nn as nn -import torch_neuronx - -import copy -from diffusers import StableDiffusionPipeline -from diffusers.models.unet_2d_condition import UNet2DConditionOutput - -from packaging import version -import diffusers -diffusers_version = version.parse(diffusers.__version__) -assert diffusers_version >= version.parse('0.18.0'), "Must use diffusers 0.18.0 or newer" - -from diffusers.models.attention_processor import Attention - -# Define datatype -DTYPE = torch.bfloat16 - -# Have to do this double wrapper trick to compile the unet, because -# of the special UNet2DConditionOutput output type. 
-class UNetWrap(nn.Module): - def __init__(self, unet): - super().__init__() - self.unet = unet - - def forward(self, sample, timestep, encoder_hidden_states, cross_attention_kwargs=None): - out_tuple = self.unet(sample, timestep, encoder_hidden_states, return_dict=False) - return out_tuple - -class NeuronUNet(nn.Module): - def __init__(self, unetwrap): - super().__init__() - self.unetwrap = unetwrap - self.config = unetwrap.unet.config - self.in_channels = unetwrap.unet.in_channels - self.device = unetwrap.unet.device - - def forward(self, sample, timestep, encoder_hidden_states, cross_attention_kwargs=None): - sample = self.unetwrap(sample, timestep.to(dtype=DTYPE).expand((sample.shape[0],)), encoder_hidden_states)[0] - return UNet2DConditionOutput(sample=sample) - -class NeuronTextEncoder(nn.Module): - def __init__(self, text_encoder): - super().__init__() - self.neuron_text_encoder = text_encoder - self.config = text_encoder.config - self.dtype = text_encoder.dtype - self.device = text_encoder.device - - def forward(self, emb, attention_mask = None): - return [self.neuron_text_encoder(emb)['last_hidden_state']] - - -# Optimized attention -def get_attention_scores(self, query, key, attn_mask): - dtype = query.dtype - - if self.upcast_attention: - query = query.float() - key = key.float() - - # Check for square matmuls - if(query.size() == key.size()): - attention_scores = custom_badbmm( - key, - query.transpose(-1, -2) - ) - - if self.upcast_softmax: - attention_scores = attention_scores.float() - - attention_probs = attention_scores.softmax(dim=1).permute(0,2,1) - attention_probs = attention_probs.to(dtype) - - else: - attention_scores = custom_badbmm( - query, - key.transpose(-1, -2) - ) - - if self.upcast_softmax: - attention_scores = attention_scores.float() - - attention_probs = attention_scores.softmax(dim=-1) - attention_probs = attention_probs.to(dtype) - - return attention_probs - -# In the original badbmm the bias is all zeros, so only apply scale -def 
custom_badbmm(a, b): - bmm = torch.bmm(a, b) - scaled = bmm * 0.125 - return scaled - - -# For saving compiler artifacts -COMPILER_WORKDIR_ROOT = 'sd2_compile_dir_512' - -# Model ID for SD version pipeline -model_id = "stabilityai/stable-diffusion-2-1-base" - -# --- Compile UNet and save --- -pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=DTYPE) - -from neuronxcc.nki.kernels.attention import fused_self_attn_for_SD_small_head_size -from torch_neuronx import nki_jit - -nki_func = nki_jit(fused_self_attn_for_SD_small_head_size) - -class AttnProcessor: - r""" - Default processor for performing attention-related computations. - """ - - def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - temb=None, - ): - residual = hidden_states - - if attn.spatial_norm is not None: - hidden_states = attn.spatial_norm(hidden_states, temb) - - input_ndim = hidden_states.ndim - - if input_ndim == 4: - batch_size, channel, height, width = hidden_states.shape - hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) - - batch_size, sequence_length, _ = ( - hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape - ) - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - - if attn.group_norm is not None: - hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) - - query = attn.to_q(hidden_states) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - query = attn.head_to_batch_dim(query) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - - if query.shape[1] == 4096 and query.shape[2] <= 128 and \ - query.shape == key.shape and query.shape == 
value.shape: - kernel = nki_func - # breakpoint() - result = torch.empty(value.shape, dtype=value.dtype, device=query.device) - # Each batch is independeant, use a launch grid of size batch size - kernel[query.shape[0]](query.transpose(1, 2), key, value, result) - hidden_states = result - else: - attention_probs = attn.get_attention_scores(query, key, attention_mask) - hidden_states = torch.bmm(attention_probs, value) - hidden_states = attn.batch_to_head_dim(hidden_states) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - if input_ndim == 4: - hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) - - if attn.residual_connection: - hidden_states = hidden_states + residual - - hidden_states = hidden_states / attn.rescale_output_factor - - return hidden_states - -# Replace original cross-attention module with custom cross-attention module for better performance -import diffusers.models.attention_processor as attn_processor -attn_processor.AttnProcessor.__call__ = AttnProcessor.__call__ - -# Apply double wrapper to deal with custom return type -pipe.unet = NeuronUNet(UNetWrap(pipe.unet)) - -# Only keep the model being compiled in RAM to minimze memory pressure -unet = copy.deepcopy(pipe.unet.unetwrap) -del pipe - -# Compile unet - FP32 -sample_1b = torch.randn([1, 4, 64, 64], dtype=DTYPE) -timestep_1b = torch.tensor(999, dtype=DTYPE).expand((1,)) -encoder_hidden_states_1b = torch.randn([1, 77, 1024], dtype=DTYPE) -example_inputs = sample_1b, timestep_1b, encoder_hidden_states_1b - -unet_neuron = torch_neuronx.trace( - unet, - example_inputs, - compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'unet'), - compiler_args=["--model-type=unet-inference", "--enable-fast-loading-neuron-binaries"] -) - -# Enable asynchronous and lazy loading to speed up model load -torch_neuronx.async_load(unet_neuron) -torch_neuronx.lazy_load(unet_neuron) - -# save compiled 
unet -unet_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'unet/model.pt') -torch.jit.save(unet_neuron, unet_filename) - -# delete unused objects -del unet -del unet_neuron - - - -# --- Compile CLIP text encoder and save --- - -# Only keep the model being compiled in RAM to minimze memory pressure -pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=DTYPE) -text_encoder = copy.deepcopy(pipe.text_encoder) -del pipe - -# Apply the wrapper to deal with custom return type -text_encoder = NeuronTextEncoder(text_encoder) - -# Compile text encoder -# This is used for indexing a lookup table in torch.nn.Embedding, -# so using random numbers may give errors (out of range). -emb = torch.tensor([[49406, 18376, 525, 7496, 49407, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0]]) -text_encoder_neuron = torch_neuronx.trace( - text_encoder.neuron_text_encoder, - emb, - compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder'), - compiler_args=["--enable-fast-loading-neuron-binaries"] - ) - -# Enable asynchronous loading to speed up model load -torch_neuronx.async_load(text_encoder_neuron) - -# Save the compiled text encoder -text_encoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder/model.pt') -torch.jit.save(text_encoder_neuron, text_encoder_filename) - -# delete unused objects -del text_encoder -del text_encoder_neuron - - - -# --- Compile VAE decoder and save --- - -# Only keep the model being compiled in RAM to minimze memory pressure -pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float32) -decoder = copy.deepcopy(pipe.vae.decoder) -del pipe - -# Compile vae decoder -decoder_in = torch.randn([1, 4, 64, 64], dtype=torch.float32) -decoder_neuron = torch_neuronx.trace( - decoder, - decoder_in, - 
compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'vae_decoder'), - compiler_args=["--enable-fast-loading-neuron-binaries"] -) - -# Enable asynchronous loading to speed up model load -torch_neuronx.async_load(decoder_neuron) - -# Save the compiled vae decoder -decoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_decoder/model.pt') -torch.jit.save(decoder_neuron, decoder_filename) - -# delete unused objects -del decoder -del decoder_neuron - - - -# --- Compile VAE post_quant_conv and save --- - -# Only keep the model being compiled in RAM to minimze memory pressure -pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float32) -post_quant_conv = copy.deepcopy(pipe.vae.post_quant_conv) -del pipe - -# # Compile vae post_quant_conv -post_quant_conv_in = torch.randn([1, 4, 64, 64], dtype=torch.float32) -post_quant_conv_neuron = torch_neuronx.trace( - post_quant_conv, - post_quant_conv_in, - compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'vae_post_quant_conv'), -) - -# Enable asynchronous loading to speed up model load -torch_neuronx.async_load(post_quant_conv_neuron) - -# # Save the compiled vae post_quant_conv -post_quant_conv_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_post_quant_conv/model.pt') -torch.jit.save(post_quant_conv_neuron, post_quant_conv_filename) - -# delete unused objects -del post_quant_conv -del post_quant_conv_neuron diff --git a/test/integration/perf_utils/LatencyCollector.py b/test/integration/perf_utils/LatencyCollector.py deleted file mode 100644 index 0fa7640..0000000 --- a/test/integration/perf_utils/LatencyCollector.py +++ /dev/null @@ -1,74 +0,0 @@ -import time -import math -import os -import json - -class LatencyCollector: - def __init__(self): - self.start = None - self.latency_list = [] - - def pre_hook(self, *args): - self.start = time.time() - - def hook(self, *args): - self.latency_list.append(time.time() - self.start) - - def percentile(self, percent): - latency_list = self.latency_list - 
pos_float = len(latency_list) * percent / 100 - max_pos = len(latency_list) - 1 - pos_floor = min(math.floor(pos_float), max_pos) - pos_ceil = min(math.ceil(pos_float), max_pos) - latency_list = sorted(latency_list) - return latency_list[pos_ceil] if pos_float - pos_floor > 0.5 else latency_list[pos_floor] - -def benchmark(n_runs, test_name, model, model_inputs, metric_path="."): - # model inputs can be tuple or dictionary - if not isinstance(model_inputs, tuple) and not isinstance(model_inputs, dict): - model_inputs = (model_inputs,) - - def run_model(): - if isinstance(model_inputs, dict): - return model(**model_inputs) - else : #tuple - return model(*model_inputs) - - warmup_run = run_model() - - latency_collector = LatencyCollector() - # can't use register_forward_pre_hook or register_forward_hook because StableDiffusionPipeline is not a torch.nn.Module - - for _ in range(n_runs): - latency_collector.pre_hook() - res = run_model() - latency_collector.hook() - - p0_latency_ms = latency_collector.percentile(0) * 1000 - p50_latency_ms = latency_collector.percentile(50) * 1000 - p90_latency_ms = latency_collector.percentile(90) * 1000 - p95_latency_ms = latency_collector.percentile(95) * 1000 - p99_latency_ms = latency_collector.percentile(99) * 1000 - p100_latency_ms = latency_collector.percentile(100) * 1000 - - report_dict = dict() - report_dict["Latency.P0"] = f'{p0_latency_ms:.1f}' - report_dict["Latency.P50"]=f'{p50_latency_ms:.1f}' - report_dict["Latency.P90"]=f'{p90_latency_ms:.1f}' - report_dict["Latency.P95"]=f'{p95_latency_ms:.1f}' - report_dict["Latency.P99"]=f'{p99_latency_ms:.1f}' - report_dict["Latency.P100"]=f'{p100_latency_ms:.1f}' - - report = f'RESULT FOR {test_name}:' - for key, value in report_dict.items(): - report += f' {key}={value}' - print(report) - - report = {"test_name": test_name, "report": report_dict} - - with open(metric_path, 'r+') as f: - cur = json.load(f) - cur.append(report) - with open(metric_path, 'w+') as f: - json.dump(cur, 
f) - return report_dict diff --git a/test/integration/perf_utils/__init__.py b/test/integration/perf_utils/__init__.py deleted file mode 100644 index 43b3f14..0000000 --- a/test/integration/perf_utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Package for integraton tests utilities \ No newline at end of file diff --git a/test/integration/resize_nearest_fixed_dma_kernel/inpainting_mask.png b/test/integration/resize_nearest_fixed_dma_kernel/inpainting_mask.png deleted file mode 100644 index 7f3c753..0000000 Binary files a/test/integration/resize_nearest_fixed_dma_kernel/inpainting_mask.png and /dev/null differ diff --git a/test/integration/resize_nearest_fixed_dma_kernel/inpainting_photo.png b/test/integration/resize_nearest_fixed_dma_kernel/inpainting_photo.png deleted file mode 100644 index e84dfc8..0000000 Binary files a/test/integration/resize_nearest_fixed_dma_kernel/inpainting_photo.png and /dev/null differ diff --git a/test/integration/resize_nearest_fixed_dma_kernel/run.sh b/test/integration/resize_nearest_fixed_dma_kernel/run.sh deleted file mode 100644 index 13e1779..0000000 --- a/test/integration/resize_nearest_fixed_dma_kernel/run.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -set -e - - -if [ "$#" -ne 1 ]; then - echo "Illegal number of parameters. 
Must call with ./run.sh " - exit 1 -fi - -metric_path=$1 - -echo ${test_name} is writing the benchmark result to ${metric_path} - -export NEURON_FUSE_SOFTMAX=1 -pip install diffusers==0.24.0 transformers==4.35.2 accelerate==0.24.1 safetensors==0.4.1 matplotlib -python3 sd2_inpainting_936_624_compile.py -python3 sd2_inpainting_936_624_benchmark.py $1 \ No newline at end of file diff --git a/test/integration/resize_nearest_fixed_dma_kernel/sd2_inpainting_936_624_benchmark.py b/test/integration/resize_nearest_fixed_dma_kernel/sd2_inpainting_936_624_benchmark.py deleted file mode 100644 index 4970f72..0000000 --- a/test/integration/resize_nearest_fixed_dma_kernel/sd2_inpainting_936_624_benchmark.py +++ /dev/null @@ -1,80 +0,0 @@ -import os - -import torch -import torch.nn as nn -import torch_neuronx -import numpy as np - -from matplotlib import pyplot as plt -from matplotlib import image as mpimg -import time -import copy -from IPython.display import clear_output - -from diffusers import StableDiffusionInpaintPipeline -from diffusers.models.unet_2d_condition import UNet2DConditionOutput -# Compatibility for diffusers<0.18.0 -from packaging import version -import diffusers -diffusers_version = version.parse(diffusers.__version__) -use_new_diffusers = diffusers_version >= version.parse('0.18.0') -if use_new_diffusers: - from diffusers.models.attention_processor import Attention -else: - from diffusers.models.cross_attention import CrossAttention - -parent_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) -sys.path.append(parent_dir) -from perf_utils.LatencyCollector import benchmark - -import sys -if len(sys.argv) != 2: - print("Usage: python sd2_inpainting_936_624_benchmark.py ") - exit(1) -metric_path = os.path.abspath(sys.argv[1]) - -from wrapper import UNetWrap, NeuronUNet, NeuronTextEncoder - -# Define datatype for UNet -DTYPE = torch.bfloat16 - -clear_output(wait=False) - -""" -Sample image is taken from: 
https://huggingface.co/stabilityai/stable-diffusion-2-inpainting -""" - -# --- Load all compiled models and run pipeline --- -COMPILER_WORKDIR_ROOT = "sd2_inpainting_neuron" -model_id = "stabilityai/stable-diffusion-2-inpainting" - -pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, torch_dtype=torch.float32) - -text_encoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder/model.pt') -unet_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'unet/model.pt') -vae_encoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_encoder/model.pt') -decoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_decoder/model.pt') -post_quant_conv_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_post_quant_conv/model.pt') - -# Load the compiled UNet onto two neuron cores. -pipe.unet = NeuronUNet(UNetWrap(pipe.unet)) -device_ids = [0,1] -pipe.unet.unetwrap = torch_neuronx.DataParallel(torch.jit.load(unet_filename), device_ids, set_dynamic_batching=False) - -# Load other compiled models onto a single neuron core. 
-pipe.text_encoder = NeuronTextEncoder(pipe.text_encoder) -pipe.text_encoder.neuron_text_encoder = torch.jit.load(text_encoder_filename) -pipe.vae.encoder = torch.jit.load(vae_encoder_filename) -pipe.vae.decoder = torch.jit.load(decoder_filename) -pipe.vae.post_quant_conv = torch.jit.load(post_quant_conv_filename) - -height, width = 624, 936 - -import PIL -base_image = PIL.Image.open('inpainting_photo.png') -mask = PIL.Image.open('inpainting_mask.png') - -prompt = 'Face of a yellow cat, high resolution, sitting on a park bench' -n_runs = 10 - -bench_result = benchmark(n_runs, "stable_diffusion_2_inpainting_936_624", pipe, {'prompt':prompt, 'image':base_image, 'mask_image':mask, 'height':height, 'width':width}, metric_path) \ No newline at end of file diff --git a/test/integration/resize_nearest_fixed_dma_kernel/sd2_inpainting_936_624_compile.py b/test/integration/resize_nearest_fixed_dma_kernel/sd2_inpainting_936_624_compile.py deleted file mode 100644 index ffef15e..0000000 --- a/test/integration/resize_nearest_fixed_dma_kernel/sd2_inpainting_936_624_compile.py +++ /dev/null @@ -1,376 +0,0 @@ -import os -from typing import Optional - -import torch -import torch.nn as nn -import torch_neuronx -import numpy as np - -from matplotlib import pyplot as plt -from matplotlib import image as mpimg -import time -import copy -from IPython.display import clear_output - -from diffusers import StableDiffusionInpaintPipeline -from diffusers.models.unet_2d_condition import UNet2DConditionOutput -# Compatibility for diffusers<0.18.0 -from packaging import version -import diffusers -diffusers_version = version.parse(diffusers.__version__) -use_new_diffusers = diffusers_version >= version.parse('0.18.0') -if use_new_diffusers: - from diffusers.models.attention_processor import Attention -else: - from diffusers.models.cross_attention import CrossAttention - -from wrapper import UNetWrap, NeuronUNet, NeuronTextEncoder - -# Define datatype for UNet -DTYPE = torch.bfloat16 - 
-clear_output(wait=False) - -""" -In the following section, we will compile parts of the Stable Diffusion pipeline for execution on Neuron. -Note that this only needs to be done once: After you have compiled and saved the model by running the following section of code, -you can reuse it any number of times without having to recompile. In particular, we will compile: - -The CLIP text encoder; -The VAE encoder; -The VAE decoder; -The UNet, and -The VAE_post_quant_conv These blocks are chosen because they represent the bulk of the compute in the pipeline, -and performance benchmarking has shown that running them on Neuron yields significant performance benefit. - -The UNet contains Upsample2D layers. -In this example, we will replace Upsample2D layer with our Custom_Upsample2D layer which is using NKI kernel (resize_nearest_fixed_dma_kernel). (Line 179) -In this custom layer, when it finds the upscaling factor is not an integer, it applies the kernel. (Line 142) -Example shape : (1, 1280, 30, 20) -> (1, 1280, 59, 38) -Please note that this is a NKI example, showing how we apply the kernel. The non-integer Upsample2D is already supported in our compiler. - -Additional Notes -- We use the optimized get_attention_scores utility function, - to replace the original get_attention_scores function in the attention_processor.Attention class -- In order to save RAM before tracing each model, - we make a deepcopy of the part of the pipeline and delete the pipeline object from the memory, - This trick allows the compile to succeed on instance types with a smaller amount of RAM. 
- - -Code details : -https://github.com/aws-neuron/aws-neuron-samples/blob/master/torch-neuronx/inference/hf_pretrained_sd2_inpainting_936_624_inference.ipynb - -""" - - -from neuronxcc.nki.kernels.vision import resize_nearest_fixed_dma_kernel -from torch_neuronx import nki_jit -from torch_xla.core import xla_model as xm -from torch import Tensor -import torch.nn.functional as F -from diffusers.utils import USE_PEFT_BACKEND -from diffusers.models.lora import LoRACompatibleConv -from diffusers.models.resnet import Upsample2D - -class Custom_Upsample2D(nn.Module): - """A 2D upsampling layer with an optional convolution. - Copied from diffusers.models.resnet.Upsample2D - """ - - def __init__( - self, - channels: int, - conv, - conv2d_0, - use_conv: bool = False, - use_conv_transpose: bool = False, - out_channels: Optional[int] = None, - name: str = "conv", - ): - super().__init__() - self.channels = channels - self.out_channels = out_channels or channels - self.use_conv = use_conv - self.use_conv_transpose = use_conv_transpose - self.name = name - self.conv = conv - self.Conv2d_0 = conv2d_0 - - def forward( - self, - hidden_states: torch.FloatTensor, - output_size: Optional[int] = None, - scale: float = 1.0, - ) -> torch.FloatTensor: - assert hidden_states.shape[1] == self.channels - - if self.use_conv_transpose: - return self.conv(hidden_states) - - # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16 - dtype = hidden_states.dtype - if dtype == torch.bfloat16: - hidden_states = hidden_states.to(torch.float32) - - if hidden_states.shape[0] >= 64: - hidden_states = hidden_states.contiguous() - - if output_size is None: - hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest") - else: - # # When the upscailng factor is not an integer, compile using resize_nearest_fixed_dma_kernel - # # Otherwise, use the default interpolate function. 
- if (output_size[0] % hidden_states.shape[2] != 0 or output_size[1] % hidden_states.shape[3] != 0): - from neuronxcc.nki.kernels.vision import resize_nearest_fixed_dma_kernel - from torch_neuronx import nki_jit - from torch_xla.core import xla_model as xm - # Compile NKI kernel - device = xm.xla_device() - hidden_states = hidden_states.to(device=device) - nki_func = nki_jit(resize_nearest_fixed_dma_kernel) - # Apply NKI kernel - hidden_states = torch.permute(hidden_states, (0, 2, 3, 1)) - result = torch.empty((hidden_states.shape[0], output_size[0], output_size[1], hidden_states.shape[3]) , dtype=torch.float32, device=device) - nki_func[hidden_states.shape[0]](hidden_states, result) - hidden_states = torch.permute(result, (0, 3, 1, 2)) - else : - hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest") - - # If the input is bfloat16, we cast back to bfloat16 - if dtype == torch.bfloat16: - hidden_states = hidden_states.to(torch.bfloat16) - - if self.use_conv: - if self.name == "conv": - if isinstance(self.conv, LoRACompatibleConv) and not USE_PEFT_BACKEND: - hidden_states = self.conv(hidden_states, scale) - else: - hidden_states = self.conv(hidden_states) - else: - if isinstance(self.Conv2d_0, LoRACompatibleConv) and not USE_PEFT_BACKEND: - hidden_states = self.Conv2d_0(hidden_states, scale) - else: - hidden_states = self.Conv2d_0(hidden_states) - - return hidden_states - -# Replace diffusers.models.resnet.Upsample2D with Custom_Upsample2D -def replace_upsampling(module): - for name, child in module.named_children(): - if isinstance(child, Upsample2D): - if child.name == "conv": - setattr(module, name, Custom_Upsample2D(channels=child.channels, conv=child.conv, conv2d_0=None, use_conv=child.use_conv, use_conv_transpose=child.use_conv_transpose, out_channels=child.out_channels, name=child.name)) - else: - setattr(module, name, Custom_Upsample2D(channels=child.channels, conv=None, conv2d_0=child.Conv2d_0, use_conv=child.use_conv, 
use_conv_transpose=child.use_conv_transpose, out_channels=child.out_channels, name=child.name)) - - else: - replace_upsampling(child) - - -# optimized attention scores -def get_attention_scores(self, query, key, attn_mask): - dtype = query.dtype - - if self.upcast_attention: - query = query.float() - key = key.float() - - # Check for square matmuls - if(query.size() == key.size()): - attention_scores = custom_badbmm( - key, - query.transpose(-1, -2) - ) - - if self.upcast_softmax: - attention_scores = attention_scores.float() - - attention_probs = torch.nn.functional.softmax(attention_scores, dim=1).permute(0,2,1) - attention_probs = attention_probs.to(dtype) - - else: - attention_scores = custom_badbmm( - query, - key.transpose(-1, -2) - ) - - if self.upcast_softmax: - attention_scores = attention_scores.float() - - attention_probs = torch.nn.functional.softmax(attention_scores, dim=-1) - attention_probs = attention_probs.to(dtype) - - return attention_probs - -def custom_badbmm(a, b): - bmm = torch.bmm(a, b) - scaled = bmm * 0.125 - return scaled - - -COMPILER_WORKDIR_ROOT = "sd2_inpainting_neuron" - -def trace_text_encoder(model_id): - pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, torch_dtype=torch.float32) - text_encoder = copy.deepcopy(pipe.text_encoder) - del pipe - # Apply the wrapper to deal with custom return type - text_encoder = NeuronTextEncoder(text_encoder) - - # Compile text encoder - # This is used for indexing a lookup table in torch.nn.Embedding, - # so using random numbers may give errors (out of range). 
- emb = torch.tensor([[49406, 18376, 525, 7496, 49407, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0]]) - text_encoder_neuron = torch_neuronx.trace( - text_encoder.neuron_text_encoder, - emb, - compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder'), - ) - - # Enable asynchronous loading to speed up model load - torch_neuronx.async_load(text_encoder_neuron) - - # Save the compiled text encoder - text_encoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder/model.pt') - torch.jit.save(text_encoder_neuron, text_encoder_filename) - - # delete unused objects - del text_encoder - del text_encoder_neuron - -def trace_vae_encoder(model_id, height, width): - # Only keep the model being compiled in RAM to minimze memory pressure - pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, torch_dtype=torch.float32) - vae_encoder = copy.deepcopy(pipe.vae.encoder) - del pipe - - sample_input = torch.randn([1, 3, height, width]) - vae_encoder_neuron = torch_neuronx.trace( - vae_encoder, - sample_input, - compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'vae_encoder'), - ) - - # Enable asynchronous loading to speed up model load - torch_neuronx.async_load(vae_encoder_neuron) - - # Save the compiled text encoder - vae_encoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_encoder/model.pt') - torch.jit.save(vae_encoder_neuron, vae_encoder_filename) - - # delete unused objects - del vae_encoder - del vae_encoder_neuron - - -def trace_vae_decoder(model_id, height, width): - # Only keep the model being compiled in RAM to minimze memory pressure - pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, torch_dtype=torch.float32) - decoder = copy.deepcopy(pipe.vae.decoder) - del pipe - - # Compile vae decoder - decoder_in = torch.randn([1, 4, height // 8, 
width // 8]) - decoder_neuron = torch_neuronx.trace( - decoder, - decoder_in, - compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'vae_decoder'), - compiler_args=["--verbose", "info"] - ) - - # Enable asynchronous loading to speed up model load - torch_neuronx.async_load(decoder_neuron) - - # Save the compiled vae decoder - decoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_decoder/model.pt') - torch.jit.save(decoder_neuron, decoder_filename) - - # delete unused objects - del decoder - del decoder_neuron - -def trace_unet(model_id, height, width): - # --- Compile UNet and save --- - DTYPE = torch.bfloat16 - pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, torch_dtype=DTYPE) - - # Replace original cross-attention module with custom cross-attention module for better performance - Attention.get_attention_scores = get_attention_scores - - # Apply double wrapper to deal with custom return type - pipe.unet = NeuronUNet(UNetWrap(pipe.unet)) - - # Only keep the model being compiled in RAM to minimze memory pressure - unet = copy.deepcopy(pipe.unet.unetwrap) - - # replace Upsample2D - replace_upsampling(unet) - - del pipe - - sample_1b = torch.randn([1, 9, height // 8, width // 8], dtype=DTYPE) - timestep_1b = torch.tensor(999, dtype=DTYPE).expand((1,)) - encoder_hidden_states_1b = torch.randn([1, 77, 1024], dtype=DTYPE) - example_inputs = sample_1b, timestep_1b, encoder_hidden_states_1b - - unet_neuron = torch_neuronx.trace( - unet, - example_inputs, - compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'unet'), - compiler_args=["--model-type=unet-inference", "--verbose=info"], - ) - - # Enable asynchronous and lazy loading to speed up model load - torch_neuronx.async_load(unet_neuron) - torch_neuronx.lazy_load(unet_neuron) - - # save compiled unet - unet_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'unet/model.pt') - torch.jit.save(unet_neuron, unet_filename) - - # delete unused objects - del unet - del unet_neuron - -def 
trace_post_quant_conv(model_id, height, width): - # Only keep the model being compiled in RAM to minimze memory pressure - pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, torch_dtype=torch.float32) - post_quant_conv = copy.deepcopy(pipe.vae.post_quant_conv) - del pipe - - # Compile vae post_quant_conv - post_quant_conv_in = torch.randn([1, 4, height // 8 , width // 8]) - post_quant_conv_neuron = torch_neuronx.trace( - post_quant_conv, - post_quant_conv_in, - compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'vae_post_quant_conv'), - compiler_args=["--verbose", "info"] - ) - - # Enable asynchronous loading to speed up model load - torch_neuronx.async_load(post_quant_conv_neuron) - - # Save the compiled vae post_quant_conv - post_quant_conv_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_post_quant_conv/model.pt') - torch.jit.save(post_quant_conv_neuron, post_quant_conv_filename) - - # delete unused objects - del post_quant_conv - del post_quant_conv_neuron - -model_id = "stabilityai/stable-diffusion-2-inpainting" -height, width = 624, 936 - -# trace the parts of the pipeline -trace_text_encoder(model_id) -trace_vae_decoder(model_id, height, width) -trace_vae_encoder(model_id, height, width) -trace_unet(model_id, height, width) -trace_post_quant_conv(model_id, height, width) diff --git a/test/integration/resize_nearest_fixed_dma_kernel/wrapper.py b/test/integration/resize_nearest_fixed_dma_kernel/wrapper.py deleted file mode 100644 index 37a22c8..0000000 --- a/test/integration/resize_nearest_fixed_dma_kernel/wrapper.py +++ /dev/null @@ -1,43 +0,0 @@ -import torch -import torch.nn as nn -from diffusers.models.unet_2d_condition import UNet2DConditionOutput - -""" -Define a double-wrapper for the UNet because of the special UNet2DConditionOutput output type and a wrapper for the text encoder. 
-These wrappers enable torch_neuronx.trace to trace the wrapped models for compilation with the Neuron compiler - -Details -https://github.com/aws-neuron/aws-neuron-samples/blob/master/torch-neuronx/inference/hf_pretrained_sd2_inpainting_936_624_inference.ipynb -""" - -class UNetWrap(nn.Module): - def __init__(self, unet): - super().__init__() - self.unet = unet - - def forward(self, sample, timestep, encoder_hidden_states, cross_attention_kwargs=None): - out_tuple = self.unet(sample, timestep, encoder_hidden_states, return_dict=False) - return out_tuple - -class NeuronUNet(nn.Module): - def __init__(self, unetwrap): - super().__init__() - self.unetwrap = unetwrap - self.config = unetwrap.unet.config - self.in_channels = unetwrap.unet.in_channels - self.device = unetwrap.unet.device - - def forward(self, sample, timestep, encoder_hidden_states, timestep_cond=None, added_cond_kwargs=None, cross_attention_kwargs=None, return_dict=False): - sample = self.unetwrap(sample.bfloat16(), timestep.bfloat16().expand((sample.shape[0],)), encoder_hidden_states.bfloat16())[0] - return UNet2DConditionOutput(sample=sample) - -class NeuronTextEncoder(nn.Module): - def __init__(self, text_encoder): - super().__init__() - self.neuron_text_encoder = text_encoder - self.config = text_encoder.config - self.dtype = text_encoder.dtype - self.device = text_encoder.device - - def forward(self, emb, attention_mask = None): - return [self.neuron_text_encoder(emb)['last_hidden_state']] \ No newline at end of file diff --git a/test/integration/run_integration.sh b/test/integration/run_integration.sh deleted file mode 100755 index 832c927..0000000 --- a/test/integration/run_integration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -export PYTHONPATH=${PYTHONPATH}:$PWD - -echo Creating benchmark result json at $PWD/test_result.json -if [ -f $PWD/test_result.json ]; then - rm $PWD/test_result.json -fi -touch $PWD/test_result.json -echo "[]" >> $PWD/test_result.json - 
-RESULT_JSON=$PWD/test_result.json - -pushd fused_sd_attention_small_head -sh run.sh ${RESULT_JSON} -popd - -pushd resize_nearest_fixed_dma_kernel -sh run.sh ${RESULT_JSON} -popd - -pushd flash_attention -sh run.sh ${RESULT_JSON} -popd - -pushd select_and_scatter_kernel -sh run.sh ${RESULT_JSON} -popd diff --git a/test/integration/select_and_scatter_kernel/common/vision_utils.py b/test/integration/select_and_scatter_kernel/common/vision_utils.py deleted file mode 100644 index 2df218c..0000000 --- a/test/integration/select_and_scatter_kernel/common/vision_utils.py +++ /dev/null @@ -1,319 +0,0 @@ -import os -import sys -import argparse -from datetime import datetime -import math -import queue -import time -import inspect -from torch.optim.lr_scheduler import _LRScheduler -from torch.utils.data.dataloader import DataLoader -from torch.utils.data.distributed import DistributedSampler -from torch.utils.tensorboard import SummaryWriter - -import torchvision -import torchvision.transforms as transforms -import torchvision.datasets as datasets - -try: - from transformers import ( - AutoModelForImageClassification, - AutoConfig - ) -except: - print("cannot import transformers") - -try: - import timm -except: - print("cannot import timm") - -SUPPORTED_PLATFORMS = ['torchvision', 'transformers', 'timm'] - -class WarmupAndExponentialDecayScheduler(_LRScheduler): - """Update the learning rate of wrapped optimizer based on epoch and step. - - Args: - optimizer: Instance of torch.optim.Optimizer. Learning rate will be changed. - num_steps_per_epoch: int, the number of steps required to finish 1 epoch. - divide_every_n_epochs: After this number of epochs, learning rate will be - divided by the `divisor` param. - divisor: The learning rate will be divided by this amount when epoch % - divide_every_n_epochs == 0 (epoch 0 is excluded). - num_warmup_epochs: Float. Learning rate will ramp up from 0 to max learning - rate over this many epochs. Note that partial epochs are allowed, e.g. 
0.5 - epochs. - min_delta_to_update_lr: If the new learning rate does not differ much from - the learning rate of the previous step, don't bother updating the - optimizer's learning rate. - """ - - def __init__(self, - optimizer, - num_steps_per_epoch, - divide_every_n_epochs=20, - divisor=5, - num_warmup_epochs=0.9, - min_delta_to_update_lr=1e-6): - self._num_steps_per_epoch = num_steps_per_epoch - self._divide_every_n_epochs = divide_every_n_epochs - self._divisor = divisor - self._num_warmup_epochs = num_warmup_epochs - self._min_delta_to_update_lr = min_delta_to_update_lr - self._previous_lr = -1 - self._max_lr = optimizer.param_groups[0]['lr'] - super(WarmupAndExponentialDecayScheduler, self).__init__(optimizer) - - def _epoch(self): - return self._step_count // self._num_steps_per_epoch - - def _is_warmup_epoch(self): - return self._epoch() < math.ceil(self._num_warmup_epochs) - - def get_lr(self): - epoch = self._epoch() - lr = 0.0 - - if self._is_warmup_epoch(): - # Ramp up learning rate from 0.0 to self._max_lr using a linear slope. - num_warmup_steps = self._num_warmup_epochs * self._num_steps_per_epoch - lr = min(self._max_lr, - self._max_lr * ((self._step_count + 1.0) / num_warmup_steps)) - else: - # Normal epoch. Use an exponential decay determined by init params. - lr = self._max_lr / ( - self._divisor**(epoch // self._divide_every_n_epochs)) - - # _LRScheduler expects a list of learning rates like this. - return [lr for _ in self.base_lrs] - - def step(self, epoch=None): - current_lr = self.get_lr()[0] - - # Outside of warmup epochs, we use the same learning rate for every step - # in an epoch. Don't bother updating learning rate if it hasn't changed. - if abs(current_lr - self._previous_lr) >= self._min_delta_to_update_lr: - super(WarmupAndExponentialDecayScheduler, self).step() - self._previous_lr = current_lr - else: - self._step_count += 1 # This normally happens in super().step(). 
- - -class Throughput: - def __init__(self, batch_size, world_size, log_steps, moving_avg_window_size=10): - self.seqs_per_iteration = batch_size * world_size - self.moving_avg_window_size = moving_avg_window_size - self.moving_avg_window = queue.Queue() - self.window_time = 0 - self.start_time = time.time() - - def get_throughput(self): - step_time = time.time() - self.start_time - self.start_time += step_time - self.window_time += step_time - self.moving_avg_window.put(step_time) - window_size = self.moving_avg_window.qsize() - if window_size > self.moving_avg_window_size: - self.window_time -= self.moving_avg_window.get() - window_size -= 1 - throughput = window_size * self.seqs_per_iteration / self.window_time - return throughput - - -class Logger: - def __init__(self, args, world_size): - xla = 'torch_xla' in sys.modules - self.train_throughputs = [] - self.test_throughputs = [] - self.summary_writer = SummaryWriter(os.path.join(args.logdir, - f"neuron_tblogs_{time.strftime('%m%d%y_%H%M')}" - f"_w{world_size}" - f"_lr{args.lr}" - f"_bs{args.batch_size}" - f"_bf16autocast{args.enable_pt_autocast}" - f"_xla{xla}")) - self.summary_writer.add_text('script', "```\n" + inspect.getsource(sys.modules[__name__]) + "\n```", 0) - - def print_training_update(self, - device, - step, - lr, - loss, - throughput, - epoch=None, - summary_writer=None): - """Prints the training metrics at a given step. - - Args: - device (torch.device): The device where these statistics came from. - step_num (int): Current step number. - loss (float): Current loss. - throughput (float): The examples/sec throughput for the current batch. - epoch (int, optional): The epoch number. - summary_writer (SummaryWriter, optional): If provided, this method will - write some of the provided statistics to Tensorboard. 
- """ - update_data = [ - 'Training', 'Device={}'.format(str(device)), - 'Epoch={}'.format(epoch) if epoch is not None else None, - 'Step={}'.format(step), 'Learning_Rate={}'.format(lr), - 'Loss={:.5f}'.format(loss), 'Throughput={:.5f}'.format(throughput), - 'Time={}'.format(datetime.now()) - ] - print('|', ' '.join(item for item in update_data if item), flush=True) - self.write_to_summary( - summary_writer, - dict_to_write={ - 'Throughput': throughput, - }) - - def print_test_update(self, device, throughput, accuracy, epoch=None, step=None): - """Prints single-core test metrics. - - Args: - device: Instance of `torch.device`. - accuracy: Float. - """ - update_data = [ - 'Test', 'Device={}'.format(str(device)), - 'Step={}'.format(step) if step is not None else None, - 'Epoch={}'.format(epoch) if epoch is not None else None, - 'Throughput={:.5f}'.format(throughput), - 'Accuracy={:.2f}'.format(accuracy) if accuracy is not None else None, - 'Time={}'.format(datetime.now()) - ] - print('|', ' '.join(item for item in update_data if item), flush=True) - - def write_to_summary(self, - global_step=None, - dict_to_write={}): - """Writes scalars to a Tensorboard SummaryWriter. - - Optionally writes XLA perf metrics. - - Args: - global_step (int, optional): The global step value for these data points. - If None, global_step will not be set for this datapoint. - dict_to_write (dict, optional): Dict where key is the scalar name and value - is the scalar value to be written to Tensorboard. 
- """ - if self.summary_writer is None: - return - for k, v in dict_to_write.items(): - self.summary_writer.add_scalar(k, v, global_step) - - -def build_train_parser(): - parser = argparse.ArgumentParser() - parser.add_argument('--model', default='resnet50', help="Image classification model.") - parser.add_argument('--platform', default='torchvision', choices=SUPPORTED_PLATFORMS, help="The Platform where the model is from (torchvision/transformers/timm).") - parser.add_argument('--pretrained', action='store_true', help="Use model from Pre-trained.") - parser.add_argument('--data_dir', type=str, default="", help="Image classification dataset directory.") - parser.add_argument('--logdir', type=str, default="log_training", help="Training log directory.") - parser.add_argument('--batch_size', type=int, default=8, help="Batch size per core used in training.") - parser.add_argument('--num_epochs', type=int, default=2, help="Number of training epochs.") - parser.add_argument('--num_workers', type=int, default=0, help="Number of worker used in data loader.") - parser.add_argument('--log_steps', type=int, default=20, help="Number of steps between each other log message.") - parser.add_argument('--max_steps', type=int, default=28125, help="Number of max training steps.") - parser.add_argument('--expected_average_throughput', type=int, default=0, help="Expected average training throughput (seq/s).") - parser.add_argument('--image_dim', type=int, default=224, help="Image dimension after transformation.") - parser.add_argument('--test_batch_size', type=int, default=8, help="Batch size per core used in testing.") - parser.add_argument('--lr', type=float, default=0.00005, help="Learning rate used in training.") - parser.add_argument('--lr_scheduler_type', type=str, default=None, choices=["WarmupAndExponentialDecayScheduler",]) - parser.add_argument('--lr_scheduler_divide_every_n_epochs', type=int, default=20) - parser.add_argument('--lr_scheduler_divisor', type=int, default=5) - 
parser.add_argument('--momentum', type=float, default=0.9, help="Momentum used in SGD optimizer") - parser.add_argument('--target_accuracy', type=float, default=0, help="Target accuracy (%).") - parser.add_argument('--drop_last', action='store_true', help="Enable deop_last in data loader.") - parser.add_argument('--fake_data', action='store_true', help="Use fake (random) data for training and testing.") - parser.add_argument('--fake_train_dataset_length', type=int, default=50000, help="Length of fake training dataset.") - parser.add_argument('--fake_test_dataset_length', type=int, default=1000, help="Length of fake testing dataset.") - parser.add_argument('--metrics_debug', action='store_true', help="Print debug metrics at the end of each epoch.") - parser.add_argument('--enable_pt_autocast', action='store_true', help="Enable Auto-cast to BF16 in GPU.") - parser.add_argument('--do_eval', action='store_true', help="Evaluate the model with eval dataset after training.") - - return parser - -def get_model(platform, model, pretrained): - if platform == "torchvision": - default_model_property = { - 'model_fn': getattr(torchvision.models, model) - } - model_properties = { - 'inception_v3': { - 'model_fn': lambda: torchvision.models.inception_v3(aux_logits=False) - }, - } - return model_properties.get(model, default_model_property)['model_fn'](pretrained=pretrained) - elif platform == "transformers": - if pretrained: - return AutoModelForImageClassification.from_pretrained(model) - else: - config = AutoConfig.from_pretrained(model) - return AutoModelForImageClassification.from_config(config) - elif platform == "timm": - return timm.create_model(model, pretrained=pretrained) - else: - raise ValueError('Unsupported Platform.') - -def get_data_transforms(img_dim): - resize_dim = max(img_dim, 256) - normalize = transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - train_transform = transforms.Compose([ - transforms.RandomResizedCrop(img_dim), - 
transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ]) - # Matches Torchvision's eval transforms except Torchvision uses size - # 256 resize for all models both here and in the train loader. Their - # version crashes during training on 299x299 images, e.g. inception. - test_transform = transforms.Compose([ - transforms.Resize(resize_dim), - transforms.CenterCrop(img_dim), - transforms.ToTensor(), - normalize, - ]) - - return train_transform, test_transform - -def create_data_loaders(train_dataset, - test_dataset, - rank, - world_size, - train_batch_size, - test_batch_size, - num_workers, - drop_last=False): - train_sampler, test_sampler = None, None - if world_size > 1: - train_sampler = DistributedSampler( - train_dataset, - num_replicas=world_size, - rank=rank, - shuffle=True) - test_sampler = DistributedSampler( - test_dataset, - num_replicas=world_size, - rank=rank, - shuffle=False) - - train_loader = DataLoader( - train_dataset, - batch_size=train_batch_size, - shuffle=False if train_sampler else True, - sampler=train_sampler, - drop_last=drop_last, - num_workers=num_workers, - pin_memory=True) - test_loader = DataLoader( - test_dataset, - batch_size=test_batch_size, - shuffle=False, - sampler=test_sampler, - drop_last=drop_last, - num_workers=num_workers, - pin_memory=True) - - return train_loader, test_loader \ No newline at end of file diff --git a/test/integration/select_and_scatter_kernel/insert_select_and_scatter_kernel.py b/test/integration/select_and_scatter_kernel/insert_select_and_scatter_kernel.py deleted file mode 100644 index acb89d6..0000000 --- a/test/integration/select_and_scatter_kernel/insert_select_and_scatter_kernel.py +++ /dev/null @@ -1,36 +0,0 @@ -import torch -from torch import nn -from torch import autograd -from neuronxcc.nki.kernels.vision import select_and_scatter_kernel - -import torch_xla.core.xla_model as xm -from torch_neuronx.xla_impl.ops import nki_jit - -select_and_scatter_func = 
nki_jit()(select_and_scatter_kernel) - -class ModuleWrapper(nn.Module): - def __init__(self, func): - super().__init__() - self.func = func - - def forward(self, x): - return self.func(x) - - -class NeuronMaxPool2d(autograd.function.Function): - @staticmethod - def forward(ctx, input): - ctx.save_for_backward(input) - return nn.MaxPool2d(kernel_size=3, stride=2, padding=(1, 1))(input) - - @staticmethod - def backward(ctx, grad_output): - input, = ctx.saved_tensors - output = torch.zeros_like(input) - select_and_scatter_func(input, grad_output, output) - return output - - -def replace_maxpool(model): - model.maxpool = ModuleWrapper(NeuronMaxPool2d.apply) - return model \ No newline at end of file diff --git a/test/integration/select_and_scatter_kernel/resnet50.ipynb b/test/integration/select_and_scatter_kernel/resnet50.ipynb deleted file mode 100644 index 54d6bb8..0000000 --- a/test/integration/select_and_scatter_kernel/resnet50.ipynb +++ /dev/null @@ -1,179 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# ResNet50 training - Pytorch [Beta PyTorch 2.1] \n", - "This notebook shows how to fine-tune a pretrained ResNet50 Pytorch model with AWS Trainium (trn1 instances) using NeuronSDK.\n", - "The original implementation is provided by torchvision.\n", - "\n", - "The example has 3 stages:\n", - "1. In insert_select_and_scatter_kernel, we replace the backward implementation of maxpool to a select-and-scatter nki kernel.\n", - "2. Use `neuron_parallel_compile` to compile the model.\n", - "3. Run the fine-tuning script to train the model based on image classification task. The training job will use 2 workers with data parallel to speed up the training.\n", - "\n", - "This script is intended to run on trn1.2xlarge. 
For full training throughput, please refer to the sample script that runs on trn1.32xlarge at: https://github.com/aws-neuron/aws-neuron-samples/blob/master/torch-neuronx/training/resnet50/resnet50.ipynb\n", - "\n", - "**Reference:** \n", - "\n", - "https://pytorch.org/vision/main/models/generated/torchvision.models.resnet50.html" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1) Set the parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# use --model-type=cnn-training to gain the best performance\n", - "env_var_options = \"NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=2 \" + \\\n", - " \"NEURON_CC_FLAGS=\\'--cache_dir=./compiler_cache --model-type=cnn-training\\'\"\n", - "num_workers = 2\n", - "learning_rate = 0.001\n", - "dataloader_num_workers = 2\n", - "device_prefetch_size = 2\n", - "host_to_device_transfer_threads = 4\n", - "num_epochs = 10\n", - "\n", - "import sys\n", - "import os\n", - "if len(sys.argv) != 2:\n", - " exit(1)\n", - "metric_path = os.path.abspath(sys.argv[1])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model_name = \"resnet50\"\n", - "batch_size = 16" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2) Download CIFAR10 dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!wget -N https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz .\n", - "!tar xfvz cifar-10-python.tar.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3) Compile the model with neuron_parallel_compile" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "import subprocess\n", - "\n", - "print(\"Compile model\")\n", - "# set epochs to 2 to reduce the time for tracing training graphs\n", - 
"COMPILE_CMD = f\"\"\"\n", - " {env_var_options} neuron_parallel_compile torchrun --nproc_per_node={num_workers}\n", - " run_image_classification.py\n", - " --model {model_name}\n", - " --platform torchvision\n", - " --pretrained\n", - " --num_epochs 2\n", - " --batch_size {batch_size}\n", - " --pretrained\n", - " --lr {learning_rate}\n", - " --drop_last\n", - " \"\"\".replace('\\n', '')\n", - "\n", - "print(f'Running command: \\n{COMPILE_CMD}')\n", - "if subprocess.check_call(COMPILE_CMD,shell=True):\n", - " print(\"There was an error with the compilation command\")\n", - "else:\n", - " print(\"Compilation Success!!!\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4) Compile and Fine-tune the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "print(\"Train model\")\n", - "RUN_CMD = f\"\"\"\n", - " {env_var_options} torchrun --nproc_per_node={num_workers}\n", - " run_image_classification.py\n", - " --model {model_name}\n", - " --platform torchvision\n", - " --pretrained\n", - " --num_epochs {num_epochs}\n", - " --batch_size {batch_size}\n", - " --pretrained\n", - " --lr {learning_rate}\n", - " --do_eval\n", - " --drop_last\n", - " \"\"\".replace('\\n', '')\n", - "\n", - "print(f'Running command: \\n{RUN_CMD}')\n", - "if subprocess.check_call(RUN_CMD,shell=True):\n", - " print(\"There was an error with the fine-tune command\")\n", - "else:\n", - " print(\"Fine-tune Successful!!!\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python (torch-neuronx)", - "language": "python", - "name": "aws_neuron_venv_pytorch" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, 
- "nbformat_minor": 2 -} diff --git a/test/integration/select_and_scatter_kernel/run.sh b/test/integration/select_and_scatter_kernel/run.sh deleted file mode 100644 index b379c08..0000000 --- a/test/integration/select_and_scatter_kernel/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -set -e - - -if [ "$#" -ne 1 ]; then - echo "Illegal number of parameters. Must call with ./run.sh " - exit 1 -fi - -metric_path=$1 - -echo ${test_name} is writing the benchmark result to ${metric_path} - -ipython resnet50.ipynb ${metric_path} \ No newline at end of file diff --git a/test/integration/select_and_scatter_kernel/run_image_classification.py b/test/integration/select_and_scatter_kernel/run_image_classification.py deleted file mode 100644 index bf9df54..0000000 --- a/test/integration/select_and_scatter_kernel/run_image_classification.py +++ /dev/null @@ -1,215 +0,0 @@ -import os -import sys -import time -from datetime import datetime -import numpy as np -import traceback -import json -import torch -import torch.distributed as dist -import torch_xla.distributed.xla_backend -import torch.nn as nn -import torch.optim as optim -import torchvision.datasets as datasets -import torch_xla -import torch_xla.debug.metrics as met -import torch_xla.distributed.parallel_loader as pl -import torch_xla.utils.utils as xu -import torch_xla.core.xla_model as xm -import torch_xla.distributed.xla_multiprocessing as xmp - -if not '..' 
in sys.path: sys.path.append('..') -import common.vision_utils as vision_utils -from insert_select_and_scatter_kernel import replace_maxpool - -def train(): - print('==> Preparing data..') - is_root = xm.is_master_ordinal(local=False) - train_transform, test_transform = vision_utils.get_data_transforms(FLAGS.image_dim) - if FLAGS.fake_data: - train_dataset_len = FLAGS.fake_train_dataset_length - test_dataset_len = FLAGS.fake_test_dataset_length - train_dataset = datasets.FakeData( - size=train_dataset_len, - image_size=(3, FLAGS.image_dim, FLAGS.image_dim), - num_classes=1000, - transform=train_transform) - test_dataset = datasets.FakeData(size=test_dataset_len, - image_size=(3, FLAGS.image_dim, FLAGS.image_dim), - num_classes=1000, - transform=test_transform) - elif FLAGS.data_dir: - train_dataset = datasets.ImageFolder( - os.path.join(FLAGS.data_dir, 'train'), - train_transform) - train_dataset_len = len(train_dataset.imgs) - test_dataset = datasets.ImageFolder( - os.path.join(FLAGS.data_dir, 'test'), - test_transform) - else: - # use cifar10 by default - train_transform, test_transform = vision_utils.get_data_transforms(FLAGS.image_dim) - train_dataset = datasets.CIFAR10( - os.path.join("./"), train=True, transform=train_transform, download=True - ) - test_dataset = datasets.CIFAR10( - os.path.join("./"), train=False, transform=test_transform, download=True - ) - - train_loader, test_loader = vision_utils.create_data_loaders( - train_dataset, - test_dataset, - xm.get_ordinal(), - xm.xrt_world_size(), - FLAGS.batch_size, - FLAGS.test_batch_size, - FLAGS.num_workers, - FLAGS.drop_last) - - torch.manual_seed(42) - - device = xm.xla_device() - model = vision_utils.get_model(FLAGS.platform, FLAGS.model, FLAGS.pretrained).to(device) - model = replace_maxpool(model) - - writer = None - if xm.is_master_ordinal(): - logger = vision_utils.Logger(FLAGS, xm.xrt_world_size()) - optimizer = optim.SGD( - model.parameters(), - lr=FLAGS.lr, - momentum=FLAGS.momentum, - 
weight_decay=1e-4) - lr_scheduler = None - if FLAGS.lr_scheduler_type == "WarmupAndExponentialDecayScheduler": - num_training_steps_per_epoch = train_dataset_len // ( - FLAGS.batch_size * xm.xrt_world_size()) - lr_scheduler = vision_utils.WarmupAndExponentialDecayScheduler( - optimizer, - num_training_steps_per_epoch, - divide_every_n_epochs=getattr(FLAGS, 'lr_scheduler_divide_every_n_epochs', None), - divisor=getattr(FLAGS, 'lr_scheduler_divisor', None)) - loss_fn = nn.CrossEntropyLoss() - - if is_root: - throughput = vision_utils.Throughput(FLAGS.batch_size, xm.xrt_world_size(), FLAGS.log_steps) - print('--------TRAINING CONFIG----------') - print(FLAGS) - print('---------------------------------') - train_start = time.time() - - def train_loop_fn(loader, epoch, global_step): - model.train() - for step, (data, target) in enumerate(loader): - optimizer.zero_grad() - output = model(data) - logits = output if isinstance(output, torch.Tensor) else output.logits - loss = loss_fn(logits, target) - loss.backward() - xm.optimizer_step(optimizer) - if lr_scheduler: - lr_scheduler.step() - global_step += 1 - if is_root: - step_throughput = throughput.get_throughput() - logger.train_throughputs.append(step_throughput) - if step % FLAGS.log_steps == 0: - logger.print_training_update( - device, - step, - FLAGS.lr if not lr_scheduler else lr_scheduler.optimizer.param_groups[0]['lr'], - loss.item(), - step_throughput, - epoch, - writer) - if global_step >= FLAGS.max_steps: - xm.mark_step() - break - return global_step, loss - - def test_loop_fn(loader, epoch): - total_samples, correct = 0, 0 - model.eval() - with torch.no_grad(): - for step, (data, target) in enumerate(loader): - output = model(data) - logits = output if isinstance(output, torch.Tensor) else output.logits - pred = logits.max(1, keepdim=True)[1] - correct += pred.eq(target.view_as(pred)).sum() - total_samples += data.size()[0] - if is_root: - step_throughput = throughput.get_throughput() - 
logger.test_throughputs.append(step_throughput) - if step % FLAGS.log_steps == 0: - logger.print_test_update(device, step_throughput, None, epoch, step) - accuracy = 100.0 * correct.item() / total_samples - accuracy = xm.mesh_reduce('test_accuracy', accuracy, np.mean) - return accuracy - - train_device_loader = pl.MpDeviceLoader(train_loader, device) - test_device_loader = pl.MpDeviceLoader(test_loader, device) - accuracy, max_accuracy = 0.0, 0.0 - global_step = 0 - for epoch in range(1, FLAGS.num_epochs + 1): - xm.master_print('Epoch {} train begin {}'.format(epoch, datetime.now())) - global_step, loss = train_loop_fn(train_device_loader, epoch, global_step) - xm.master_print('Epoch {} train end {}'.format(epoch, datetime.now())) - if FLAGS.metrics_debug: - xm.master_print(met.metrics_report()) - if is_root: - average_train_throughput = round(sum(logger.train_throughputs)/len(logger.train_throughputs), 4) - xm.master_print('Average train throughput: {:.4f}'.format(average_train_throughput)) - xm.master_print('Max train throughput: {:.4f}'.format(max(logger.train_throughputs))) - if global_step >= FLAGS.max_steps: - break - - if is_root: - time_to_train = time.time() - train_start - - if FLAGS.do_eval: - if is_root: - throughput = vision_utils.Throughput(FLAGS.batch_size, xm.xrt_world_size(), FLAGS.log_steps) - accuracy = test_loop_fn(test_device_loader, epoch) - xm.master_print('Epoch {} test end {}, Accuracy={:.2f}'.format( - epoch, datetime.now(), accuracy)) - max_accuracy = max(accuracy, max_accuracy) - if is_root: - logger.write_to_summary( - epoch, - dict_to_write={'Accuracy/test': accuracy}) - average_test_throughput = round(sum(logger.test_throughputs)/len(logger.test_throughputs), 4) - xm.master_print('Average test throughput: {:.4f}'.format(average_test_throughput)) - xm.master_print('Max test throughput: {:.4f}'.format(max(logger.test_throughputs))) - xm.master_print('Max Accuracy: {:.2f}%'.format(max_accuracy)) - - report_dict = dict() - 
report_dict["Throughput"] = f'{average_train_throughput:.1f}' - report_dict["Accuracy"]=f'{max_accuracy:.1f}' - - report = {"test_name": f'{FLAGS.model}_training', "report": report_dict} - - metric_path = FLAGS.metric_path - with open(metric_path, 'r+') as f: - cur = json.load(f) - cur.append(report) - with open(metric_path, 'w+') as f: - json.dump(cur, f) - return report_dict - - -def _mp_fn(index, flags): - global FLAGS - FLAGS = flags - torch.set_default_tensor_type('torch.FloatTensor') - train() - xm.rendezvous("_mp_fn finished") - -if __name__ == '__main__': - parser = vision_utils.build_train_parser() - args = parser.parse_args(sys.argv[1:]) - - if os.environ.get("WORLD_SIZE"): - dist.init_process_group('xla') - _mp_fn(0, args) - else: - xmp.spawn(_mp_fn, args=(args,)) \ No newline at end of file diff --git a/test/unit/README.md b/test/unit/README.md deleted file mode 100644 index 55dc937..0000000 --- a/test/unit/README.md +++ /dev/null @@ -1,7 +0,0 @@ -Tests under this folder are unit tests for the kernels in `src/nki_samples`. - -To execute the tests, we need to include `src/nki_samples` in the `PYTHONPATH`. 
- -For example, - -PYTHONPATH=$PYTHONPATH:/home/ubuntu/nki-samples/src/ pytest test_flash_attn_fwd.py \ No newline at end of file diff --git a/test/unit/conftest.py b/test/unit/conftest.py deleted file mode 100644 index cd663ae..0000000 --- a/test/unit/conftest.py +++ /dev/null @@ -1,28 +0,0 @@ -import pytest - -def pytest_addoption(parser): - parser.addoption( - "--simulation-only", action="store_true", default=False, help="Run simulation only, it will run test with `simulation` marker in simulation mode" - ) - -def pytest_configure(config): - config.addinivalue_line( - "markers", "simulation: mark simulation test that can be executed without a NeuronDevice" - ) - -@pytest.fixture -def simulation_only(request): - return request.config.getoption("--simulation-only") - -def pytest_collection_modifyitems(session, config, items): - if config.getoption("--simulation-only"): - # Only run cases with `simulation marker` - result = [] - for item in items: - for marker in item.iter_markers(): - if marker.name == 'simulation': - result.append(item) - break - items.clear() - items.extend(result) - \ No newline at end of file diff --git a/test/unit/test_SD_attention_small_head.py b/test/unit/test_SD_attention_small_head.py deleted file mode 100644 index 36d66a0..0000000 --- a/test/unit/test_SD_attention_small_head.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -Copyright (c) 2023, Amazon.com. 
All Rights Reserved -""" -import os -import pytest -from nki_samples.reference.attention import fused_self_attn_for_SD_small_head_size -from neuronxcc.nki import benchmark, baremetal, simulate_kernel -import neuronxcc.nki.language as nl -import numpy as np -from scipy.special import softmax - -test_trace_file_path='local_trace.ntff' -bench_func = benchmark(warmup=5, iters=20, save_trace_name=test_trace_file_path)(fused_self_attn_for_SD_small_head_size) - -def cpu_golden_attn(q, k, v): - softmax_scale = 0.125 - q_scaled = q * softmax_scale - raw_score = np.matmul(q_scaled.transpose(0, 2, 1), k.transpose(0, 2, 1)) - - norm_score = softmax(raw_score, axis=-1) - - # Transpose the result so it has the same layout as ours - return np.matmul(norm_score, v) - -class TestAttention: - - @pytest.mark.parametrize("bs,seqlen,d,dtype,latency", [ - [1, 4096, 128, np.float32, 675], - [1, 4096, 128, nl.bfloat16, 536], - [1, 4096, 64, nl.float16, 600] - ]) - def test_attention_for_SD_perf(self, bs, seqlen, d, dtype, latency): - q = np.random.random_sample((bs, d, seqlen)).astype(np.float32) - k = np.random.random_sample((bs, seqlen, d)).astype(np.float32) - v = np.random.random_sample((bs, seqlen, d)).astype(np.float32) - - q_dev = nl.static_cast(q, dtype) - k_dev = nl.static_cast(k, dtype) - v_dev = nl.static_cast(v, dtype) - - bench_func_ = bench_func[bs] - bench_func_(q_dev, k_dev, v_dev) - latency_res = bench_func_.benchmark_result.nc_latency - p1 = latency_res.get_latency_percentile(1) - assert os.path.getsize(test_trace_file_path) > 0 - - @pytest.mark.simulation - @pytest.mark.parametrize("bs,seqlen,d,dtype", [ - [1, 4096, 128, np.float32], - [1, 4096, 128, nl.bfloat16] - ]) - def test_attention_for_SD_numberic(self, simulation_only, bs, seqlen, d, dtype): - q = np.random.random_sample((bs, d, seqlen)).astype(np.float32) - k = np.random.random_sample((bs, seqlen, d)).astype(np.float32) - v = np.random.random_sample((bs, seqlen, d)).astype(np.float32) - - q_dev = 
nl.static_cast(q, dtype) - k_dev = nl.static_cast(k, dtype) - v_dev = nl.static_cast(v, dtype) - - numeric_func = baremetal(fused_self_attn_for_SD_small_head_size) - if simulation_only: - out = simulate_kernel(numeric_func[bs], q_dev, k_dev, v_dev) - else: - out = numeric_func[bs](q_dev, k_dev, v_dev) - out = nl.static_cast(out, np.float32) - golden_result = cpu_golden_attn(q, k, v) - assert np.allclose(out, golden_result, atol=1e-2) diff --git a/test/unit/test_adaptive_avg_pool2d.py b/test/unit/test_adaptive_avg_pool2d.py deleted file mode 100644 index f4fff4e..0000000 --- a/test/unit/test_adaptive_avg_pool2d.py +++ /dev/null @@ -1,123 +0,0 @@ -""" -Copyright (c) 2025, Amazon.com. All Rights Reserved - -Unit tests for adaptive average pooling 2D kernel -""" -import pytest -import numpy as np - -from nki_samples.reference.vision import adaptive_avg_pool2d_kernel -from neuronxcc.nki import benchmark, baremetal, simulate_kernel -import neuronxcc.nki.language as nl - - -bench_func = benchmark(warmup=5, iters=10)(adaptive_avg_pool2d_kernel) - -def cpu_golden_result_numpy(input_tensor, output_size): - """ - Pure NumPy reference implementation for adaptive average pooling - """ - N, C, H, W = input_tensor.shape - - if isinstance(output_size, int): - OH = OW = output_size - else: - OH, OW = output_size - - output = np.zeros((N, C, OH, OW), dtype=input_tensor.dtype) - - for n in range(N): - for c in range(C): - for oh in range(OH): - for ow in range(OW): - # Calculate the input region for this output position - h_start = (oh * H) // OH - h_end = ((oh + 1) * H + OH - 1) // OH - w_start = (ow * W) // OW - w_end = ((ow + 1) * W + OW - 1) // OW - - # Compute average over the region - region = input_tensor[n, c, h_start:h_end, w_start:w_end] - output[n, c, oh, ow] = np.mean(region) - - return output - - -class TestAdaptiveAvgPool2D: - - @pytest.mark.parametrize("N, C, H, W, output_size, dtype, latency", [ - # Test cases with various input/output sizes - [1, 64, 56, 56, 7, 
np.float32, 65 * 1.05], - [2, 128, 28, 28, 7, np.float32, 109 * 1.05], - [1, 256, 14, 14, 7, nl.float16, 88 * 1.05], - [4, 64, 56, 56, (7, 7), np.float32, 152 * 1.05], - [2, 128, 28, 28, (14, 14), np.float32, 327 * 1.05], - [1, 512, 7, 7, 1, np.float32, 14 * 1.05], # Global average pooling - [8, 64, 32, 32, (4, 4), nl.float16, 103 * 1.05], - [2, 2048, 32, 32, (3, 3), np.float32, 564 * 1.05], - [2, 64, 256, 256, 1, np.float32, 255 * 1.05], - ]) - def test_adaptive_avg_pool2d_perf(self, N, C, H, W, output_size, dtype, latency): - """Performance test for adaptive average pooling 2D""" - # Generate random input - input_tensor = np.random.randn(N, C, H, W).astype(np.float32) - - # Cast to target dtype - input_dev = nl.static_cast(input_tensor, dtype) - - # Run benchmark - bench_func(input_dev, output_size) - latency_res = bench_func.benchmark_result.nc_latency - p99 = latency_res.get_latency_percentile(99) - - @pytest.mark.simulation - @pytest.mark.parametrize("N, C, H, W, output_size, dtype", [ - # Numerical accuracy test cases - [1, 64, 56, 56, 7, np.float32], - [2, 128, 28, 28, 7, np.float32], - [1, 256, 14, 14, 7, nl.float16], - [1, 256, 14, 14, 7, nl.bfloat16], - [4, 64, 56, 56, (7, 7), np.float32], - [2, 128, 28, 28, (14, 14), np.float32], - [1, 512, 7, 7, 1, np.float32], # Global average pooling - [1, 64, 32, 32, (8, 8), nl.float16], - [3, 96, 24, 24, (6, 6), nl.bfloat16], - [2, 192, 16, 16, (4, 2), np.float32], # Non-square output - [2, 2048, 32, 32, (3, 3), np.float32], - [1, 1, 1, 1, 1, np.float32], # Single element - [1, 128, 7, 7, 7, np.float32], # Same input and output size - [2, 64, 256, 256, 1, np.float32], # Global pooling on large input - [1, 256, 13, 13, 6, np.float32], # Non-divisible dimensions - [4, 512, 7, 7, (1, 1), np.float32], # Tuple output for global pooling - ]) - def test_adaptive_avg_pool2d_numerical(self, simulation_only, N, C, H, W, output_size, dtype): - """Numerical accuracy test for adaptive average pooling 2D""" - # Generate random 
input - input_tensor = np.random.randn(N, C, H, W).astype(np.float32) - - # Cast to target dtype - input_dev = nl.static_cast(input_tensor, dtype) - - # Run kernel - numeric_func = baremetal(adaptive_avg_pool2d_kernel) - if simulation_only: - output_tensor = simulate_kernel(numeric_func, input_dev, output_size) - else: - output_tensor = numeric_func(input_dev, output_size) - - # Cast output back to float32 for comparison - output_tensor = nl.static_cast(output_tensor, np.float32) - - # Compute golden result using NumPy implementation - golden_result = cpu_golden_result_numpy(input_tensor, output_size) - - # Compare results with appropriate tolerance - if dtype in [nl.float16, nl.bfloat16]: - atol = 1e-2 - rtol = 1e-2 - else: - atol = 1e-5 - rtol = 1e-5 - - assert np.allclose(output_tensor, golden_result, atol=atol, rtol=rtol), \ - f"Output mismatch. Max diff: {np.max(np.abs(output_tensor - golden_result))}" diff --git a/test/unit/test_allocated_SD_attention_small_head.py b/test/unit/test_allocated_SD_attention_small_head.py deleted file mode 100644 index f4318d4..0000000 --- a/test/unit/test_allocated_SD_attention_small_head.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -Copyright (c) 2023, Amazon.com. 
All Rights Reserved -""" -import os -import pytest -from nki_samples.reference.allocated_attention import allocated_fused_self_attn_for_SD_small_head_size -from neuronxcc.nki import benchmark, baremetal, simulate_kernel -import neuronxcc.nki as nki -import neuronxcc.nki.language as nl -import numpy as np -from scipy.special import softmax - -test_trace_file_path='local_trace.ntff' - -bench_func = benchmark(warmup=20, iters=200, save_trace_name=test_trace_file_path)(allocated_fused_self_attn_for_SD_small_head_size) - -def cpu_golden_attn(q, k, v): - softmax_scale = 0.125 - q_scaled = q * softmax_scale - raw_score = np.matmul(q_scaled.transpose(0, 2, 1), k) - - norm_score = softmax(raw_score, axis=-1) - - # Transpose the result so it has the same layout as ours - return np.matmul(norm_score, v).transpose(0, 2, 1) - -class TestAttention: - - @pytest.mark.parametrize("bs,seqlen,d,dtype,latency", [ - [1, 4096, 128, np.float32, 410], - [1, 4096, 128, nl.bfloat16, 380], - [1, 5120, 128, nl.bfloat16, 586] - ]) - def test_allocated_attention_for_SD_perf(self, bs, seqlen, d, dtype, latency): - q = np.random.random_sample((bs, d, seqlen)).astype(np.float32) - k = np.random.random_sample((bs, d, seqlen)).astype(np.float32) - v = np.random.random_sample((bs, seqlen, d)).astype(np.float32) - - q_dev = nl.static_cast(q, dtype) - k_dev = nl.static_cast(k, dtype) - v_dev = nl.static_cast(v, dtype) - - bench_func_ = bench_func[bs] - bench_func_(q_dev, k_dev, v_dev) - latency_res = bench_func_.benchmark_result.nc_latency - p50 = latency_res.get_latency_percentile(50) - assert os.path.getsize(test_trace_file_path) > 0 - - @pytest.mark.simulation - @pytest.mark.parametrize("bs,seqlen,d,dtype", [ - [1, 4096, 128, np.float32], - [1, 4096, 128, nl.bfloat16], - [1, 5120, 128, nl.bfloat16] - ]) - def test_allocated_attention_for_SD_numberic(self, simulation_only, bs, seqlen, d, dtype): - q = np.random.random_sample((bs, d, seqlen)).astype(np.float32) - k = np.random.random_sample((bs, d, 
seqlen)).astype(np.float32) - v = np.random.random_sample((bs, seqlen, d)).astype(np.float32) - - q_dev = nl.static_cast(q, dtype) - k_dev = nl.static_cast(k, dtype) - v_dev = nl.static_cast(v, dtype) - - numeric_func = baremetal(allocated_fused_self_attn_for_SD_small_head_size) - if simulation_only: - out = simulate_kernel(numeric_func[bs], q_dev, k_dev, v_dev) - else: - out = numeric_func[bs](q_dev, k_dev, v_dev) - out = nl.static_cast(out, np.float32) - golden_result = cpu_golden_attn(q, k, v) - assert np.allclose(out, golden_result, atol=1e-2) diff --git a/test/unit/test_double_row_matmul.py b/test/unit/test_double_row_matmul.py deleted file mode 100644 index 9e49ecb..0000000 --- a/test/unit/test_double_row_matmul.py +++ /dev/null @@ -1,142 +0,0 @@ -""" -Copyright (c) 2025, Amazon.com. All Rights Reserved -""" -import pytest -from nki_samples.reference.double_row_matmul import quantized_double_row_matmul -from neuronxcc.nki import benchmark, baremetal, simulate_kernel -import neuronxcc.nki.language as nl -import numpy as np - -def get_target_string(): - """ returns instance type, e.g. trn1, inf2, trn2. """ - fpath = '/sys/devices/virtual/dmi/id/product_name' - try: - with open(fpath, 'r') as f: - fc = f.readline() - except IOError: - warnings.warn('Unable to read MLA target.') - return "" - - instance_type = fc.split('.')[0] - return instance_type - -bench_func = benchmark(warmup=5, iters=10)(quantized_double_row_matmul) - -def reshape(matrix): - """ - Interleaves every [128,512] tiles from every 2 tile rows. - - A [K,N] matrix is reshaped into [K//2, 2*N] where K must be divisible by 128 and - N must be divisible by 512. - - E.g. 
if Tij is the (i,j)-th tile and assuming a matrix with 4x4 [128,512] tiles, - the reshaped matrix looks as follows - - # T11 T12 T13 T14 - # T21 T22 T23 T24 reshape T11 T21 T12 T22 T13 T23 T14 T24 - # T31 T32 T33 T34 --------> T21 T41 T22 T42 T23 T43 T24 T44 - # T41 T42 T43 T44 - """ - K, N = matrix.shape - - TILE_K = 128 - TILE_N = 512 - - assert K % TILE_K == 0 - assert N % TILE_N == 0 - - result = np.zeros((K // 2, 2 * N)) - - for k in range(0, K // TILE_K, 2): - for n in range(N // TILE_N): - # Get 2 tiles in the same tile column and consecutive tile rows. - tile1 = matrix[k * TILE_K:(k + 1) * TILE_K, n * TILE_N:(n + 1) * TILE_N] - tile2 = matrix[(k + 1) * TILE_K:(k + 2) * TILE_K, n * TILE_N:(n + 1) * TILE_N] - - result[(k // 2) * TILE_K:(k // 2 + 1) * TILE_K, n * TILE_N * 2:n * TILE_N * 2 + TILE_N] = tile1 - result[(k//2) * TILE_K:(k // 2 + 1) * TILE_K, n * TILE_N * 2 + TILE_N:(n + 1) * TILE_N * 2] = tile2 - - # Place the 2 tiles in the same tile row side by side. - result[(k // 2) * TILE_K:(k // 2 + 1) * TILE_K, n * TILE_N * 2:n * TILE_N * 2+TILE_N] = tile1 - result[(k // 2) * TILE_K:(k // 2 + 1) * TILE_K, n * TILE_N * 2 + TILE_N:n * TILE_N * 2 + TILE_N + TILE_N] = tile2 - - return result - -def column_wise_quantize(matrix): - """ - Quantizes a matrix. - - Returns a column-wise scale broadcasted to (128, matrix.shape[1]) and the quantized matrix. 
- """ - FP8_RANGE = 240 - column_wise_max = np.max(np.abs(matrix), axis=0, keepdims=True) - column_wise_scale = column_wise_max / FP8_RANGE - - matrix_quantized = matrix / column_wise_scale - column_wise_scale = np.broadcast_to(column_wise_scale, (128, matrix.shape[1])) - - return column_wise_scale, matrix_quantized - -class TestDoubleRowMatmul: - - @pytest.mark.parametrize("M, K, N, dtype, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K, max_p99_latency", [ - [512, 16 * 1024, 1024, nl.bfloat16, 2, 2, 16, 320], - ]) - def test_double_row_matmul_perf(self, M, K, N, dtype, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K, max_p99_latency): - if (get_target_string() != "trn2"): - return - # Initializing random inputs - lhs = np.random.rand(M, K) - rhs = np.random.rand(K, N) - - # Quantizing rhs - rhs_scale, rhs_quantized = column_wise_quantize(rhs) - rhs_quantized_reshaped = reshape(rhs_quantized) - - # Casting to the correct data type (rhs is pre-quantized, thus casted to FP8) - lhs = nl.static_cast(lhs, dtype) - rhs_scale = nl.static_cast(rhs_scale, dtype) - rhs_quantized_reshaped = nl.static_cast(rhs_quantized_reshaped, nl.float8_e4m3) - - # Latency checks - bench_func(lhs, rhs_quantized_reshaped, rhs_scale, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K) - latency_res = bench_func.benchmark_result.nc_latency - p99_latency = latency_res.get_latency_percentile(99) - - @pytest.mark.simulation - @pytest.mark.parametrize("M, K, N, dtype, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K", [ - [512, 16 * 1024, 1024, nl.bfloat16, 2, 2, 16], - [512, 16 * 1024, 1024, nl.bfloat16, 4, 1, 32], - [512, 16 * 1024, 1024, nl.bfloat16, 4, 2, 128], - ]) - def test_double_row_matmul_numerical(self, simulation_only, M, K, N, dtype, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K): - if (get_target_string() != "trn2"): - return - # Initializing random inputs - lhs = np.random.rand(M, K) - rhs = np.random.rand(K, N) - - # Correct CPU results - result_golden = 
np.matmul(lhs, rhs) - - # Quantizing rhs - rhs_scale, rhs_quantized = column_wise_quantize(rhs) - rhs_quantized_reshaped = reshape(rhs_quantized) - - # Casting to the correct data type (rhs is pre-quantized, thus casted to FP8) - lhs = nl.static_cast(lhs, dtype) - rhs_scale = nl.static_cast(rhs_scale, dtype) - rhs_quantized_reshaped = nl.static_cast(rhs_quantized_reshaped, nl.float8_e4m3) - - # Numerical accuracy checks - numeric_func = baremetal(quantized_double_row_matmul) - - if simulation_only: - result_nki = simulate_kernel(numeric_func, lhs, rhs_quantized_reshaped, rhs_scale, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K) - else: - result_nki = numeric_func(lhs, rhs_quantized_reshaped, rhs_scale, TILES_IN_BLOCK_M, TILES_IN_BLOCK_N, TILES_IN_BLOCK_K) - - # Casting result_nki from dtype BF16 back to FP32 to compare the NumPy and NKI results - result_nki = result_nki.astype(np.float32) - - assert np.allclose(result_golden, result_nki, rtol=2e-2) diff --git a/test/unit/test_flash_attn_bwd.py b/test/unit/test_flash_attn_bwd.py deleted file mode 100644 index d1f0dfb..0000000 --- a/test/unit/test_flash_attn_bwd.py +++ /dev/null @@ -1,206 +0,0 @@ -""" -Copyright (c) 2023, Amazon.com. 
All Rights Reserved -""" -import pytest -from nki_samples.reference.attention import flash_attn_bwd -from neuronxcc.nki import benchmark, baremetal, simulate_kernel -import neuronxcc.nki.language as nl -import numpy as np - -xfail = pytest.mark.arch_specific_xfail - - -bench_func = benchmark(warmup=5, iters=10)(flash_attn_bwd) - -def softmax(x: np.ndarray, dim: int, zero_max_mode=False, - mixed_precision=False, return_max_reduce=False): - max_value = np.amax(x, axis=dim, keepdims=True) - max_value = np.maximum(0, max_value) if zero_max_mode else max_value - exp = np.exp(x - max_value) - if mixed_precision: - reduce = np.add.reduce(exp.astype(np.float32), axis=dim, keepdims=True).astype(x.dtype) - else: - reduce = np.add.reduce(exp, axis=dim, keepdims=True) - if return_max_reduce: - return exp / reduce, -max_value, np.reciprocal(reduce) - return exp / reduce - -def softmax_dx(dy: np.ndarray, y: np.ndarray, dim: int, mixed_precision=False): - # dx_i = (dy_i - sum(dy_k*y_k)) * y_i - prod = dy * y - if mixed_precision: - reduce = np.add.reduce(prod.astype(np.float32), axis=dim, keepdims=True).astype(dy.dtype) - else: - reduce = np.add.reduce(prod, axis=dim, keepdims=True) - subtract = dy - reduce - return subtract * y - -def cpu_attention_backward(q, k, v, dy, use_causal_mask=True, mixed_precision=True, sliding_window=-1, sinks=None): - """ - Compute the attention backward with the softmax recomputation - """ - def mixed_precision_matmul(a, b): - input_dtype = a.dtype - a, b = a.astype(np.float32), b.astype(np.float32) - c = np.matmul(a, b) - return c.astype(input_dtype) - - bs, nheads, d, seqlen = q.shape - _, nheads_kv, _, _ = k.shape - nheads_per_kv_head = nheads // nheads_kv - - # Expand key and value to match query heads - k_expanded = np.repeat(k, nheads_per_kv_head, axis=1) - v_expanded = np.repeat(v, nheads_per_kv_head, axis=1) - - # Compute golden output - softmax_scale = 1.0 / (d ** 0.5) - q_scaled = q * softmax_scale - raw_score = 
mixed_precision_matmul(q_scaled.transpose(0, 1, 3, 2), k_expanded) - - if use_causal_mask: - # raw_score has K seq in the most inner dim - # we want to mask all elements where Q idx is smaller than K idx with -inf - # this maps to the upper triangle of the final two axes - for i in range(raw_score.shape[0]): - for j in range(raw_score.shape[1]): - # -inf triggers invalid input error in softmax implementation, use a small negative instead - # k=1 to exclude the diagonal, because each token can still attend to itself - raw_score[i, j][np.triu_indices_from(raw_score[i, j], k=1)] = -9984.0 - - if sliding_window > 0: - q_pos = np.arange(raw_score.shape[2])[:, None] - k_pos = np.arange(raw_score.shape[3])[None, :] - sliding_window_mask = k_pos < (q_pos - sliding_window + 1) - raw_score[i, j][sliding_window_mask] = -9984.0 - - # Add sink tokens to the raw_score along the innermost dimension - if sinks is not None: - bs, nheads, seq_q, seq_k = raw_score.shape - sinks_expanded = np.broadcast_to(sinks.reshape(bs, nheads, 1, 1), (bs, nheads, seq_q, 1)) - combined_logits = np.concatenate([raw_score, sinks_expanded], axis=-1) - else: - combined_logits = raw_score - - norm_score_combined, cached_negative_max, cached_sum_reciprocal = \ - softmax(combined_logits, dim=-1, mixed_precision=mixed_precision, return_max_reduce=True) - - # Drop sink probabilities if sinks were added, keep only attention scores - if sinks is not None: - norm_score = norm_score_combined[..., :-1] # Remove last dimension (sink) - else: - norm_score = norm_score_combined - - # Calculate softmax_dy = (dL/dy)^T @ V - softmax_dy = mixed_precision_matmul(dy.transpose(0, 1, 3, 2), v_expanded) - - if sinks is not None: - # Combine gradients: softmax_dy for attention, zeros for sinks - combined_dy = np.concatenate([softmax_dy, np.zeros_like(norm_score_combined[..., -1:])], axis=-1) - combined_dx = softmax_dx(combined_dy, norm_score_combined, dim=-1, mixed_precision=mixed_precision) - softmax_dx_golden = 
combined_dx[..., :-1] # Extract attention gradients - dsinks_golden = np.sum(combined_dx[..., -1:], axis=(2, 3)) # Extract and sum sink gradients - else: - softmax_dx_golden = softmax_dx(softmax_dy, norm_score, dim=-1, mixed_precision=mixed_precision) - dsinks_golden = None - - # Calculate dv = (dL/dy) @ softmax_y - dv_expanded = mixed_precision_matmul(dy, norm_score) - dv_reshaped = dv_expanded.reshape(bs, nheads_kv, nheads_per_kv_head, d, seqlen) - dv_golden = np.sum(dv_reshaped, axis=2) - - # Calculate dq - dq_golden = mixed_precision_matmul(k_expanded, softmax_dx_golden.transpose(0, 1, 3, 2)) * softmax_scale - - # Calculate dk - dk_expanded = mixed_precision_matmul(q_scaled, softmax_dx_golden) - dk_reshaped = dk_expanded.reshape(bs, nheads_kv, nheads_per_kv_head, d, seqlen) - dk_golden = np.sum(dk_reshaped, axis=2) - - # Calculate output projection - o_proj = np.matmul(norm_score, v_expanded.transpose(0, 1, 3, 2)).transpose(0, 1, 3, 2) - - # Calculate - return dq_golden, dk_golden, dv_golden, cached_negative_max, cached_sum_reciprocal, o_proj, dsinks_golden - -class TestAttention: - - @xfail # P167481231 - @pytest.mark.parametrize("bs, nheads, seqlen, d, dtype, latency", [ - [1, 4, 32*1024, 128, nl.bfloat16, 117000], - ]) - @pytest.mark.parametrize("sinks", [False, True]) - def test_flash_attn_bwd_perf(self, bs, nheads, seqlen, d, dtype, latency, sinks): - q = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 - k = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 - v = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 - dy = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 - o_proj = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 - lse = np.random.random_sample([bs, nheads, nl.tile_size.pmax, seqlen // nl.tile_size.pmax]).astype(np.float32) - seed = None - sinks_tensor = np.random.randn(bs, nheads).astype(dtype) if sinks else None - - q = nl.static_cast(q, dtype) - k = nl.static_cast(k, dtype) - v = 
nl.static_cast(v, dtype) - o_proj = nl.static_cast(o_proj, dtype) - dy = nl.static_cast(dy, dtype) - - bench_func_ = bench_func[bs, nheads] - bench_func_(q, k, v, o_proj, dy, lse, seed, - use_causal_mask=True, mixed_precision=True, sinks=sinks_tensor) - latency_res = bench_func_.benchmark_result.nc_latency - p99 = latency_res.get_latency_percentile(50) - - @pytest.mark.simulation - @pytest.mark.parametrize("bs, nheads, nheads_kv, seqlen, d, dtype", [ - [1, 4, 4, 4096, 128, np.float32], - [1, 4, 1, 4096, 128, np.float32], - [1, 8, 2, 4096, 128, np.float32], - [1, 8, 2, 8192, 128, np.float32], - ]) - @pytest.mark.parametrize("sliding_window", [-1, 128]) - @pytest.mark.parametrize("sinks", [False, True]) - def test_flash_attn_bwd_numerical(self, simulation_only, bs, nheads, nheads_kv, seqlen, d, dtype, sliding_window, sinks): - q = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 - k = (np.random.random_sample([bs, nheads_kv, d, seqlen]) - 0.5) * 2 - v = (np.random.random_sample([bs, nheads_kv, d, seqlen]) - 0.5) * 2 - dy = (np.random.random_sample([bs, nheads, d, seqlen]) - 0.5) * 2 - q = nl.static_cast(q, dtype) - k = nl.static_cast(k, dtype) - v = nl.static_cast(v, dtype) - dy = nl.static_cast(dy, dtype) - seed = None - sinks_tensor = np.random.randn(bs, nheads).astype(dtype) if sinks else None - - dq_golden, dk_golden, dv_golden, cached_negative_max, cached_sum_reciprocal, o_proj, dsinks_golden = \ - cpu_attention_backward(q, k, v, dy, use_causal_mask=True, sliding_window=sliding_window, sinks=sinks_tensor) - cached_negative_max = cached_negative_max.reshape(bs, nheads, seqlen // nl.tile_size.pmax, - nl.tile_size.pmax).transpose(0, 1, 3, 2) - cached_sum_reciprocal = cached_sum_reciprocal.reshape(bs, nheads, seqlen // nl.tile_size.pmax, - nl.tile_size.pmax).transpose(0, 1, 3, 2) - lse = -1.0 * (cached_negative_max + np.log(cached_sum_reciprocal)) - - numeric_func = baremetal(flash_attn_bwd) - if simulation_only: - kernel_outputs = 
simulate_kernel(numeric_func[bs, nheads_kv], q, k, v, o_proj, dy, lse, seed, - use_causal_mask=True, - mixed_precision=True, - sliding_window=sliding_window, sinks=sinks_tensor) - else: - kernel_outputs = numeric_func[bs, nheads_kv](q, k, v, o_proj, dy, lse, seed, - use_causal_mask=True, - mixed_precision=True, - sliding_window=sliding_window, sinks=sinks_tensor) - if sinks_tensor is not None: - if len(kernel_outputs) == 4: - out_dq, out_dk, out_dv, dsinks = kernel_outputs - assert np.allclose(dsinks, dsinks_golden, atol=1e-2) - else: - out_dq, out_dk, out_dv = kernel_outputs - else: - out_dq, out_dk, out_dv = kernel_outputs - - assert np.allclose(out_dq, dq_golden, atol=1e-2) - assert np.allclose(out_dk, dk_golden, atol=1e-2) - assert np.allclose(out_dv, dv_golden, atol=1e-2) diff --git a/test/unit/test_flash_attn_fwd.py b/test/unit/test_flash_attn_fwd.py deleted file mode 100644 index 33e02b0..0000000 --- a/test/unit/test_flash_attn_fwd.py +++ /dev/null @@ -1,166 +0,0 @@ -""" -Copyright (c) 2023, Amazon.com. 
All Rights Reserved -""" -import pytest -from nki_samples.reference.attention import flash_fwd, FlashConfig -from neuronxcc.nki import benchmark, baremetal, simulate_kernel -import neuronxcc.nki.language as nl -import numpy as np - -bench_func = benchmark(warmup=5, iters=10)(flash_fwd) - -def softmax(x: np.ndarray, dim: int, zero_max_mode=False, - mixed_precision=False, return_max_reduce=False): - max_value = np.amax(x, axis=dim, keepdims=True) - max_value = np.maximum(0, max_value) if zero_max_mode else max_value - exp = np.exp(x - max_value) - if mixed_precision: - reduce = np.add.reduce(exp.astype(np.float32), axis=dim, keepdims=True).astype(x.dtype) - else: - reduce = np.add.reduce(exp, axis=dim, keepdims=True) - if return_max_reduce: - return exp / reduce, -max_value, np.reciprocal(reduce) - return exp / reduce - - -def cpu_attention_forward(q, k, v, use_causal_mask=True, sliding_window=-1, mixed_precision=True): - def mixed_precision_matmul(a, b): - input_dtype = a.dtype - a, b = a.astype(np.float32), b.astype(np.float32) - c = np.matmul(a, b) - return c.astype(input_dtype) - - _, _, d, _ = q.shape - - # Compute golden output - softmax_scale = 1.0 / (d ** 0.5) - q_scaled = q * softmax_scale - nheads = q.shape[1] - kv_heads = k.shape[1] - if nheads > kv_heads: - k = np.repeat(k, nheads//kv_heads, axis=1) - v = np.repeat(v, nheads//kv_heads, axis=1) - raw_score = mixed_precision_matmul(q_scaled.transpose(0, 1, 3, 2), k) - - if use_causal_mask: - _, _, Q, K = raw_score.shape - - q_idx = np.arange(Q)[:, None] - k_idx = np.arange(K)[None, :] - - if sliding_window > 0: - mask = (k_idx > q_idx) | (k_idx < q_idx - sliding_window + 1) - else: # causal - mask = k_idx > q_idx - - # Broadcast mask to shape (1, 1, Q, K) - raw_score = raw_score.copy() - # -inf triggers invalid input error in softmax implementation, use a small negative instead - raw_score[:, :, mask] = -9984.0 - - - norm_score, cached_negative_max, cached_sum_reciprocal = \ - softmax(raw_score, dim=-1, 
mixed_precision=mixed_precision, return_max_reduce=True) - - # Transpose the result so it has the same layout as ours - out_golden = mixed_precision_matmul(norm_score, v.transpose(0, 1, 3, 2)).transpose(0, 1, 3, 2) - - return out_golden, cached_negative_max, cached_sum_reciprocal - -class TestAttention: - - @pytest.mark.parametrize("bs, nheads, seqlen_q, seqlen_k, d, dtype, use_causal_mask, \ - sliding_window, mixed_precision, training, tile_size, kv_heads, should_transpose_v, latency", [ - [1, 6, 32*1024, 32*1024, 96, nl.bfloat16, True, -1, True, True, 2048, 3, False, 87000000000], - [1, 1, 32*1024, 32*1024, 96, nl.bfloat16, True, -1, True, False, 2048, None, False, 15100000000], - # Non-square - [1, 3, 32*1024, 16*1024, 96, nl.bfloat16, True, -1, True, False, 2048, None, False, 7550000000], - [1, 3, 16*1024, 32*1024, 96, nl.bfloat16, True, -1, True, False, 2048, None, False, 7550000000], - # Causal vs. Sliding - test sliding window is faster - [1, 1, 16*1024, 16*1024, 96, nl.bfloat16, True, -1, True, False, 2048, None, False, 4000000000], - [1, 1, 16*1024, 16*1024, 96, nl.bfloat16, True, 4096, True, False, 2048, None, False, 3000000000], - ]) - def test_flash_attn_fwd_perf(self, bs, nheads, seqlen_q, seqlen_k, d, dtype, use_causal_mask, - sliding_window, mixed_precision, training, tile_size, kv_heads, should_transpose_v,latency): - q = (np.random.random_sample([bs, nheads, d, seqlen_q]) - 0.5) * 2 - k = (np.random.random_sample([bs, nheads, d, seqlen_k]) - 0.5) * 2 - if should_transpose_v: - v = (np.random.random_sample([bs, nheads, d, seqlen_k]) - 0.5) * 2 - else: - v = (np.random.random_sample([bs, nheads, seqlen_k, d]) - 0.5) * 2 - o_proj = np.zeros(shape=[bs, nheads, seqlen_q, d], dtype=dtype) - out_lse = np.zeros(shape=[bs, nheads, int(nl.tile_size.pmax), seqlen_q // nl.tile_size.pmax], - dtype=nl.float32 if mixed_precision else dtype) if training else None - seed = None - - q = nl.static_cast(q, dtype) - k = nl.static_cast(k, dtype) - v = nl.static_cast(v, 
dtype) - config = FlashConfig(**{'seq_tile_size':tile_size, 'training':training, 'should_transpose_v':should_transpose_v}) - - heads = nheads if kv_heads is None else kv_heads - - bench_func_ = bench_func[bs, heads] - bench_func_(q, k, v, seed, use_causal_mask=use_causal_mask, sliding_window=sliding_window, - mixed_precision=mixed_precision, config=config) - latency_res = bench_func_.benchmark_result.nc_latency - p50 = latency_res.get_latency_percentile(50) - - @pytest.mark.simulation - @pytest.mark.parametrize("bs, nheads, seqlen_q, seqlen_k, d, dtype, use_causal_mask, \ - sliding_window, training, tile_size, kv_heads, should_transpose_v", [ - [1, 6, 4096, 4096, 128, np.float32, True, -1, True, 2048, 3, False], - [1, 1, 4096, 4096, 128, np.float32, True, -1, False, 2048, None, False], - [1, 1, 8192, 4096, 128, np.float32, True, -1, False, 2048, None, False], - [1, 1, 4096, 8192, 128, np.float32, True, -1, False, 2048, None, False], - [1, 1, 4096, 4096, 128, np.float32, True, 1024, False, 2048, None, False], - ]) - def test_flash_attn_fwd_numerical(self, simulation_only, bs, nheads, seqlen_q, seqlen_k, d, dtype, use_causal_mask, - sliding_window, training, tile_size, kv_heads, should_transpose_v): - q = (np.random.random_sample([bs, nheads, d, seqlen_q]) - 0.5) * 2 - k = (np.random.random_sample([bs, kv_heads or nheads, d, seqlen_k]) - 0.5) * 2 - if should_transpose_v: - v = (np.random.random_sample([bs, nheads, d, seqlen_k]) - 0.5) * 2 - cpu_permute = (0, 1, 2, 3) - else: - v = (np.random.random_sample([bs, kv_heads or nheads, seqlen_k, d]) - 0.5) * 2 - cpu_permute = (0, 1, 3, 2) - - q = nl.static_cast(q, dtype) - k = nl.static_cast(k, dtype) - v = nl.static_cast(v, dtype) - seed = None - - o_proj_golden, cached_negative_max, cached_sum_reciprocal = \ - cpu_attention_forward(q, k, v.transpose(cpu_permute), use_causal_mask=use_causal_mask, sliding_window=sliding_window, mixed_precision=True) - o_proj_golden = o_proj_golden.transpose(0,1,3,2) # (b,h, d, seq) - 
cached_negative_max = cached_negative_max.reshape(bs, nheads, seqlen_q // nl.tile_size.pmax, - nl.tile_size.pmax).transpose(0, 1, 3, 2) - cached_sum_reciprocal = cached_sum_reciprocal.reshape(bs, nheads, seqlen_q // nl.tile_size.pmax, - nl.tile_size.pmax).transpose(0, 1, 3, 2) - lse_golden = -1.0 * (cached_negative_max + np.log(cached_sum_reciprocal)) if training else None - config = FlashConfig(**{'seq_tile_size':tile_size, 'training':training, 'should_transpose_v':should_transpose_v}) - - heads = nheads if kv_heads is None else kv_heads - - numeric_func = baremetal(flash_fwd) - if simulation_only: - results = simulate_kernel(numeric_func[bs, heads], q, k, v, seed, - use_causal_mask=use_causal_mask, - sliding_window=sliding_window, - mixed_precision=True, - config=config) - else: - results = numeric_func[bs, heads](q, k, v, seed, - use_causal_mask=use_causal_mask, - sliding_window=sliding_window, - mixed_precision=True, - config=config) - - if training: - o_proj, out_lse = results - assert np.allclose(o_proj, o_proj_golden, atol=1e-2) - assert np.allclose(out_lse, lse_golden, atol=1e-2) - else: - o_proj = results - assert np.allclose(o_proj, o_proj_golden, atol=1e-2) diff --git a/test/unit/test_neuron_profile.py b/test/unit/test_neuron_profile.py deleted file mode 100644 index e607705..0000000 --- a/test/unit/test_neuron_profile.py +++ /dev/null @@ -1,86 +0,0 @@ -from neuronxcc.nki import benchmark -from neuronxcc.nki import profile -import neuronxcc.nki.language as nl -import numpy as np -import pytest -import os -import shutil -import tempfile - - -WORKING_DIRECTORY = tempfile.mkdtemp() -SAVE_NEFF_NAME = "cus_file123.neff" -SAVE_TRACE_NAME = "profile-custom.ntff" -NUM_EXECS = 20 -PROFILE_NTH = 10 -JSON_REPORTS = "json_reports" - -@profile(working_directory=WORKING_DIRECTORY, save_neff_name=SAVE_NEFF_NAME, overwrite=False , save_trace_name=SAVE_TRACE_NAME, num_execs=NUM_EXECS, profile_nth=PROFILE_NTH) -def nki_tensor_tensor_add(a_tensor, b_tensor): - c_output = 
from neuronxcc.nki import benchmark
from neuronxcc.nki import profile
import neuronxcc.nki.language as nl
import numpy as np
import pytest
import os
import shutil
import tempfile


# Module-level constants drive both the @profile decorator and the
# artifact-path assertions below; keep them in sync.
WORKING_DIRECTORY = tempfile.mkdtemp()
SAVE_NEFF_NAME = "cus_file123.neff"
SAVE_TRACE_NAME = "profile-custom.ntff"
NUM_EXECS = 20
PROFILE_NTH = 10
JSON_REPORTS = "json_reports"


@profile(working_directory=WORKING_DIRECTORY, save_neff_name=SAVE_NEFF_NAME, overwrite=False,
         save_trace_name=SAVE_TRACE_NAME, num_execs=NUM_EXECS, profile_nth=PROFILE_NTH)
def nki_tensor_tensor_add(a_tensor, b_tensor):
    """Element-wise add kernel used only to exercise the profiler."""
    c_output = nl.ndarray(a_tensor.shape, dtype=a_tensor.dtype, buffer=nl.shared_hbm)

    a = nl.load(a_tensor)
    b = nl.load(b_tensor)

    c_tile = a + b

    nl.store(c_output, value=c_tile)

    return c_output


class TestNeuronProfile:
    def _get_ntff_path(self, trace_val):
        """
        Prepares ntff file name based on execution trace number
        """
        # The first trace has no suffix; later ones carry _exec_<n>.
        if trace_val == 1:
            return os.path.join(WORKING_DIRECTORY, f"{os.path.splitext(os.path.basename(SAVE_TRACE_NAME))[0]}.ntff")
        else:
            return os.path.join(WORKING_DIRECTORY, f"{os.path.splitext(os.path.basename(SAVE_TRACE_NAME))[0]}_exec_{trace_val}.ntff")

    @pytest.fixture
    def traces(self):
        """Expected trace file paths: one per PROFILE_NTH-th execution."""
        ret = []
        if NUM_EXECS < PROFILE_NTH:
            ret.append(self._get_ntff_path(PROFILE_NTH))
        else:
            curr = PROFILE_NTH
            while curr <= NUM_EXECS:
                ret.append(self._get_ntff_path(curr))
                curr += PROFILE_NTH
        return ret

    @pytest.fixture
    def num_reports(self):
        """Expected number of JSON reports (one per profiled execution)."""
        if NUM_EXECS < PROFILE_NTH:
            return 1
        else:
            return NUM_EXECS // PROFILE_NTH

    def test_output_artifacts_created(self, traces, num_reports):
        """Run the profiled kernel and assert neff/trace/report artifacts exist."""
        # delete artifact directory, only testing non-overwrite functionality
        if os.path.exists(WORKING_DIRECTORY):
            shutil.rmtree(WORKING_DIRECTORY)

        # creates dummy input to invoke profile kernel
        a = np.zeros([128, 1024]).astype(np.float16)
        b = np.random.random_sample([128, 1024]).astype(np.float16)

        # Fix: the return value was bound to an unused variable; only the
        # side effects (profiler artifacts) matter here.
        nki_tensor_tensor_add(a, b)

        # now asserting artifacts are correctly created
        assert os.path.exists(os.path.join(WORKING_DIRECTORY, SAVE_NEFF_NAME))  # neff

        for trace in traces:
            assert os.path.exists(trace)  # trace

        # json reports
        report_dir = os.path.join(WORKING_DIRECTORY, JSON_REPORTS)

        assert os.path.exists(report_dir)  # actually exists
        assert len(os.listdir(report_dir)) == num_reports  # report all iterations queried

        # post condition cleanup
        if os.path.exists(WORKING_DIRECTORY):
            shutil.rmtree(WORKING_DIRECTORY)
"""
Copyright (c) 2023, Amazon.com. All Rights Reserved
"""
import pytest

from nki_samples.reference.vision import resize_nearest_fixed_dma_kernel
from neuronxcc.nki import benchmark, baremetal, simulate_kernel
import neuronxcc.nki.language as nl
import numpy as np

bench_func = benchmark(warmup=5, iters=10)(resize_nearest_fixed_dma_kernel)


def cpu_golden_result(data_tensor, output_shape):
    """NumPy golden nearest-neighbor resize over NHWC tensors.

    NOTE(review): the copy loops iterate batches over in_b and channels over
    out_c, so this assumes in_b == out_b and in_c == out_c — all current
    parametrizations satisfy that; confirm before widening them.
    """
    in_b, in_h, in_w, in_c = data_tensor.shape
    out_b, out_h, out_w, out_c = output_shape

    # Generate nearest map: for each output pixel, the flattened index of
    # the nearest source pixel in the (in_h * in_w) plane.
    h_scale, w_scale = 1.0 * in_h / out_h, 1.0 * in_w / out_w
    h_map = np.floor(np.fromfunction(lambda i, _: i * h_scale, (out_h, out_w), dtype=np.float32))
    w_map = np.floor(np.fromfunction(lambda _, j: j * w_scale, (out_h, out_w), dtype=np.float32))
    # Fix: renamed from "map", which shadowed the builtin.
    src_index = (h_map * in_w + w_map).astype(np.int32).flatten()

    in_seqlen, out_seqlen = in_h * in_w, out_h * out_w

    data_tensor = data_tensor.reshape((in_b, in_seqlen, in_c))
    out_tensor = np.zeros((out_b, out_seqlen, out_c))

    for batch in range(in_b):
        for i in range(len(src_index)):
            for chan in range(out_c):
                out_tensor[batch, i, chan] = data_tensor[batch, src_index[i], chan]

    return out_tensor.reshape((out_b, out_h, out_w, out_c))


class TestResizeNearest:

    @pytest.mark.parametrize("in_b, in_h, in_w, in_c, out_b, out_h, out_w, out_c, dtype, latency", [
        [10, 30, 20, 1280, 10, 59, 38, 1280, np.float32, 1740],
        [1, 30, 20, 1280, 1, 59, 38, 1280, nl.float16, 659],
        [1, 30, 20, 1280, 1, 59, 38, 1280, nl.bfloat16, 659],
    ])
    def test_resize_nearest_for_perf(self, in_b, in_h, in_w, in_c, out_b, out_h, out_w, out_c, dtype, latency):
        """Benchmark the resize kernel; output is not validated here."""
        input_tensor = np.random.random_sample((in_b, in_h, in_w, in_c)).astype(np.float32)

        input_dev = nl.static_cast(input_tensor, dtype)

        bench_func_ = bench_func[in_b]
        bench_func_(input_dev, (out_b, out_h, out_w, out_c))
        latency_res = bench_func_.benchmark_result.nc_latency
        # Fix: this fetches the 50th percentile, so name it p50 (was "p99").
        p50 = latency_res.get_latency_percentile(50)

    @pytest.mark.simulation
    @pytest.mark.parametrize("in_b, in_h, in_w, in_c, out_b, out_h, out_w, out_c, dtype", [
        [10, 30, 20, 1280, 10, 59, 38, 1280, np.float32],
        [1, 30, 20, 1280, 1, 59, 38, 1280, nl.float16],
        [1, 30, 20, 1280, 1, 59, 38, 1280, nl.bfloat16],
    ])
    def test_resize_nearest_for_numberic(self, simulation_only, in_b, in_h, in_w, in_c, out_b, out_h, out_w, out_c, dtype):
        """Compare kernel output against the NumPy golden resize.

        NOTE(review): "numberic" is a typo for "numeric"; kept to avoid
        breaking any CI selection by test name.
        """
        input_tensor = np.random.random_sample((in_b, in_h, in_w, in_c)).astype(np.float32)

        input_dev = nl.static_cast(input_tensor, dtype)

        numeric_func = baremetal(resize_nearest_fixed_dma_kernel)
        if simulation_only:
            output_tensor = simulate_kernel(numeric_func[in_b], input_dev, (out_b, out_h, out_w, out_c))
        else:
            output_tensor = numeric_func[in_b](input_dev, (out_b, out_h, out_w, out_c))
        output_tensor = nl.static_cast(output_tensor, np.float32)
        golden_result = cpu_golden_result(input_tensor, output_tensor.shape)
        assert np.allclose(output_tensor, golden_result, atol=1e-2)
All Rights Reserved -""" -import pytest -from nki_samples.reference.allocated_fused_linear import allocated_fused_rms_norm_qkv -from neuronxcc.nki import benchmark, baremetal, simulate_kernel -import neuronxcc.nki.language as nl -import numpy as np - -bench_func = benchmark(warmup=5, iters=10)(allocated_fused_rms_norm_qkv) - -np.random.seed(0) - - -def rms_norm(hidden, gamma, eps=1e-6): - rms = np.sqrt(np.mean(np.square(hidden), axis=-1, keepdims=True)) - norm = hidden * np.reciprocal(rms + eps) - if gamma is not None: - norm *= gamma - return norm - -def cpu_golden_result(hidden, gamma, qkv_weights, dtype, do_norm=True): - if do_norm: - hidden = rms_norm(hidden, gamma) - qkv_out = (hidden @ qkv_weights).astype(dtype) - return qkv_out - -class TestRMSNormQKV: - @pytest.mark.parametrize("batch, seqlen, dim, d_head, dtype, latency", [ - [1, 128, 512, 512, np.float16, 25], - [1, 512, 1024, 384, nl.bfloat16, 40], - [1, 128, 1024, 512, nl.bfloat16, 28], - # [1, 1024, 8192, 512, nl.bfloat16, 301 * 1.02], # FIXME: performance is flaky - ]) - def test_allocated_rmsnorm_qkv_perf(self, batch, seqlen, dim, d_head, dtype, latency): - hidden = np.random.random_sample((batch, seqlen, dim)).astype(np.float32) - weights = np.random.random_sample((dim, d_head)).astype(np.float32) - - hidden = nl.static_cast(hidden, dtype) - weights = nl.static_cast(weights, dtype) - - bench_func(hidden, weights) - latency_res = bench_func.benchmark_result.nc_latency - p99 = latency_res.get_latency_percentile(50) - - @pytest.mark.simulation - @pytest.mark.parametrize("batch, seqlen, dim, d_head, dtype", [ - [1, 128, 512, 512, np.float16], - [1, 512, 1024, 384, nl.bfloat16], - [1, 128, 1024, 512, nl.bfloat16], - [1, 1024, 8192, 512, nl.bfloat16] - ]) - def test_allocated_rmsnorm_qkv_numeric(self, simulation_only, batch, seqlen, dim, d_head, dtype): - hidden = np.random.random_sample((batch, seqlen, dim)) - weights = np.random.random_sample((dim, d_head)) - - hidden_dev = nl.static_cast(hidden, dtype) 
- weights_dev = nl.static_cast(weights, dtype) - - numeric_func = baremetal(allocated_fused_rms_norm_qkv) - if simulation_only: - out = simulate_kernel(numeric_func, hidden_dev, weights_dev) - else: - out = numeric_func(hidden_dev, weights_dev) - out = nl.static_cast(out, np.float32) - golden_res = nl.static_cast(cpu_golden_result(hidden, None, weights, dtype, do_norm=True), np.float32) - assert np.allclose(out, golden_res, atol=1e-2, rtol=1e-2) -